# EDA


In [1]:
import pandas as pd
from datetime import datetime
import re
from pathlib import Path
from text_processing import (
    clean_dataframe_column,
    get_word_counts,
    get_bigram_counts,
    get_post_statistics,
    save_pickle_file,
)

[nltk_data] Downloading package stopwords to /home/junc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Process datasets


In [2]:
# Folder (relative to the notebook's working directory) holding the pickled datasets.
dataset_folder = Path("..") / "datasets"
# Processed DataFrames, keyed by a short dataset label.
datasets = dict()

In [3]:
# Dataset 1: read the pickle, clean the "comments_flat" column, then add
# per-post statistics, and register the result under "dataset1".
filename1 = "Palo_Alto_20251007_235943_reddit.pkl"
df = (
    pd.read_pickle(dataset_folder / filename1)
    .pipe(clean_dataframe_column, column="comments_flat")
    .pipe(get_post_statistics)
)
datasets["dataset1"] = df
print(f"dataset1: {len(df)} posts")

dataset1: 79 posts


In [4]:
# Dataset 2: read the pickle, clean the "comments_flat" column, then add
# per-post statistics, and register the result under "dataset2".
filename2 = "Oklahoma_City_20251008_000300_reddit.pkl"
df = (
    pd.read_pickle(dataset_folder / filename2)
    .pipe(clean_dataframe_column, column="comments_flat")
    .pipe(get_post_statistics)
)
datasets["dataset2"] = df
print(f"dataset2: {len(df)} posts")

dataset2: 135 posts


## Compare statistics across datasets


In [5]:
# One summary per dataset, each followed by a blank line.
for name, df in datasets.items():
    # Sum of the lengths of each row's "cleaned_comments" entry.
    total_comments = df["cleaned_comments"].map(len).sum()
    summary_lines = (
        f"{name}:",
        f"  Total posts: {len(df)}",
        f"  Total comments: {total_comments}",
        # Mean of the per-row "avg_comment_length" values (not weighted by comment count).
        f"  Average comment length: {df['avg_comment_length'].mean():.1f} words",
        f"  Total tokens: {df['total_tokens'].sum()}",
    )
    print(*summary_lines, sep="\n", end="\n\n")

dataset1:
  Total posts: 79
  Total comments: 3308
  Average comment length: 28.2 words
  Total tokens: 91043

dataset2:
  Total posts: 135
  Total comments: 10010
  Average comment length: 22.0 words
  Total tokens: 228638



## Top 5 words by dataset


In [6]:
# For each dataset, pool every post's cleaned comments into one flat list
# and show the five entries returned by get_word_counts.
for name, df in datasets.items():
    all_comments = [
        comment
        for post_comments in df["cleaned_comments"]
        for comment in post_comments
    ]

    words = get_word_counts(all_comments, top_n=5)
    print(f"{name}:", words, "", sep="\n")

dataset1:
     word  count
0  people    775
1    like    688
2  school    611
3     get    507
4   would    487

dataset2:
     word  count
0    like   2063
1  people   1680
2     one   1328
3   would   1318
4     get   1161



### Save Datasets with Cleaned Column

In [7]:
def save_dataset(df, dataset_name, dataset_folder):
    """
    Pickle one DataFrame under a filename prefixed with its district.

    The district is the first double-quoted substring found in the first
    value of df['query']. "unknown_district" is used when the column is
    missing or empty, or when that value contains no quoted text. Runs of
    characters other than letters, digits, "_" and "-" are collapsed to "_".

    Args:
        df: DataFrame to save.
        dataset_name: Label used only in the confirmation message.
        dataset_folder: Output directory (created if needed); a falsy value
            means the current directory.

    Returns:
        Path of the written file,
        "<district>_cleaned_<YYYYmmdd_HHMMSS>_reddit.pkl" inside the folder.
    """
    # Start from the fallback and override it only when a quoted name is found.
    district = "unknown_district"
    if "query" in df.columns and not df["query"].empty:
        first_query = str(df["query"].iloc[0])
        quoted = re.search(r'"([^"]+)"', first_query)
        if quoted:
            district = quoted.group(1)

    # Make the district safe to use as part of a filename.
    safe_district = re.sub(r"[^A-Za-z0-9_-]+", "_", district)

    # Make sure the output directory exists before writing.
    out_dir = Path(dataset_folder or ".")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Timestamp (local time, second resolution) keeps successive saves distinct.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = out_dir / f"{safe_district}_cleaned_{stamp}_reddit.pkl"

    df.to_pickle(out_path)
    print(f"✅ Saved {dataset_name} → {out_path.name}")
    return out_path

In [8]:
# Write every processed dataset next to the raw pickles.
for name, dataset in datasets.items():
    save_dataset(df=dataset, dataset_name=name, dataset_folder=dataset_folder)

✅ Saved dataset1 → Palo_Alto_cleaned_20251008_005822_reddit.pkl
✅ Saved dataset2 → Oklahoma_City_cleaned_20251008_005822_reddit.pkl
