# EDA

In [8]:
import pandas as pd
from pathlib import Path
from text_processing import (
    clean_dataframe_column,
    get_word_counts,
    get_bigram_counts,
    get_post_statistics
)


## Process datasets

In [32]:
dataset_folder = Path("../datasets")
datasets = {}

In [33]:
#Dataset 1
df= df = pd.read_pickle(dataset_folder / "20251004_065231_reddit_posts.pkl")
df= clean_dataframe_column(df, column='comments_flat')
df= get_post_statistics(df)
datasets['dataset1'] = df
print(f"dataset1: {len(df)} posts")


dataset1: 78 posts


In [34]:
#Dataset 2
df= df = pd.read_pickle(dataset_folder / "20251004_063950_reddit_posts.pkl")
df= clean_dataframe_column(df, column='comments_flat')
df= get_post_statistics(df)
datasets['dataset12'] = df
print(f"dataset2: {len(df)} posts")

dataset2: 78 posts


In [35]:
# Dataset 3
df = pd.read_pickle(dataset_folder / "20251001_173930_reddit_posts.pkl")
df = clean_dataframe_column(df, column='flat_corpus')
df = get_post_statistics(df)
datasets['dataset3'] = df
print(f"dataset3: {len(df)} posts")

dataset3: 78 posts


In [36]:
# Dataset 4
df = pd.read_pickle(dataset_folder / "20251001_011733_reddit_posts.pkl")
flat_comments_list = []

#Flatten the nested lists in the 'corpus' column
for corpus in df['corpus']:
    flat_row = []

    for item in corpus:
        if isinstance(item, list):
            if len(item) > 0:
                flat_row.append(item[0])
        else:
            flat_row.append(item)
    
    flat_comments_list.append(flat_row)

#Add the flattened comments as a new column
df['flat_corpus'] = flat_comments_list

df = clean_dataframe_column(df, column='flat_corpus')
df = get_post_statistics(df)
datasets['dataset4'] = df
print(f" dataset4: {len(df)} posts")

 dataset4: 91 posts


## Compare statistics across datasets

In [37]:
for name, df in datasets.items():
    total_comments = df['cleaned_comments'].apply(len).sum()
    print(f"{name}:")
    print(f"  Total posts: {len(df)}")
    print(f"  Total comments: {total_comments}")
    print(f"  Average comment length: {df['avg_comment_length'].mean():.1f} words")
    print(f"  Total tokens: {df['total_tokens'].sum()}")
    print()

dataset1:
  Total posts: 78
  Total comments: 3263
  Average comment length: 27.9 words
  Total tokens: 89127

dataset12:
  Total posts: 78
  Total comments: 3263
  Average comment length: 27.9 words
  Total tokens: 89127

dataset3:
  Total posts: 78
  Total comments: 3243
  Average comment length: 27.9 words
  Total tokens: 89139

dataset4:
  Total posts: 91
  Total comments: 457
  Average comment length: 35.8 words
  Total tokens: 17498



## Top 5 words by dataset 

In [31]:
for name, df in datasets.items():
    all_comments = []
    for comments_list in df['cleaned_comments']:
        all_comments.extend(comments_list)
    
    words = get_word_counts(all_comments, top_n=5)
    print(f"{name}:")
    print(words)
    print()

dataset1:
     word  count
0  people    744
1    like    679
2  school    593
3     get    501
4   would    443

dataset12:
     word  count
0  people    744
1    like    679
2  school    593
3     get    501
4   would    443

dataset3:
     word  count
0  people    755
1    like    683
2  school    601
3     get    503
4   would    448

dataset4:
     word  count
0  school    218
1    palo    209
2    alto    204
3    kids    142
4  people    109

