In [1]:
import os
from itertools import islice

import pandas as pd

# Directory that contains the input CSV files.
_DATA_DIR = '/Users/kishanterdal/Downloads/period_03'

# Full paths of the CSV files to analyze, in the order they are processed.
DATA_PATHS = [
    f'{_DATA_DIR}/{csv_name}'
    for csv_name in (
        '2024_fb_ads_president_scored_anon.csv',
        '2024_fb_posts_president_scored_anon.csv',
        '2024_tw_posts_president_scored_anon.csv',
    )
]

def load_data(filepath):
    """Read the CSV file at ``filepath`` and return it as a pandas DataFrame.

    All parsing options are left at the ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(filepath)
    return frame

def overall_statistics(df, dataset_name):
    """Print per-column summary statistics for ``df`` to stdout.

    Three sections are printed, each covering every column in column order:

    1. the output of ``Series.describe()``,
    2. the number of distinct values (``Series.nunique()``),
    3. the most frequent value (first entry of ``Series.mode()``).

    Args:
        df: DataFrame to summarize.
        dataset_name: Label used in the section header.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n=== Overall Descriptive Statistics for {dataset_name} ===")
    for col, series in df.items():
        print(f"\nColumn: {col}")
        print(series.describe())

    print("\nUnique Value Counts:")
    for col, series in df.items():
        print(f"{col}: {series.nunique()}")

    print("\nMost Frequent Values:")
    for col, series in df.items():
        # Compute the mode once per column; it is needed both for the
        # emptiness check and for the value that gets printed.
        modes = series.mode()
        # mode() is empty when the column has no non-null values; such
        # columns are skipped rather than reported.
        if not modes.empty:
            print(f"{col}: {modes.iloc[0]}")

def group_statistics(df, group_cols, dataset_name, max_groups=3):
    """Print ``describe(include='all')`` for the first few groups of ``df``.

    Args:
        df: DataFrame to group.
        group_cols: List of column names passed to ``DataFrame.groupby``.
        dataset_name: Label used in the section header.
        max_groups: Maximum number of groups to print, taken in groupby
            iteration order. ``None`` prints every group. Defaults to 3.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n=== Grouped Statistics for {dataset_name} by {group_cols} ===")
    grouped = df.groupby(group_cols)

    # Take groups lazily. Materializing every group with list(grouped) just
    # to keep a handful builds one sub-DataFrame per group, which is very
    # costly when a grouping key is nearly unique.
    for name, group in islice(grouped, max_groups):
        print(f"\nGroup: {name}")
        print(group.describe(include='all'))

def analyze_dataset(filepath):
    """Load one CSV file and print its overall and grouped statistics.

    If ``filepath`` is not an existing file, a notice is printed and nothing
    else happens. Otherwise the file is loaded, overall statistics are
    printed, and grouped statistics are printed for the first grouping whose
    columns are all present, tried in this order: ``page_id`` + ``ad_id``,
    then ``page_id``, then ``state``. If none applies, a notice is printed.
    """
    if not os.path.isfile(filepath):
        print(f"File {filepath} does not exist. Skipping...")
        return

    dataset_name = os.path.basename(filepath)
    df = load_data(filepath)
    n_rows, n_cols = df.shape[0], df.shape[1]
    print(f"\nLoaded {n_rows} rows and {n_cols} columns from {dataset_name}")

    overall_statistics(df, dataset_name)

    available = df.columns
    # Candidate groupings in priority order; the first fully available one wins.
    grouping_preferences = (['page_id', 'ad_id'], ['page_id'], ['state'])
    for candidate in grouping_preferences:
        if all(col in available for col in candidate):
            group_statistics(df, candidate, dataset_name)
            break
    else:
        print(f"\nNo grouping columns ('page_id', 'ad_id', or 'state') found in {dataset_name}. Skipping grouping.")

def main():
    """Analyze every file listed in ``DATA_PATHS``, in list order."""
    for dataset_path in DATA_PATHS:
        analyze_dataset(dataset_path)

# Run the analysis only when this module is executed directly, not when it
# is imported by another module.
if __name__ == "__main__":
    main()



Loaded 246745 rows and 41 columns from 2024_fb_ads_president_scored_anon.csv

=== Overall Descriptive Statistics for 2024_fb_ads_president_scored_anon.csv ===

Column: page_id
count                                                246745
unique                                                 4475
top       4d66f5853f0365dba032a87704a634f023d15babde973b...
freq                                                  55503
Name: page_id, dtype: object

Column: ad_id
count                                                246745
unique                                               246745
top       0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9...
freq                                                      1
Name: ad_id, dtype: object

Column: ad_creation_time
count         246745
unique           547
top       2024-10-27
freq            8619
Name: ad_creation_time, dtype: object

Column: bylines
count                   245736
unique                    3790
top       HARRIS FOR PRESIDENT
freq          