In [1]:
import csv
import math
import os
from collections import Counter, defaultdict

# CSV files processed by main(), in this order. The paths are absolute and
# machine-specific; analyze_dataset() skips any path that is not an existing file.
DATA_PATHS = [
    '/Users/kishanterdal/Downloads/period_03/2024_fb_ads_president_scored_anon.csv',
    '/Users/kishanterdal/Downloads/period_03/2024_fb_posts_president_scored_anon.csv',
    '/Users/kishanterdal/Downloads/period_03/2024_tw_posts_president_scored_anon.csv'
]

def load_data(filepath):
    """Read a CSV file into a list of row dictionaries.

    Args:
        filepath: Path of a UTF-8 encoded CSV file whose first line holds
            the column names.

    Returns:
        A list with one dict per data row, keyed by column name. Values are
        the raw strings from the file; csv.DictReader stores None for fields
        missing from a row that is shorter than the header.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it, "\r\n" embedded in quoted fields
    # is rewritten to "\n" before the parser sees it.
    with open(filepath, mode='r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))

def is_numeric(value):
    """Return True if ``float(value)`` succeeds, otherwise False.

    Args:
        value: Usually a CSV cell string. Values that float() cannot take at
            all, such as None (which csv.DictReader stores for fields missing
            from short rows), count as non-numeric instead of raising.

    Returns:
        bool: whether the value can be converted with float().

    Note:
        float() also accepts spellings such as "nan" and "inf", so those
        strings are reported as numeric.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: unparsable text; TypeError: not a string or number.
        return False
    return True

def overall_statistics(data, dataset_name):
    """Print per-column summary statistics for a list of row dictionaries.

    The column names are taken from the first row. For each column the
    non-blank values are counted. If at least one of them parses as a float,
    mean, min, max and population standard deviation (divides by N) are
    printed over the parsable values only. Otherwise the number of distinct
    values and the most frequent value are printed.

    Args:
        data: List of dicts mapping column name to cell value, as produced
            by csv.DictReader.
        dataset_name: Label used in the printed section header.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n=== Overall Statistics for {dataset_name} ===")
    if not data:
        print("No data available.")
        return

    for col in data[0].keys():
        # Blank cells are '' and, for rows shorter than the header,
        # csv.DictReader stores None. Both are skipped so that None is
        # neither counted as a value nor passed on to float().
        col_values = [
            value
            for value in (row.get(col) for row in data)
            if value is not None and value != ''
        ]
        numeric_values = [float(v) for v in col_values if is_numeric(v)]

        print(f"\nColumn: {col}")
        print(f"Count: {len(col_values)}")

        if numeric_values:
            mean = sum(numeric_values) / len(numeric_values)
            min_val = min(numeric_values)
            max_val = max(numeric_values)
            # Population standard deviation: the divisor is N, not N - 1.
            stdev = math.sqrt(sum((x - mean) ** 2 for x in numeric_values) / len(numeric_values))

            print(f"Mean: {mean}")
            print(f"Min: {min_val}")
            print(f"Max: {max_val}")
            print(f"Standard Deviation: {stdev}")
        else:
            value_counts = Counter(col_values)
            # A column with no non-blank values has nothing to rank.
            most_common = value_counts.most_common(1)[0] if value_counts else ('N/A', 0)

            print(f"Unique Values: {len(set(col_values))}")
            print(f"Most Frequent Value: {most_common[0]} (Count: {most_common[1]})")

def group_statistics(data, group_cols, dataset_name, max_groups=3):
    """Print overall statistics for the first few groups of rows.

    Rows are grouped by the tuple of their values in ``group_cols``. Groups
    are reported in order of first appearance in ``data``, and each one is
    summarized with overall_statistics().

    Args:
        data: List of row dicts; every row must contain all ``group_cols``.
        group_cols: Column names whose combined values form the group key.
        dataset_name: Label used in the printed headers.
        max_groups: Maximum number of groups to report. Defaults to 3, the
            limit that used to be hard-coded; pass None to report all.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n=== Grouped Statistics for {dataset_name} by {group_cols} ===")
    grouped_data = defaultdict(list)

    for row in data:
        key = tuple(row[col] for col in group_cols)
        grouped_data[key].append(row)

    for i, (group_key, group_rows) in enumerate(grouped_data.items()):
        # Stop once the requested number of groups has been printed.
        if max_groups is not None and i >= max_groups:
            break
        print(f"\nGroup: {group_key}")
        overall_statistics(group_rows, f"{dataset_name} - Group {group_key}")

def analyze_dataset(filepath):
    """Load one CSV file and print its overall and grouped statistics.

    A path that is not an existing file is reported and skipped. Otherwise
    the rows are loaded, overall statistics are printed, and, when there is
    at least one row, grouped statistics are printed using the first
    available grouping: ('page_id', 'ad_id'), then 'page_id', then 'state'.

    Args:
        filepath: Path of the CSV file to analyze.

    Returns:
        None. All results are written to stdout.
    """
    if not os.path.isfile(filepath):
        print(f"File {filepath} does not exist. Skipping...")
        return

    rows = load_data(filepath)
    dataset_name = os.path.basename(filepath)
    print(f"\nLoaded {len(rows)} rows from {dataset_name}")

    overall_statistics(rows, dataset_name)

    # With no rows there is no header to inspect, so no grouping is done.
    if not rows:
        return

    header = rows[0].keys()
    if 'page_id' in header:
        # Group by ad_id as well when that column is present.
        group_cols = ['page_id', 'ad_id'] if 'ad_id' in header else ['page_id']
    elif 'state' in header:
        group_cols = ['state']
    else:
        print(f"No grouping columns ('page_id', 'ad_id', 'state') found in {dataset_name}.")
        return

    group_statistics(rows, group_cols, dataset_name)

def main():
    """Analyze every file listed in DATA_PATHS, in order."""
    for csv_path in DATA_PATHS:
        analyze_dataset(csv_path)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



Loaded 246745 rows from 2024_fb_ads_president_scored_anon.csv

=== Overall Statistics for 2024_fb_ads_president_scored_anon.csv ===

Column: page_id
Count: 246745
Unique Values: 4475
Most Frequent Value: 4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d (Count: 55503)

Column: ad_id
Count: 246745
Unique Values: 246745
Most Frequent Value: 0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc (Count: 1)

Column: ad_creation_time
Count: 246745
Unique Values: 547
Most Frequent Value: 2024-10-27 (Count: 8619)

Column: bylines
Count: 245736
Unique Values: 3790
Most Frequent Value: HARRIS FOR PRESIDENT (Count: 49788)

Column: currency
Count: 246745
Unique Values: 18
Most Frequent Value: USD (Count: 246599)

Column: delivery_by_region
Count: 246745
Unique Values: 141122
Most Frequent Value: {} (Count: 30989)

Column: demographic_distribution
Count: 246745
Unique Values: 215622
Most Frequent Value: {} (Count: 30989)

Column: estimated_audience_size
Count: 246745
Mean