In [6]:
import polars as pl
from polars import Float64, Float32, Int64, Int32, Int16, Int8, UInt64, UInt32, UInt16, UInt8

import os

from collections import defaultdict, Counter

# Make this folder the working directory so that the relative CSV file names
# passed to pl.read_csv further down resolve against it.
# NOTE(review): absolute, user-specific path — presumably only valid on the
# author's machine; confirm before running elsewhere.
os.chdir("/Users/kishanterdal/Downloads/period_03/")

def is_numeric_dtype(dtype):
    """Return True when ``dtype`` is one of the float or integer dtypes below.

    Only the ten dtypes imported from polars at the top of the file are
    recognised; any other dtype yields False.
    """
    float_dtypes = {Float64, Float32}
    signed_int_dtypes = {Int64, Int32, Int16, Int8}
    unsigned_int_dtypes = {UInt64, UInt32, UInt16, UInt8}
    recognised = float_dtypes | signed_int_dtypes | unsigned_int_dtypes
    return dtype in recognised

def analyze_dataframe_with_polars(df: pl.DataFrame):
    """Profile the numeric and non-numeric columns of ``df`` separately.

    Columns are classified with ``is_numeric_dtype``.

    Args:
        df: The frame to summarise.

    Returns:
        A ``(numeric_stats, df_non_numeric_stats)`` tuple.

        ``numeric_stats`` is ``None`` when ``df`` has no numeric column.
        Otherwise it is the transposed ``describe()`` output for the numeric
        columns, with the transpose header column renamed to ``"Metric"`` and
        a constant ``"Type"`` column set to ``"Numeric"``.

        ``df_non_numeric_stats`` is built from one record per non-numeric
        column with the keys ``Column``, ``Type``, ``Count``,
        ``Unique Values``, ``Most Frequent`` and ``Frequency``. All four
        statistics are computed over the non-null values only; a column with
        no non-null values reports ``Count`` 0, ``Unique Values`` 0 and
        ``None`` for ``Most Frequent`` / ``Frequency``. When several values
        share the highest frequency, the one that appears first in the column
        is reported.
    """
    numeric_cols = [col for col, dtype in df.schema.items() if is_numeric_dtype(dtype)]
    non_numeric_cols = [col for col, dtype in df.schema.items() if not is_numeric_dtype(dtype)]

    numeric_stats = None
    if numeric_cols:
        numeric_stats = df.select(numeric_cols).describe().transpose(include_header=True)
        numeric_stats = numeric_stats.rename({"column": "Metric"})
        numeric_stats = numeric_stats.with_columns([pl.lit("Numeric").alias("Type")])

    non_numeric_stats = []
    for col in non_numeric_cols:
        # Drop nulls once; every statistic below describes the same non-null
        # values, so the row is internally consistent.
        non_null = df.select(pl.col(col)).drop_nulls()

        if non_null.is_empty():
            unique_count = 0
            most_freq_val = None
            most_freq_count = None
        else:
            # maintain_order on both steps makes ties resolve to the value
            # seen first in the column instead of an arbitrary one.
            value_counts = (
                non_null.group_by(col, maintain_order=True)
                .agg(pl.len().alias("count"))
                .sort("count", descending=True, maintain_order=True)
            )
            # One group per distinct non-null value.
            unique_count = value_counts.height
            most_freq_val = value_counts[0, col]
            most_freq_count = value_counts[0, "count"]

        non_numeric_stats.append({
            "Column": col,
            "Type": "Non-Numeric",
            "Count": non_null.height,
            "Unique Values": unique_count,
            "Most Frequent": most_freq_val,
            "Frequency": most_freq_count,
        })

    df_non_numeric_stats = pl.DataFrame(non_numeric_stats)

    return numeric_stats, df_non_numeric_stats

# Facebook ads: load the CSV, then summarise its columns.
ads_df = pl.read_csv("2024_fb_ads_president_scored_anon.csv")
ads_num, ads_non_num = analyze_dataframe_with_polars(ads_df)

# Facebook posts: load the CSV, then summarise its columns.
fb_posts_df = pl.read_csv("2024_fb_posts_president_scored_anon.csv")
fb_posts_num, fb_posts_non_num = analyze_dataframe_with_polars(fb_posts_df)

# Twitter posts: load the CSV, then summarise its columns.
tw_posts_df = pl.read_csv("2024_tw_posts_president_scored_anon.csv")
tw_posts_num, tw_posts_non_num = analyze_dataframe_with_polars(tw_posts_df)


In [7]:
# Each line below is a bare expression naming one of the summary frames
# produced by analyze_dataframe_with_polars.
# NOTE(review): in a notebook cell only the value of the final expression is
# normally rendered, which is presumably why a single table follows this
# cell — confirm whether the other five frames were meant to be shown too.
ads_num
ads_non_num

fb_posts_num
fb_posts_non_num

tw_posts_num
tw_posts_non_num

Column,Type,Count,Unique Values,Most Frequent,Frequency
str,str,i64,i64,str,i64
"""id""","""Non-Numeric""",27304,27304,"""e353bf5c558773dbf3de874081a538…",1
"""url""","""Non-Numeric""",27304,27304,"""d6ab6946a11d0a7cc73c9f86662cd4…",1
"""source""","""Non-Numeric""",27304,14,"""Twitter Web App""",14930
"""createdAt""","""Non-Numeric""",27304,27014,"""2024-08-30 03:04:08""",4
"""lang""","""Non-Numeric""",27304,12,"""en""",27281
…,…,…,…,…,…
"""isRetweet""","""Non-Numeric""",27304,1,"""false""",27304
"""isQuote""","""Non-Numeric""",27304,2,"""false""",24064
"""isConversationControlled""","""Non-Numeric""",27304,2,"""false""",27296
"""month_year""","""Non-Numeric""",27304,15,"""2024-10""",3586
