<a href="https://colab.research.google.com/github/Kunal-Ahirrao/Task_04_Descriptive_Stats/blob/main/polars_stats_fb_posts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import polars as pl

# Load the dataset
df = pl.read_csv("/content/2024_fb_posts_president_scored_anon.csv")

# Some numeric fields are read as text because they contain comma separators.
# Convert a string column to Float64 only when every non-null value parses
# after the commas are stripped; all other string columns are left untouched.
cleaned_cols = []
for col in df.columns:
    if df[col].dtype != pl.Utf8:
        continue
    original = df[col]
    # strict=False turns unparseable values into nulls instead of raising, so
    # no blanket exception handler is needed to detect a non-numeric column.
    converted = (
        original.str.replace_all(",", "", literal=True)
        .cast(pl.Float64, strict=False)
    )
    # A higher null count means at least one value failed to parse: skip it.
    if converted.null_count() != original.null_count():
        continue
    df = df.with_columns(converted.alias(col))
    cleaned_cols.append(col)

# Automatically detect numeric columns
numeric_cols = [col for col in df.columns if df[col].dtype in [pl.Float64, pl.Int64]]

# The report is accumulated as a list of text entries and joined when saved.
output_lines = ["=== Column Types ==="]

# --- Column types: one "name: dtype" entry per column ---
output_lines.extend(f"{name}: {df[name].dtype}" for name in df.columns)
output_lines.append("\n")

# --- Basic descriptive stats ---
# Produces a single-row frame whose columns are ordered by statistic (all
# means, then all stds, mins and maxes), each named "<column>_<stat>".
output_lines.append("=== Basic Descriptive Stats ===")
summary_exprs = [
    getattr(pl.col(name), stat)().alias(f"{name}_{stat}")
    for stat in ("mean", "std", "min", "max")
    for name in numeric_cols
]
stats = df.select(summary_exprs)
output_lines.append(stats.to_pandas().to_string())
output_lines.append("\n")

# === Grouped by account_type ===
# Mean of every numeric column per account_type; only the first 10 groups are
# written to the report.
if "account_type" in df.columns:
    output_lines.append("=== Grouped by account_type (Mean Only, First 10) ===")
    # maintain_order=True keeps groups in first-appearance order. Without it
    # Polars returns groups in an unspecified order, so head(10) could select
    # different groups from one run to the next.
    group1 = df.group_by("account_type", maintain_order=True).agg([
        pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols
    ])
    output_lines.append(group1.head(10).to_pandas().to_string())
    output_lines.append("\n")
else:
    output_lines.append("Column 'account_type' not found — skipping group.\n")

# === Grouped by account_id and post_id ===
# Mean of every numeric column per (account_id, post_id) pair; only the first
# 10 groups are written to the report.
if "account_id" in df.columns and "post_id" in df.columns:
    output_lines.append("=== Grouped by account_id and post_id (Mean Only, First 10) ===")
    # maintain_order=True keeps groups in first-appearance order. Without it
    # Polars returns groups in an unspecified order, so head(10) could select
    # different groups from one run to the next.
    group2 = df.group_by(["account_id", "post_id"], maintain_order=True).agg([
        pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols
    ])
    output_lines.append(group2.head(10).to_pandas().to_string())
    output_lines.append("\n")
else:
    output_lines.append("Missing 'account_id' or 'post_id' — skipping group.\n")

# Persist the report: entries are newline-separated and written as UTF-8.
report_text = "\n".join(output_lines)
with open("polars_output_posts.txt", mode="w", encoding="utf-8") as report_file:
    report_file.write(report_text)

print("✅ polars_output_posts.txt has been saved.")


✅ polars_output_posts.txt has been saved.
