# MindLens-AI — 01: Exploratory Data Analysis

Analyze the Reddit Depression dataset to understand class distribution, text characteristics, and key patterns before model training.

In [None]:
# Setup
import sys, os
sys.path.insert(0, os.path.abspath(".."))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

from src.preprocessing import (
    load_primary_dataset,
    preprocess_pipeline,
    save_processed,
)
from src.features import build_tfidf

# Plot defaults applied to every figure created below
sns.set_theme(palette="muted", style="whitegrid")
plt.rcParams.update({
    "figure.figsize": (12, 5),
    "figure.dpi": 100,
})

print("Setup complete ✓")

## 1. Load & Preprocess Data

In [None]:
# Read the raw CSV through the project loader and report its size
raw_path = "../data/raw/depression_dataset_reddit_cleaned.csv"
raw_df = load_primary_dataset(raw_path)
print(f"Raw dataset: {raw_df.shape[0]} rows, {raw_df.shape[1]} cols")
raw_label_counts = raw_df["label"].value_counts()
print(raw_label_counts)

# Run the project preprocessing pipeline with remove_stopwords switched off
df = preprocess_pipeline(raw_df, remove_stopwords=False)
print(f"\nProcessed dataset: {df.shape[0]} rows")
# Last expression of the cell: the notebook renders the first rows
df.head()

## 2. Dataset Overview

In [None]:
# Structural overview of the processed frame: shape, column info, describe()
frame_shape = df.shape
print("Shape:", frame_shape)

print("\n--- Info ---")
df.info()

print("\n--- Describe ---")
described = df.describe()
# Last expression of the cell: the notebook renders the describe() table
described

## 3. Class Distribution

In [None]:
# Bar chart of the number of rows per class, followed by a printed balance summary.
class_names = {0: "No Risk", 1: "Risk"}
class_colors = {0: "#4CAF50", 1: "#F44336"}

# Reindex to the two expected labels in a fixed order. A class that is absent
# from the data then shows up as 0 instead of making the tick labels and the
# bar heights differ in length (which makes ax.bar raise).
counts = df["label"].value_counts().reindex(list(class_names), fill_value=0)

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(
    [f"{class_names[lab]} ({lab})" for lab in counts.index],
    counts.values,
    color=[class_colors[lab] for lab in counts.index],
)

# Write each count just above its bar. The gap is given in points rather than
# data units, so it looks the same whether the classes hold tens of rows or
# tens of thousands.
for bar, val in zip(bars, counts.values):
    ax.annotate(
        str(val),
        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
        xytext=(0, 3),
        textcoords="offset points",
        ha="center",
        va="bottom",
        fontweight="bold",
    )

# Extra headroom so the label over the tallest bar stays clear of the title
ax.margins(y=0.1)
ax.set_title("Class Distribution", fontsize=14)
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

# Percentages are taken over all rows of df
total_rows = len(df)
print(f"Class balance — 0: {counts.loc[0]} ({counts.loc[0]/total_rows*100:.1f}%)  "
      f"1: {counts.loc[1]} ({counts.loc[1]/total_rows*100:.1f}%)")

## 4. Text Length & Feature Distributions

In [None]:
# Overlaid per-class histograms, one panel per feature, on a 2x3 grid
features = ["word_count", "char_count", "avg_word_length", "word_density", "unique_word_ratio"]
labels_map = {0: "No Risk", 1: "Risk"}
colors = {0: "#4CAF50", 1: "#F44336"}

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# axes.flat walks the grid row by row, so panels fill left-to-right, top-to-bottom
for panel, feat in zip(axes.flat, features):
    for lab in (0, 1):
        class_values = df.loc[df["label"] == lab, feat]
        panel.hist(
            class_values,
            bins=50,
            alpha=0.6,
            label=labels_map[lab],
            color=colors[lab],
        )
    panel.set_title(feat, fontsize=12)
    panel.legend()

# Five features on six axes: hide the unused bottom-right panel
axes[1, 2].set_visible(False)
plt.suptitle("Feature Distributions by Class", fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

## 5. Word Clouds

In [None]:
# One word cloud per class, drawn side by side
cloud_specs = [(0, "No Risk", "Greens"), (1, "Risk", "Reds")]
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for panel, (lab, title, cmap) in zip(axes, cloud_specs):
    # Join every non-missing text of this class into one space-separated string
    class_texts = df.loc[df["label"] == lab, "text"].dropna()
    text_blob = " ".join(class_texts)

    cloud_builder = WordCloud(
        width=800,
        height=400,
        background_color="white",
        colormap=cmap,
        max_words=100,
    )
    wc = cloud_builder.generate(text_blob)

    panel.imshow(wc, interpolation="bilinear")
    panel.set_title(title, fontsize=14)
    panel.axis("off")

plt.suptitle("Word Clouds by Class", fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

## 6. Top TF-IDF Terms per Class

In [None]:
# Top-20 terms by mean TF-IDF weight, with the vectorizer fitted separately on
# each class's texts via the project's build_tfidf helper.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for idx, (lab, title, color) in enumerate([(0, "No Risk — Top 20 TF-IDF", "#4CAF50"),
                                            (1, "Risk — Top 20 TF-IDF", "#F44336")]):
    # Drop missing texts, as the word-cloud cell does, so both views of a class
    # are built from the same rows.
    subset_texts = df[df["label"] == lab]["text"].dropna()
    vec, X = build_tfidf(subset_texts, max_features=5000)

    # Mean weight per term as a flat 1-D array. np.asarray(...).ravel() accepts
    # an np.matrix result (the only type that has `.A1`) as well as a plain
    # ndarray, so this does not depend on which container build_tfidf returns.
    mean_tfidf = np.asarray(X.mean(axis=0)).ravel()

    # argsort is ascending, so the last 20 positions are the 20 largest means.
    # barh draws the first entry at the bottom, which puts the largest on top.
    top_idx = mean_tfidf.argsort()[-20:]
    words = np.array(vec.get_feature_names_out())[top_idx]
    scores = mean_tfidf[top_idx]

    axes[idx].barh(words, scores, color=color)
    axes[idx].set_title(title, fontsize=12)
    axes[idx].set_xlabel("Mean TF-IDF")

plt.tight_layout()
plt.show()

## 7. Feature Correlations

In [None]:
# Correlation heatmap of the numeric features together with the label
numeric_cols = ["word_count", "char_count", "avg_word_length", "word_density", "unique_word_ratio", "label"]
corr = df.loc[:, numeric_cols].corr()

heatmap_options = dict(annot=True, fmt=".2f", cmap="RdBu_r", center=0)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title("Feature Correlation Matrix", fontsize=14)
plt.tight_layout()
plt.show()

## 8. Summary Statistics by Class

In [None]:
# Mean / median / std of each feature, grouped by label
summary_features = [
    "word_count",
    "char_count",
    "avg_word_length",
    "word_density",
    "unique_word_ratio",
]
summary_stats = ["mean", "median", "std"]
summary = df.groupby("label")[summary_features].agg(summary_stats)

# Flatten the two-level (feature, statistic) header into "feature_statistic"
summary.columns = [f"{feat}_{stat}" for feat, stat in summary.columns]
# Last expression of the cell: the notebook renders the table
summary

## 9. Save Processed Data

In [None]:
# Hand the processed frame to the project's save helper
processed_path = "../data/processed/primary_clean.csv"
save_processed(df, processed_path)
print("EDA complete ✓")