In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from preprocess import *
from visualize import *
from classification import *
from Sampling import *
from feature_engineering import *

# Register tqdm's pandas integration so that `progress_apply` (used in the
# text-preprocessing cell below) is available on pandas objects.
tqdm.pandas()

# 1. Data Preprocessing

In [None]:
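# One-time data assembly: uncomment this cell on the first run to build
# data_samples/RMHD_raw.csv from the per-month CSV files, then comment it out again.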
# base_dir = Path('./RMHD/raw data')  # Current dir
# monthly_df = []
# years = ['2019', '2020', '2021', '2022']

# for year in years:
#     year_dir = base_dir / year
        
#     print(f"Processing year: {year}")
#     # month folder in year
#     for subdir_path in year_dir.iterdir():
#         if not subdir_path.is_dir():
#             continue
            
#         print(f"  Processing subdirectory: {subdir_path.name}")
#         # Find all CSV files in the subdir
#         csv_files = list(subdir_path.glob('*.csv'))
        
#         for csv_file in csv_files:
#             print(f"    Reading file: {csv_file.name}")
#             df = pd.read_csv(csv_file)
#             df = df.drop(df.columns[0], axis=1) # drop first col
#             monthly_df.append(df)

# # Concat all dataframes
# if monthly_df:
#     df = pd.concat(monthly_df, ignore_index=True)
# else:
#     print("No CSV files found or all files were empty.")

# df.to_csv('data_samples/RMHD_raw.csv', index=False)

In [None]:
# Load the consolidated raw dataset and print its column / dtype summary.
df = pd.read_csv(
    'data_samples/RMHD_raw.csv',
)
df.info()

In [None]:
# Class-balance check: number of rows per subreddit in the raw data.
subreddit_counts = df['subreddit'].value_counts()
print(subreddit_counts)

#countplot("subreddit distribution", df, "subreddit")

In [None]:
# Take top 5 subreddit records.
df = limit_subreddits(df)

# Undersample; the sampler is also handed a CSV path for its output.
df_sampled = sample_mental_health_data(
    df=df,
    output_file='data_samples/RMHD_Sampled.csv',
)
print(df_sampled['subreddit'].value_counts())

# Pre-process the sampled frame (drop NA, datetime).
df_processed = preprocess(df_sampled)

In [None]:
# plt.figure(figsize=(16, 8))

# sns.barplot(df['timestamp'].dt.to_period('M').value_counts().sort_index())

# # Customize appearance
# plt.title('Monthly Post Frequency', fontsize=16)
# plt.xlabel('Month', fontsize=14)
# plt.ylabel('Number of Posts', fontsize=14)
# plt.xticks(rotation=45, ha='right')
# plt.grid(axis='y', linestyle='--', alpha=0.7)

# 2. Text Preprocessing 

In [None]:
# Clean both free-text columns; `progress_apply` displays a tqdm progress bar.
df_processed['title_processed'] = df_processed['title'].progress_apply(text_process)
df_processed['selftext_processed'] = df_processed['selftext'].progress_apply(text_process)

# Persist the frame that carries the processed-text columns. The new columns
# were added to `df_processed`, not to `df` (the pre-sampling frame), so saving
# `df` here would write a file without the text-processing results.
df_processed.to_csv("data_samples/RMHD_TextProcessed.csv", index=False)

# Display example results (first row, before vs. after processing)
print("Original Title:")
print(df_processed['title'].iloc[0])
print("\nProcessed Title:")
print(df_processed['title_processed'].iloc[0])
print("\nOriginal Selftext:")
print(df_processed['selftext'].iloc[0])
print("\nProcessed Selftext:")
print(df_processed['selftext_processed'].iloc[0])

In [None]:
# The processed-text columns were created on `df_processed`, so read from that
# frame; `df_sampled` is only guaranteed to have them if `preprocess` returned
# the very same object.
subreddits = df_processed['subreddit'].value_counts().head().index

# One word cloud per subreddit and text field: all titles first, then all
# selftexts (same plotting order as two separate loops).
for column, label in (('title_processed', 'Title'), ('selftext_processed', 'SelfText')):
    for subreddit in subreddits:
        in_subreddit = df_processed['subreddit'] == subreddit
        text = ' '.join(df_processed.loc[in_subreddit, column])
        plot_wordcloud(text, f"{subreddit} {label} Word Cloud")

In [None]:
# Number of highest-weight words to list for each LDA topic.
N_TOP_WORDS = 20

# Combine both processed text fields into a single document per post. The
# processed columns live on `df_processed` (that is where they were created).
df_processed['text'] = df_processed['title_processed'] + " " + df_processed['selftext_processed']
lda_topics, lda, count_vectorizer = extract_lda_features(df_processed['text'])

# Assign each post the topic with the largest value in its row of `lda_topics`,
# then count posts per (dominant topic, subreddit) pair.
df_processed['dominant_topic'] = np.argmax(lda_topics, axis=1)
topic_class_matrix = (
    df_processed
    .groupby(['dominant_topic', 'subreddit'])
    .size()
    .unstack()
    .fillna(0)  # topic/subreddit pairs with no posts
)

plt.figure(figsize=(10, 6))
sns.heatmap(topic_class_matrix, annot=True, fmt='.0f', cmap='YlGnBu')
plt.title("Topic-Class Distribution Heatmap (LDA)")
plt.xlabel("Subreddit")
plt.ylabel("Dominant Topic")
plt.savefig('Topic-Class_Distribution_Heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Use the vectorizer returned alongside the LDA model so the vocabulary indices
# line up with `lda.components_` (the bare name `vectorizer` is not defined in
# this notebook).
feature_names = count_vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    # Indices of the N_TOP_WORDS largest weights, highest first.
    top_words_idx = topic.argsort()[::-1][:N_TOP_WORDS]
    top_words = [feature_names[idx] for idx in top_words_idx]
    print(f"Topic #{i+1}:")
    print(", ".join(top_words))
    print()

Topic #1: Keywords such as “dont”, “want”, “life”, “feel”, “anymore”, “fucking” suggest strong mood swings that may involve despair, helplessness, or dissatisfaction with life, and may even involve depression or suicidal thoughts.

Topic #2: Keywords such as “feel”, “really”, “want”, “time”, “thing” suggest that the topic may be related to inner feelings and confusion in life, and may involve self-doubt or uncertainty.

Topic #3: Keywords such as “friend”, “year”, “school”, “job” suggest that this topic may be related to life experiences such as changes in upbringing, education, work, and friendships.

Topic #4: Keywords such as “people”, “like”, “friend”, “make” suggest that this topic may be related to social relationships, interpersonal interactions, and perceptions of other people or society, and may be associated with feelings of loneliness or social anxiety.

Topic #5: Keywords such as “anxiety”, “attack”, “help”, “day”, “time” explicitly point to symptoms of anxiety and may relate to panic attacks, help-seeking, and mental health issues.