### Clustering

In [13]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint
from datasets import load_dataset
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haitong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load dataset
ds = load_dataset("solomonk/reddit_mental_health_posts")

# Print dataset structure
print(f"Dataset structure: {ds}")
print(f"\nFeatures: {ds['train'].features}")
print(f"\nNumber of examples: {len(ds['train'])}")

# Convert to pandas dataframe, show first few rows
df = ds['train'].to_pandas()
df.head()

Repo card metadata block was not found. Setting CardData to empty.


Dataset structure: DatasetDict({
    train: Dataset({
        features: ['author', 'body', 'created_utc', 'id', 'num_comments', 'score', 'subreddit', 'title', 'upvote_ratio', 'url'],
        num_rows: 151288
    })
})

Features: {'author': Value(dtype='string', id=None), 'body': Value(dtype='string', id=None), 'created_utc': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'num_comments': Value(dtype='int64', id=None), 'score': Value(dtype='int64', id=None), 'subreddit': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'upvote_ratio': Value(dtype='float64', id=None), 'url': Value(dtype='string', id=None)}

Number of examples: 151288


Unnamed: 0,author,body,created_utc,id,num_comments,score,subreddit,title,upvote_ratio,url
0,HotConversation1273,A few months ago I was accepted into this full...,2021-12-22T18:32:56.000Z,rmbjwb,1,1,ADHD,I get extremely anxious if I’m not working 24/7,1.0,https://www.reddit.com/r/ADHD/comments/rmbjwb/...
1,snorefestt,"Hey guys, I was curious if anyone else has the...",2021-12-22T18:24:25.000Z,rmbd1y,3,5,ADHD,"I can't will myself to clean my own house, but...",1.0,https://www.reddit.com/r/ADHD/comments/rmbd1y/...
2,etyf12,\n\ni have 6 exams in the next 2 weeks one of...,2021-12-22T18:22:52.000Z,rmbbvu,1,2,ADHD,i need some help,1.0,https://www.reddit.com/r/ADHD/comments/rmbbvu/...
3,GetHairOrDieTryin,Is there anyone out there that is struggling w...,2021-12-22T18:20:35.000Z,rmba1t,3,2,ADHD,Anyone up for a chat?,1.0,https://www.reddit.com/r/ADHD/comments/rmba1t/...
4,ZeroTransPat,"Whenever I get hungry, I never eat because I d...",2021-12-22T18:18:47.000Z,rmb8lm,2,1,ADHD,Figuring out what to eat sucks,1.0,https://www.reddit.com/r/ADHD/comments/rmb8lm/...


In [17]:
# Check missing values
print("Missing values before cleaning:")
print(df.isnull().sum())

# Remove rows with missing values in 'body' column
df = df.dropna(subset=['body'])

print("\nMissing values after cleaning:")
print(df.isnull().sum())
print(f"\nTotal rows remaining: {len(df)}")

Missing values before cleaning:
author               0
body              1609
created_utc          0
id                   0
num_comments         0
score                0
subreddit            0
title                0
upvote_ratio         0
url                  0
processed_body       0
dtype: int64

Missing values after cleaning:
author            0
body              0
created_utc       0
id                0
num_comments      0
score             0
subreddit         0
title             0
upvote_ratio      0
url               0
processed_body    0
dtype: int64

Total rows remaining: 149679


In [18]:
def preprocess_post(post):    
    post = re.sub(r'\n', r' ', post) 
    post = re.sub(r'[^\x00-\x7f]', r'', post) 
    post = post.lower().split() 
    post = [word for word in post if word not in stop_words] 
    post = [word for word in post if not word.startswith('http')] 
    post = [word for word in post if not word.startswith('@')] 
    post = [word.translate(str.maketrans('', '', string.punctuation)) for word in post] 
    post = [word for word in post if word] #remove empty strings
    return post

df['processed_body'] = df['body'].apply(preprocess_post)
print(df['processed_body'][:20])

0     [months, ago, accepted, full, time, software, ...
1     [hey, guys, curious, anyone, else, issue, me, ...
2     [6, exams, next, 2, weeks, one, monday, havent...
3     [anyone, struggling, addadhd, thats, interesti...
4     [whenever, get, hungry, never, eat, know, eat,...
5                                             [removed]
6     [im, 20, mg, lexapro, 50, mg, vyvanse, wonderi...
7     [recently, big, lifestyle, change, new, job, p...
8     [newly, diagnosed, 42, yr, old, female, starte...
9     [tldr, rough, time, titrating, concerta, last,...
10                                            [removed]
11    [adhd, highly, suspect, ocd, hear, lot, intrus...
12    [hello, everyone, life, falling, apart, take, ...
13    [currently, working, temporary, employment, co...
14    [need, advice, dont, know, do, ive, ruined, re...
15                                            [removed]
16                                            [deleted]
17                                            [r

In [19]:
dictionary = Dictionary(df['processed_body'])
corpus = [dictionary.doc2bow(post) for post in df['processed_body']]

lda = LdaModel(
               corpus=corpus,
               id2word=dictionary,
               num_topics=5,
               alpha="auto",
               eta="auto",
               random_state=42,
               passes=5,
               iterations=1000)

print("Topics discovered by LDA model:")
pprint(lda.print_topics(num_words=10))

Topics discovered by LDA model:
[(0,
  '0.018*"like" + 0.013*"feel" + 0.012*"im" + 0.010*"know" + 0.010*"me" + '
  '0.009*"get" + 0.008*"even" + 0.008*"time" + 0.008*"want" + 0.007*"cant"'),
 (1,
  '0.011*"ptsd" + 0.009*"sleep" + 0.008*"anxiety" + 0.008*"day" + '
  '0.007*"night" + 0.007*"get" + 0.006*"started" + 0.006*"time" + 0.006*"work" '
  '+ 0.006*"panic"'),
 (2,
  '0.217*"removed" + 0.016*"song" + 0.016*"threw" + 0.015*"smoking" + '
  '0.013*"prison" + 0.011*"music" + 0.010*"blue" + 0.010*"snapped" + '
  '0.009*"songs" + 0.008*"cruel"'),
 (3,
  '0.015*"ocd" + 0.013*"ptsd" + 0.011*"people" + 0.009*"anyone" + 0.008*"also" '
  '+ 0.007*"would" + 0.006*"trauma" + 0.006*"therapist" + 0.006*"thoughts" + '
  '0.005*"something"'),
 (4,
  '0.476*"deleted" + 0.012*"surgery" + 0.007*"survivor" + 0.007*"shoulder" + '
  '0.007*"kiss" + 0.006*"stood" + 0.006*"murder" + 0.005*"fought" + '
  '0.005*"intimacy" + 0.005*"graphic"')]
