# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
from datasets import tqdm
import torch
from torch.utils.data import TensorDataset, DataLoader

In [2]:
path = 'reddit_mental_health.csv'

# data overview
df = pd.read_csv(path)
df

Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1
...,...,...,...,...
5952,1183,I’ve (24M) dealt with depression/anxiety for y...,Nobody takes me seriously,4
5953,1184,"""I don't feel very good, it's like I don't be...",selfishness,4
5954,1185,"I can't sleep most of the nights, meds didn't ...",Is there any way to sleep better?,4
5955,1186,"Hi, all. I have to give a presentation at work...",Public speaking tips?,4


In [3]:
# drop the rows with missing values
df1 = df.dropna()
df1

Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1
...,...,...,...,...
5952,1183,I’ve (24M) dealt with depression/anxiety for y...,Nobody takes me seriously,4
5953,1184,"""I don't feel very good, it's like I don't be...",selfishness,4
5954,1185,"I can't sleep most of the nights, meds didn't ...",Is there any way to sleep better?,4
5955,1186,"Hi, all. I have to give a presentation at work...",Public speaking tips?,4


In [4]:
# drop the 'row number' column since it's just an identifier
df2 = df1.drop(df1.columns[0], axis=1)
df2

# also note that we have a pretty balanced dataset!

Unnamed: 0,text,title,target
0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1
...,...,...,...
5952,I’ve (24M) dealt with depression/anxiety for y...,Nobody takes me seriously,4
5953,"""I don't feel very good, it's like I don't be...",selfishness,4
5954,"I can't sleep most of the nights, meds didn't ...",Is there any way to sleep better?,4
5955,"Hi, all. I have to give a presentation at work...",Public speaking tips?,4


In [5]:
# check if the datatypes are correct
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5607 entries, 0 to 5956
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5607 non-null   object
 1   title   5607 non-null   object
 2   target  5607 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 175.2+ KB


In [6]:
# change "title" and "text" columns to string datatype
df3 = df2.copy()
df3["title"] = df2["title"].astype("string")
df3["text"] = df2["text"].astype("string")

df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5607 entries, 0 to 5956
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5607 non-null   string
 1   title   5607 non-null   string
 2   target  5607 non-null   int64 
dtypes: int64(1), string(2)
memory usage: 175.2 KB


In [64]:
# combine "title" and "text" into one column called "combined_text"
df4 = df3.copy()
df4["combined_text"] = df3["title"] + " " + df3["text"]
df4.drop(columns=["title", "text"], inplace=True)
df4

Unnamed: 0,target,combined_text
0,1,"Regular check-in post, with information about our rules and wikis Welcome to /r/depression's check-in post - a place to take a moment and share what is going on and how you are doing. If you have an accomplishment you want to talk about (these shouldn't be standalone posts in the sub as they violate the ""role model"" rule, but are welcome here), or are having a tough time but prefer not to make your own post, this is a place you can share. ----- Our subreddit rules are located in the sidebar (you can also always access them at https://www.reddit.com/r/depression/about/rules) - since all of them exist for important safety reasons, we ask everyone here to read and follow them. Please click 'report' on any harmful content you see here - we always want to know and deal as soon as we can. We also have several wikis there for help with finding and giving support: https://www.reddit.com/r/depression/wiki/what_is_depression provides guidance about what is and isn't a depressive disorder, guidance on the complex nature of the illnesses that are usually grouped under the ""depression"" label, and redirect information for common off-topic issues. https://www.reddit.com/r/depression/wiki/giving_help offers information on the nature and value of peer support for mental-health issues in general, and lots of guidance for learning what is -- and isn't -- usually helpful in giving peer support. YSK that the types of rule violations that we most frequently see interfering with people getting safe and relevant support here are: - People breaking the private contact rule. You should never trust anyone who tries to get you into a private conversation in response to a post here. See https://www.reddit.com/r/depression/wiki/private_contact - ""I'm here to help"" posts. This shows that you don't understand the most basic principles of peer support, especially selectivity. The ""giving help"" wiki explains more about this. - Role modelling, i.e. ""achievement"" or ""advice"" posts. This is an expert-free zone -- that's what peer support means (rule 5). We know that ""internet culture"" celebrate not just bragging about your achievements but bragging about your good intentions. Nothing like that is ever acceptable here. - Content that's more about 'making a statement' or casually polling the sub than seeking personal support (or, in a comment, giving it) (rules 1, 2 and 10). - Off-topic posts about difficult situations or circumstances, including interpersonal losses. Grief, sadness, anger, and other difficult emotions are not mental illnesses. The ""what is depression"" wiki has suggestions for other places to post about these issues, which are 100% valid and serious but inappropriate here."
1,1,"Our most-broken and least-understood rules is ""helpers may not invite private contact as a first resort"", so we've made a new wiki to explain it We understand that most people who reply immediately to an OP with an invitation to talk privately mean only to help, but this type of response usually leads to either disappointment or disaster. it usually works out quite differently here than when you say ""PM me anytime"" in a casual social context. We have huge admiration and appreciation for the goodwill and good citizenship of so many of you who support others here and flag inappropriate content - even more so because we know that so many of you are struggling yourselves. We're hard at work behind the scenes on more information and resources to make it easier to give and get quality help here - this is just a small start. Our new wiki page explains in detail why it's much better to respond in public comments, at least until you've gotten to know someone. It will be maintained at /r/depression/wiki/private_contact, and the full text of the current version is below. ***** ###Summary### **Anyone who, while acting as a helper, invites or accepts private contact (I.e. PMs, chat, or any kind of offsite communication) early in the conversion is showing either bad intentions or bad judgement. Either way, it's unwise to trust them.** ""PM me anytime"" seems like a kind and generous offer. And it might be perfectly well-meaning, but, unless and until a solid rapport has been established, it's just not a wise idea. Here are some points to consider before you offer or accept an invitation to communicate privately. * **By posting supportive replies publicly, you'll help more people than just the OP. If your responses are of good quality, you'll educate and inspire other helpers.** [The 1-9-90 rule](https://en.wikipedia.org/wiki/1%25_rule_(Internet_culture\)) applies here as much as it does anywhere else on the internet. * People who are struggling with serious mental-health issues often (justifiably) have a low tolerance for disappointment and a high-level of ever-changing emotional need. **Unless the helper is able to make a 100% commitment to be there for them in every way, for as long as necessary, offering a personal inbox as a resource is likely to do more harm than good.** This is why mental-health crisis-line responders usually don't give their names and callers aren't allowed to request specific responders. It's much healthier and safer for the callers to develop a relationship with the agency as a whole. Analogously, it's much safer and healthier for our OPs to develop a relationship with the community as a whole. Even trained responders are generally not allowed to work high-intensity situations alone. It's partly about availability, but it's mostly about wider perspective and preventing compassion fatigue. * **If a helper gets in over their head with someone whose mental-health issues (including suicidality, which is often comorbid with depression) escalate, in a PM conversation it's much harder for others, including the /r/depression and /r/SuicideWatch moderators to help**. (Contrary to common assumptions, moderators can't see or police PMs.) * In our observation over many years, the people who say ""PM me"" the most are consistently the ones with the least understanding of mental-health issues and mental-health support. We all have gaps in our knowledge and in our ability to communicate effectively. Community input mitigates these limitations. **There's no reason why someone who's truly here to help would want to hide their responses from community scrutiny**. If helpers are concerned about their own privacy, keep in mind that self-disclosure, when used supportively, is more about the feelings than the details, and that we have no problem here with the use of alt/throwaway accounts, and have no restrictions on account age or karma. * We all know the internet is used by some people to exploit or abuse others. These people *do* want to hide their deceptive and manipulative responses from everyone except their victims. There are many of them who specifically target those who are vulnerable because of mental-health issues. **If a helper invites an OP to talk privately and gives them a good, supportive experience, they've primed that person to be more vulnerable to abusers.** This sort of cognitive priming tends to be particularly effective when someone's in a state of mental-health crisis, when people rely more on heuristics than critical reasoning. * If OPs want to talk privately, posting on a wide-open anonymous forum like reddit might not be the best option. Although we don't recommend it, we do allow OPs to request private contact when asking for support. If you want to do this, please keep your expectations realistic, and to have a careful look at the history of anyone who offers to PM before opening up to them."
2,1,"I haven’t been touched, or even hugged, in so long that I can’t even remember what it feels like… Anyone else just miss physical touch? I crave it so badly…"
3,1,"Being Depressed is Embarrassing I’m just so ashamed. Everyone and everything feels so far away. Every time I leave my house, I feel like I’m drowning in an ocean while watching people on the shore having a grand old time. I feel like everyone else is on a whole other plane of existence, meanwhile I’m stuck beneath the surface just trying to breathe. Every time I talk to someone, I have to check to see if there’s something on my face once I leave because other people don’t know what to say to me and it’s always so fucking awkward. I try so hard to appear normal, but other people see right through it. I can’t relate to anyone. I can’t talk to anyone. I try so hard but I just can’t. And it’s embarrassing. I feel stupid for being this way. I’ve tried so hard to fix myself and I don’t know what I’m doing wrong. I’ve been in therapy for over a decade and the only thing that’s changed is I’m really good at talking about myself."
4,1,I'm desperate for a friend and to feel loved by someone. I really need a friend. I don't even have a single best friend and I'm desperate to find true love. Please help me. No one responded to my last post. It seems like no one cares about me and my life doesn't matter. Female here just saying
...,...,...
5952,4,"Nobody takes me seriously I’ve (24M) dealt with depression/anxiety for years now. I used to be great with people, make good money, have the nice cars, great girlfriend, supportive parents, friends that I could say looked up to me etc. and then I was diagnosed with depression. Within about a year, I quit my job, lost my girlfriend even though she was great to me, and have yet to keep a stable job for more than a month at a time. My depression eventually was ruled to be “treatment resistant” after being on a number of meds and trying many other things. Some would work for a couple months and then I’d fall even further back from where I was. But now, after not having worked since early July of 2022, I am dealing with extensive and scary brain fog. I’ve incorporated working out, eating healthier, taking supplements and just trying to live a healthier lifestyle as I figured this was coming from a bad diet my whole life. I also got all blood work done including thyroid, basics, vitamin levels, testosterone etc and everything came back normal other than pretty high cholesterol. Both doctors I’ve seen (general practitioner, psychiatrist) has kinda blown me off when I tell them about the brain fog. Almost like they don’t really know what to say or what the next step should be to ruling out causes. I’m so scared as I’m feeling like I’m going crazy or have dementia. My parents are usually very supportive when I’m going through stuff but my mom doesn’t even want to hear me talk about it and my dad isn’t always around as he lives about 45 mins away. I ended up packing some things and driving to his house tonight without saying anything to my mom. I just hate feeling alone and don’t know what to do anymore. I feel like I’m at the end of my road"
5953,4,"selfishness ""I don't feel very good, it's like I don't belong in this world (I don't think I ever did). My friends are happy, and I'm always the one who's not really funny and who ruins the mood. So now I try to say as little as possible, people always ask me if I'm okay, if I'm tired, or worse, they say I'm scary. I think I'm just a mistake. People must find me weird or creepy, it's ruining me. A few years ago, my brother committed suicide, I felt very close to him, I think about his death all the time, I wish I could start my life over again and make the right choices (for once). What affects me the most is girls, I think... It's ridiculous, but I would love to have a relationship with a (very) pretty girl. Sometimes I think I've suffered so much that I would at least deserve that. I saw the damage my brother's death caused in my family, now I think if I didn't have a family to make suffer =&gt; suicide. My message is so selfish, and I know it. Thank you for reading these few lines..."" \-Lust"
5954,4,"Is there any way to sleep better? I can't sleep most of the nights, meds didn't help."
5955,4,"Public speaking tips? Hi, all. I have to give a presentation at work next week (45 minutes long and the CEO will be in attendance). I’m already panicking, as once the anxiety kicks in, I’m certain I’m going to forget everything I’m supposed to say. ( anxiety makes it very difficult for me to focus on anything) Does anyone have any speaking tips that have worked for them in the past? Thanks so much!"


# BERT Embeddings

In [44]:
# Load the pre-trained BERT tokenizer and model.
# We use BertModel (without a classification head) for feature extraction.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # Set the model to evaluation mode to ensure that dropout and other training-specific layers are deactivated

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [45]:
# Parameters for tokenization
max_length = 128  # Adjust as needed
batch_size = 16   # Adjust batch size based on your hardware

# Tokenize all texts
texts = df4["combined_text"].tolist()
labels = df4["target"].tolist()

# Tokenize with padding and truncation. Setting return_tensors="pt" to get PyTorch tensors.
encoded_inputs = tokenizer(texts,
                           padding=True,
                           truncation=True,
                           max_length=max_length,
                           return_tensors='pt')

In [65]:
# Create a PyTorch dataset and DataLoader for batching.
dataset = TensorDataset(encoded_inputs['input_ids'],
                        encoded_inputs['attention_mask'],
                        torch.tensor(labels))
dataloader = DataLoader(dataset, batch_size=batch_size)

# Initialize a list to hold embeddings and corresponding labels.
all_embeddings = []
all_labels = []

# Disable gradient calculation for inference.
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Generating BERT Embeddings"):
        input_ids, attention_mask, batch_labels = batch

        # Get model outputs. outputs.last_hidden_state has shape (batch_size, sequence_length, hidden_size)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Extract the embedding corresponding to the [CLS] token (first token)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # shape: (batch_size, hidden_size)

        # Append to our list (move to CPU and convert to numpy array)
        all_embeddings.append(cls_embeddings.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

# Stack all embeddings into a single NumPy array.
X = np.vstack(all_embeddings)  # Shape: (num_samples, hidden_size)
y = np.array(all_labels)


Generating BERT Embeddings:   0%|          | 0/701 [00:00<?, ?it/s]

In [66]:
# make sure X has the right shape
X.shape

(5607, 768)

In [67]:
# make sure y has the right shape
y.shape

(5607,)

# Classifier

## JUST AN LOGISTIC REGRESSION EXAMPLE - MAGGIE

In [61]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use a logistic regression classifier (or choose any other classifier)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.6488413547237076
