In [1]:
import pickle
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.svm as svm
import sklearn.metrics as metrics
import numpy as np

In [2]:
depressed_pickle = open("depressed.pickle","rb")
depressed_dict = pickle.load(depressed_pickle)
depressed_df = pd.DataFrame(depressed_dict)
depressed_df.drop_duplicates(subset=['id'], inplace=True)
print("test Total number of comments: ", sum(depressed_df['num_comments']))

test Total number of comments:  121355


In [3]:
non_depressed_pickle = open("non_depressed.pickle","rb")
non_depressed_dict = pickle.load(non_depressed_pickle)
non_depressed_df = pd.DataFrame(non_depressed_dict)
non_depressed_df.drop_duplicates(subset=['id'], inplace=True)
print("test Total number of comments: ", sum(non_depressed_df['num_comments']))

test Total number of comments:  178277


In [4]:
depressed_comments_arr = depressed_df['comments'].tolist()
non_depressed_comments_arr = non_depressed_df['comments'].tolist()

In [5]:
depressed_comments = [comment for post in depressed_df['comments'] for comment in post]
non_depressed_comments = [comment for post in non_depressed_df['comments'] for comment in post]

In [6]:
print("depressed length ", len(depressed_comments))
print("non depressed length ", len(non_depressed_comments))

depressed length  116900
non depressed length  127911


In [7]:
def clean_one_comment(comment):
    comment = comment.lower()
    # remove /r/, /u/, numbers
    comment = re.sub(r"(\/r\/)|(\/u\/)|(\d+)","",comment)
    # remove punctuation
    comment = comment.translate(str.maketrans('','', string.punctuation))
    # replacing ' 
    comment = comment.replace('’','')
    comment = comment.replace('\n'," ")
    # remove trailing whitespace
    comment = comment.strip()
    return comment


In [None]:
def clean_comments(comments):
    removed_comments = ['[deleted]', '[removed]']
    filtered_comments = [comment for comment in comments if comment not in removed_comments]
    comments_clean = list()
    for comment in filtered_comments:
        comment_new = clean_one_comment(comment)
        comments_clean.append(comment_new)
    return comments_clean

In [None]:
#test_comments = depressed_comments[:10]
#print(test_comments)
print('orig length: ', len(depressed_comments))
print(depressed_comments[:10])
depressed_comments_cleaned = clean_comments(depressed_comments)
print('cleaned length: ', len(depressed_comments_cleaned))
#print(depressed_comments_cleaned)

orig length:  116900
["Sorry if I'm stupid, but what's activism? I don't want to accidentally break the rules", 'Ah thank you, usually I just see it as a load of horsecrap of people wanting to feel good by repeating lines of comfort words to a person and thinking they did some good not knowing that they made a depressed person lose eveb more hope and reason to live with their plastic character. Some genuinely care but not all.', 'Lol somebody in my town thought that day was a good day for suicide good job. Small town of like 14k citizens 5 suicides under 18 in 1 year. Thinking about joining the club as well.', 'I really appreciate this post. The "activism" and "awareness" days are honestly so triggering for me that I\'d like to pretend they don\'t exist.', 'donate to my mom lol\n\n&#x200B;', '[removed]', '[deleted]', '[deleted]', '[deleted]', "Basically anything that's aimed at raising awreness (or money) among the general public.  Totally out of place in a support community.  Our prob

In [None]:
#print('orig length: ', len(non_depressed_comments))
non_depressed_comments_cleaned = clean_comments(non_depressed_comments)
print('cleaned length: ', len(non_depressed_comments_cleaned))
#print(depressed_comments_cleaned)

cleaned length:  127559


In [None]:
depressed_df = pd.DataFrame(depressed_comments_cleaned)
depressed_df.columns = ['text']
depressed_df['label'] = 1 # note 1 = depressed, 0 = not depressed
depressed_df

Unnamed: 0,text,label
0,sorry if im stupid but whats activism i dont w...,1
1,ah thank you usually i just see it as a load o...,1
2,lol somebody in my town thought that day was a...,1
3,i really appreciate this post the activism and...,1
4,donate to my mom lol xb,1
5,basically anything thats aimed at raising awre...,1
6,thats not what were referring to by activism e...,1
7,how does helping words make a person more depr...,1
8,not sure how this is relevant,1
9,the mod who spends hours of their own personal...,1


In [None]:
non_depressed_df = pd.DataFrame(non_depressed_comments_cleaned)
non_depressed_df.columns = ['text']
non_depressed_df['label'] = 0
non_depressed_df

Unnamed: 0,text,label
0,his dad developed huntingtons and his mom left...,0
1,got back together with him after years apart ...,0
2,she met someone like a decade ago and theyve b...,0
3,she is a therapist and happily married,0
4,she was an exchange student who i was absolute...,0
5,she died far too young i taught hockey in can...,0
6,recently found out i was the high school crush...,0
7,i have two i havent kept up with them but this...,0
8,shes sprialing out of control due to severe me...,0
9,shes married two kids i think shes a stay at h...,0


In [None]:
df = pd.concat([depressed_df, non_depressed_df]).reset_index(drop=True)
df

Unnamed: 0,text,label
0,sorry if im stupid but whats activism i dont w...,1
1,ah thank you usually i just see it as a load o...,1
2,lol somebody in my town thought that day was a...,1
3,i really appreciate this post the activism and...,1
4,donate to my mom lol xb,1
5,basically anything thats aimed at raising awre...,1
6,thats not what were referring to by activism e...,1
7,how does helping words make a person more depr...,1
8,not sure how this is relevant,1
9,the mod who spends hours of their own personal...,1


In [None]:
df = shuffle(df).reset_index(drop=True)
df

Unnamed: 0,text,label
0,slurping any drink if possible the punishment ...,0
1,a little bit of editing and formatting and you...,1
2,yes i go to work and am friendly and generally...,1
3,for some of us they never come,1
4,every day i just dont know what to do anymore,1
5,so true had to self rescue myself after i got ...,0
6,you can try doing new things that helped me a ...,1
7,gosh i cant imagine what it must be like to re...,0
8,psychadelics,0
9,went through this this time last year went hom...,1


In [None]:
vectorizer = TfidfVectorizer(min_df=2,norm='l2')

In [None]:
xy, test = train_test_split(df, test_size=0.2)
train, dev = train_test_split(xy, test_size=0.25)

In [None]:
print(len(train))
print(len(dev))
print(len(test))

143712
47904
47905


In [None]:
train

Unnamed: 0,text,label
223865,love,0
143701,im just like you it seems im having a tough re...,1
17413,are you familiar with author david sedaris he ...,1
195505,to not be such a dick all the time,0
96085,murderball,0
44331,wishing you so much peace and love my dear its...,1
118317,yeah its like in the godfather vito wanted mic...,0
21798,may i come in,0
170570,being broke,0
126183,l theanine supplements daily,0


In [None]:
test

Unnamed: 0,text,label
170591,ignore it and do whatever you do on any other day,0
66631,sega game gear with the tv tuner,0
78591,living alone is amazing i could cut my rent in...,0
231544,tbh cant even remember who i was before i star...,1
25639,they dont think it is weird that you go alone ...,0
113326,yes anyone who says its the cowardly way out i...,1
235330,thats depressing lol,1
4711,wow you are the exact opposite to me i feel th...,0
29363,i used to say to myself always consult the tr...,0
69227,i do this all the time along with critical sel...,1


In [None]:
X_train = train['text']
y_train = train['label']
X_dev = dev['text']
y_dev = dev['label']
X_test = test['text']
y_test = test['label']

In [None]:
Xv_train = vectorizer.fit_transform(X_train)
Xv_dev = vectorizer.transform(X_dev)
Xv_test = vectorizer.transform(X_test)

In [None]:
"""
C_param = [0.1,0.25,0.3,0.4,0.5,1,10,100]
for c in C_param:
    print("C: ", c)
    lr = LogisticRegression(C=c)
    lr.fit(Xv_train, y_train)
    preds = lr.predict(Xv_dev)
    print("Acc ", metrics.accuracy_score(y_dev, preds))
    print("Precision ", metrics.precision_score(y_dev, preds))
    print("Recall ", metrics.recall_score(y_dev, preds))
    print("F1 score ", metrics.f1_score(y_dev, preds))
    print(metrics.classification_report(y_dev, preds))
"""

'\nC_param = [0.1,0.25,0.3,0.4,0.5,1,10,100]\nfor c in C_param:\n    print("C: ", c)\n    lr = LogisticRegression(C=c)\n    lr.fit(Xv_train, y_train)\n    preds = lr.predict(Xv_dev)\n    print("Acc ", metrics.accuracy_score(y_dev, preds))\n    print("Precision ", metrics.precision_score(y_dev, preds))\n    print("Recall ", metrics.recall_score(y_dev, preds))\n    print("F1 score ", metrics.f1_score(y_dev, preds))\n    print(metrics.classification_report(y_dev, preds))\n'

In [None]:
"""
C_param = [0.1,0.25,0.3,0.4,0.5,1,10,100]
for c in C_param:
    print("C: ", c)
    model = svm.LinearSVC(C=c)
    model.fit(Xv_train, y_train)
    preds = model.predict(Xv_dev)
    print("Acc ", metrics.accuracy_score(y_dev, preds))
    print("Precision ", metrics.precision_score(y_dev, preds))
    print("Recall ", metrics.recall_score(y_dev, preds))
    print("F1 score ", metrics.f1_score(y_dev, preds))
    print(metrics.classification_report(y_dev, preds))
"""

'\nC_param = [0.1,0.25,0.3,0.4,0.5,1,10,100]\nfor c in C_param:\n    print("C: ", c)\n    model = svm.LinearSVC(C=c)\n    model.fit(Xv_train, y_train)\n    preds = model.predict(Xv_dev)\n    print("Acc ", metrics.accuracy_score(y_dev, preds))\n    print("Precision ", metrics.precision_score(y_dev, preds))\n    print("Recall ", metrics.recall_score(y_dev, preds))\n    print("F1 score ", metrics.f1_score(y_dev, preds))\n    print(metrics.classification_report(y_dev, preds))\n'

In [None]:
lr = LogisticRegression(C=1)
lr.fit(Xv_train, y_train)
preds = lr.predict(Xv_test)
print("Acc ", metrics.accuracy_score(y_test, preds))
print("Precision ", metrics.precision_score(y_test, preds))
print("Recall ", metrics.recall_score(y_test, preds))
print("F1 score ", metrics.f1_score(y_test, preds))
print(metrics.classification_report(y_test, preds))




Acc  0.8476359461434089
Precision  0.8560567190633892
Recall  0.8133031995036781
F1 score  0.8341324849448927
              precision    recall  f1-score   support

           0       0.84      0.88      0.86     25339
           1       0.86      0.81      0.83     22566

   micro avg       0.85      0.85      0.85     47905
   macro avg       0.85      0.85      0.85     47905
weighted avg       0.85      0.85      0.85     47905



In [None]:
model = svm.LinearSVC(C=0.25)
model.fit(Xv_train, y_train)
preds = model.predict(Xv_test)
print("Acc ", metrics.accuracy_score(y_test, preds))
print("Precision ", metrics.precision_score(y_test, preds))
print("Recall ", metrics.recall_score(y_test, preds))
print("F1 score ", metrics.f1_score(y_test, preds))
print(metrics.classification_report(y_test, preds))

Acc  0.8497442855651811
Precision  0.8541340215688081
Recall  0.8212798014712399
F1 score  0.8373847822157962
              precision    recall  f1-score   support

           0       0.85      0.88      0.86     25339
           1       0.85      0.82      0.84     22566

   micro avg       0.85      0.85      0.85     47905
   macro avg       0.85      0.85      0.85     47905
weighted avg       0.85      0.85      0.85     47905



In [None]:
# gaussian kernel
"""
C_param = [0.5,1,10,100]
for c in C_param:
    print("C: ", c)
    model = svm.SVC(kernel="rbf", C=c)
    model.fit(Xv_train, y_train)
    preds = model.predict(Xv_dev)
    print("Acc ", metrics.accuracy_score(y_dev, preds))
    print("Precision ", metrics.precision_score(y_dev, preds))
    print("Recall ", metrics.recall_score(y_dev, preds))
    print("F1 score ", metrics.f1_score(y_dev, preds))
    print(metrics.classification_report(y_dev, preds))
"""

C:  0.5




In [None]:
#Xv_train.to_pickle('Xv_train.pickle')
#Xv_test.to_pickle('Xv_test.pickle')
#y_train.to_pickle('y_train.pickle')
#y_test.to_pickle('y_test.pickle')