In [26]:
import pickle
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.svm as svm
import sklearn.metrics as metrics
import numpy as np

In [2]:
depressed_pickle = open("depressed.pickle","rb")
depressed_dict = pickle.load(depressed_pickle)
depressed_df = pd.DataFrame(depressed_dict)
depressed_df.drop_duplicates(subset=['id'], inplace=True)
print("test Total number of comments: ", sum(depressed_df['num_comments']))

test Total number of comments:  121355


In [3]:
non_depressed_pickle = open("non_depressed.pickle","rb")
non_depressed_dict = pickle.load(non_depressed_pickle)
non_depressed_df = pd.DataFrame(non_depressed_dict)
non_depressed_df.drop_duplicates(subset=['id'], inplace=True)
print("test Total number of comments: ", sum(non_depressed_df['num_comments']))

test Total number of comments:  178277


In [4]:
depressed_comments_arr = depressed_df['comments'].tolist()
non_depressed_comments_arr = non_depressed_df['comments'].tolist()

In [5]:
depressed_comments = [comment for post in depressed_df['comments'] for comment in post]
non_depressed_comments = [comment for post in non_depressed_df['comments'] for comment in post]

In [6]:
print("depressed length ", len(depressed_comments))
print("non depressed length ", len(non_depressed_comments))

depressed length  116900
non depressed length  127911


In [7]:
def clean_one_comment(comment):
    comment = comment.lower()
    # remove /r/, /u/, numbers
    comment = re.sub(r"(\/r\/)|(\/u\/)|(\d+)","",comment)
    # remove punctuation
    comment = comment.translate(str.maketrans('','', string.punctuation))
    # replacing ' 
    comment = comment.replace('’','')
    comment = comment.replace('\n'," ")
    # remove trailing whitespace
    comment = comment.strip()
    return comment


In [8]:
def clean_comments(comments):
    removed_comments = ['[deleted]', '[removed]']
    filtered_comments = [comment for comment in comments if comment not in removed_comments]
    comments_clean = list()
    for comment in filtered_comments:
        comment_new = clean_one_comment(comment)
        comments_clean.append(comment_new)
    return comments_clean

In [9]:
#test_comments = depressed_comments[:10]
#print(test_comments)
print('orig length: ', len(depressed_comments))
print(depressed_comments[:10])
depressed_comments_cleaned = clean_comments(depressed_comments)
print('cleaned length: ', len(depressed_comments_cleaned))
#print(depressed_comments_cleaned)

orig length:  116900
["Sorry if I'm stupid, but what's activism? I don't want to accidentally break the rules", 'Ah thank you, usually I just see it as a load of horsecrap of people wanting to feel good by repeating lines of comfort words to a person and thinking they did some good not knowing that they made a depressed person lose eveb more hope and reason to live with their plastic character. Some genuinely care but not all.', 'Lol somebody in my town thought that day was a good day for suicide good job. Small town of like 14k citizens 5 suicides under 18 in 1 year. Thinking about joining the club as well.', 'I really appreciate this post. The "activism" and "awareness" days are honestly so triggering for me that I\'d like to pretend they don\'t exist.', 'donate to my mom lol\n\n&#x200B;', '[removed]', '[deleted]', '[deleted]', '[deleted]', "Basically anything that's aimed at raising awreness (or money) among the general public.  Totally out of place in a support community.  Our prob

In [10]:
#print('orig length: ', len(non_depressed_comments))
non_depressed_comments_cleaned = clean_comments(non_depressed_comments)
print('cleaned length: ', len(non_depressed_comments_cleaned))
#print(depressed_comments_cleaned)

cleaned length:  127559


In [11]:
depressed_df = pd.DataFrame(depressed_comments_cleaned)
depressed_df.columns = ['text']
depressed_df['label'] = 1
depressed_df

Unnamed: 0,text,label
0,sorry if im stupid but whats activism i dont w...,1
1,ah thank you usually i just see it as a load o...,1
2,lol somebody in my town thought that day was a...,1
3,i really appreciate this post the activism and...,1
4,donate to my mom lol xb,1
5,basically anything thats aimed at raising awre...,1
6,thats not what were referring to by activism e...,1
7,how does helping words make a person more depr...,1
8,not sure how this is relevant,1
9,the mod who spends hours of their own personal...,1


In [12]:
non_depressed_df = pd.DataFrame(non_depressed_comments_cleaned)
non_depressed_df.columns = ['text']
non_depressed_df['label'] = 0
non_depressed_df

Unnamed: 0,text,label
0,his dad developed huntingtons and his mom left...,0
1,got back together with him after years apart ...,0
2,she met someone like a decade ago and theyve b...,0
3,she is a therapist and happily married,0
4,she was an exchange student who i was absolute...,0
5,she died far too young i taught hockey in can...,0
6,recently found out i was the high school crush...,0
7,i have two i havent kept up with them but this...,0
8,shes sprialing out of control due to severe me...,0
9,shes married two kids i think shes a stay at h...,0


In [13]:
df = pd.concat([depressed_df, non_depressed_df]).reset_index(drop=True)
df

Unnamed: 0,text,label
0,sorry if im stupid but whats activism i dont w...,1
1,ah thank you usually i just see it as a load o...,1
2,lol somebody in my town thought that day was a...,1
3,i really appreciate this post the activism and...,1
4,donate to my mom lol xb,1
5,basically anything thats aimed at raising awre...,1
6,thats not what were referring to by activism e...,1
7,how does helping words make a person more depr...,1
8,not sure how this is relevant,1
9,the mod who spends hours of their own personal...,1


In [14]:
df = shuffle(df).reset_index(drop=True)
df

Unnamed: 0,text,label
0,i just call that weekends,1
1,well thats not right either im sorry its very ...,1
2,educating youth on nutritional eating habits a...,0
3,hello notamerican how is the weather from your...,0
4,just started using it as im tired of sending p...,0
5,scaphism long story short someone is tied betw...,0
6,if i knew that id be a stalker,0
7,christmas is hard work for sure but remember t...,1
8,free free free free,0
9,because they have free moviesps games and swit...,0


In [15]:
vectorizer = TfidfVectorizer(min_df=1,norm='l2')

In [16]:
train, test = train_test_split(df, test_size=0.2)

In [17]:
train

Unnamed: 0,text,label
57114,just rewatched the episodes after reading your...,1
224081,i wish i could go back to thats the last time...,1
110589,air quotes are a pet peeve of mine i once stop...,0
222230,dont be afraid to try out psychedelics i feel ...,0
85094,i have struggled with severe depression all of...,1
114964,same name as a family member and being a cigar...,0
120209,being the only boy after sister i never got a...,0
40276,i still keep a road atlas in my car ive had t...,0
144815,im glad im stupid so i never have this kind of...,0
144785,i dont know i think hed be a lot happier with ...,1


In [18]:
test

Unnamed: 0,text,label
119883,man i cant wait to be dead i hate the act of e...,1
45734,golf good clubs and good courses college aint ...,0
11714,thats probably what keeps me going,0
151749,step by step new kid on the block once in a l...,0
7857,i have a unique way of speaking apparently and...,0
199070,you could buy points for cents a piece i can...,0
115776,wyoming,0
43770,yeai do this a lot,1
123795,figuring out which tone i should say “no thank...,0
167442,theyve never washed dishes in their life,0


In [19]:
X_train = train['text']
y_train = train['label']
X_test = test['text']
y_test = test['label']

In [20]:
Xv_train = vectorizer.fit_transform(X_train)
Xv_test = vectorizer.transform(X_test)


In [21]:
C_param = [0.01,0.1,0.5,1,10,100]
for c in C_param:
    print("C: ", c)
    lr = LogisticRegression(C=c)
    lr.fit(Xv_train, y_train)
    preds = lr.predict(Xv_test)
    print(metrics.accuracy_score(y_test, preds))

C:  0.01




0.7929652437115123
C:  0.1
0.8312702223149985
C:  0.5
0.8449431165849076
C:  1
0.8494102912013359
C:  10
0.8496399123264795
C:  100
0.835820895522388


In [30]:
C_param = [0.3,0.4,0.5,1,10]
for c in C_param:
    print("C: ", c)
    model = svm.LinearSVC(C=c)
    model.fit(Xv_train, y_train)
    preds = model.predict(Xv_test)
    print(metrics.accuracy_score(y_test, preds))

C:  0.3
0.8509550151341196
C:  0.4
0.8513098841457051
C:  0.5
0.8509967644296003
C:  1
0.8482621855756184
C:  10
0.8326479490658595
