In [2]:
import pandas as pd
from collections import Counter
import numpy as np
import math
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/justinchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
df = pd.read_csv('downloads/the-reddit-covid-dataset-comments.csv', nrows = 1000000)

In [7]:
dataset = pd.read_csv('/Users/justinchen/Documents/mental_health_subreddits_dataset.csv')

In [8]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,subreddit.name,created_utc,body,sentiment
45880,17777094,anxiety,1581466598,Stay focused on what the experts are saying an...,0.743
45881,17777113,anxiety,1581463547,"No new COVID-19 cases in B.C., 4 confirmed pat...",-0.1531
45882,17777197,mentalhealth,1581454025,"Hi mate, as someone with GAD I can relate to y...",0.5332
45883,17777368,anxiety,1581071322,***MORE UPDATES ON MY OTHER COMMENT ON THIS T...,0.9325
45884,17777471,technology,1326993659,If you happen to be at ISE this year stop by b...,-0.296


In [36]:
sample = dataset.sample(20000)
sample.head()

Unnamed: 0.1,Unnamed: 0,subreddit.name,created_utc,body,sentiment
22479,10870274,anxiety,1605299828,https://www.nbcnews.com/health/health-news/inh...,0.8735
15770,8376619,anxiety,1611876730,I'm sorry you and your husband have to endure ...,0.6846
35786,14961040,depression,1593060179,Before covid I actually tutored! I would recom...,0.9798
10364,6009827,depression,1619302494,Covid is also wreaking havoc in my family. Don...,-0.8525
42096,16599696,anxiety,1587154084,"Hey, I'm new to reddit. This is the first post...",-0.9932


In [37]:
count = Counter()
for index, row in sample.iterrows():
    if row['subreddit.name'] == 'anxiety':
        count['anxiety'] += 1
    if row['subreddit.name'] == 'depression':
        count['depression'] += 1
    if row['subreddit.name'] == 'suicidewatch':
        count['SuicideWatch'] += 1
    if row['subreddit.name'] == 'mentalhealth':
        count['mentalhealth'] += 1
    if row['subreddit.name'] == 'covid19_support':
        count['COVID19_support'] += 1
count

Counter({'anxiety': 7265,
         'depression': 2606,
         'COVID19_support': 6904,
         'SuicideWatch': 1761,
         'mentalhealth': 1463})

## Preprocessing 

In [38]:
import re #regex library used for substituting words

In [39]:
#removing stopwords and urls, stemming, making lowercase 
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

def cleaning(text, stem=False):
    text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [40]:
def post_cleaning(text):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ',text)

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    
    #remove digits
    processed_feature = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", processed_feature)

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    return processed_feature

In [41]:
#above functions combined
def preprocessing(text):
    line = cleaning(text)
    line = post_cleaning (line)
    return line

In [42]:
sample.body = sample.body.apply(lambda x: preprocessing(x))

In [43]:
sample.head()

Unnamed: 0.1,Unnamed: 0,subreddit.name,created_utc,body,sentiment
22479,10870274,anxiety,1605299828,inhaled medicine may help covid patients inhal...,0.8735
15770,8376619,anxiety,1611876730,sorry husband endure pandemic affected us quit...,0.6846
35786,14961040,depression,1593060179,covid actually tutored would recommend past al...,0.9798
10364,6009827,depression,1619302494,covid also wreaking havoc family friends every...,-0.8525
42096,16599696,anxiety,1587154084,hey new reddit first post responding bc really...,-0.9932


In [44]:
for sentiment in sample['sentiment']:
    if sentiment > 0:
        sample['sentiment'].replace({sentiment : 'positive'}, inplace=True)
    if sentiment < 0:
        sample['sentiment'].replace({sentiment : 'negative'}, inplace=True)
    else:
        sample['sentiment'].replace({sentiment : 'neutral'}, inplace=True)

In [20]:
sample.head()

Unnamed: 0.1,Unnamed: 0,subreddit.name,created_utc,body,sentiment
10569,6124604,anxiety,1618972628,thing blood test elevated white blood cell cou...,positive
2580,1711492,covid19_support,1631034354,hard truth vaccinated people safe covid dont n...,positive
22353,10831585,suicidewatch,1605404618,jc someone age hope deal shit congrats making ...,positive
2092,1343197,anxiety,1631738650,got second dose weeks ago deal pretty bad anxi...,positive
3539,2324883,covid19_support,1629913745,gt worried depressed fully vaccinated feel lik...,negative


In [45]:
from sklearn.model_selection import train_test_split

In [46]:
 # Splits Dataset into Training and Testing set
train_size = 0.8
train_data, test_data = train_test_split(sample, test_size=1-train_size, random_state=42, shuffle=True)
print("Train Data size:", len(train_data))
print("Test Data size", len(test_data))

Train Data size: 16000
Test Data size 4000


## Vecotrizing

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
vectorizer = TfidfVectorizer(max_features=8000,
                             min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
vectorizer.fit(train_data.body.to_list())
x_train = vectorizer.transform(train_data.body.to_list())
x_test = vectorizer.transform(test_data.body.to_list())

In [48]:
print("x_train Shape:",x_train.shape)
print("x_test Shape:",x_test.shape)

x_train Shape: (16000, 8000)
x_test Shape: (4000, 8000)


In [32]:
from sklearn.preprocessing import LabelEncoder

In [49]:
encoder = LabelEncoder() #https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
encoder.fit(train_data.sentiment.to_list())

y_train = encoder.transform(train_data.sentiment.to_list())
y_test = encoder.transform(test_data.sentiment.to_list())

y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

y_train shape: (16000, 1)
y_test shape: (4000, 1)


## Linear SVM Model

In [34]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

In [50]:
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(x_train, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(x_test)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

target_names = ['positive', 'neutral', 'negative']

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))

report = classification_report(y_test, prediction_linear, target_names=target_names, output_dict=True)

print('positive: ', report['positive'])
print('negative: ', report['negative'])
print('neutral: ', report['neutral'])

  return f(**kwargs)


Training time: 60.245207s; Prediction time: 13.050862s
positive:  {'precision': 0.7630685675492193, 'recall': 0.7341606792945787, 'f1-score': 0.7483355525965381, 'support': 1531}
negative:  {'precision': 0.8181089743589743, 'recall': 0.8775247099269445, 'f1-score': 0.84677586564379, 'support': 2327}
neutral:  {'precision': 0.8709677419354839, 'recall': 0.19014084507042253, 'f1-score': 0.3121387283236994, 'support': 142}
