In [62]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# nlp vectorizers

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
import gensim

# model and metrics

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.preprocessing import StandardScaler

# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [None]:
# read in the dataset

df = pd.read_json('../data/cleaned/json.json')

rv1 = pd.read_csv('../data/cleaned/review1.csv')
rv2 = pd.read_csv('../data/cleaned/review2.csv')

In [3]:
# combine the reviewed dataset

labeled = pd.concat([rv1, rv2])

# set the index

labeled.set_index('id', inplace = True)

labeled.head(3)

Unnamed: 0_level_0,post,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1428,How do you guys feel less dead inside? I've go...,1
1429,i want to get help but i don’t know how my par...,1
1430,I can’t stop myself from loving this fictional...,0


In [4]:
# merge the labeled data to main dataframe
# left join on the post text: every row of df is kept and posts that were not reviewed get a null label
# NOTE(review): if the same post text occurs more than once in the review files this join
# would duplicate rows of df — confirm 'post' is unique in `labeled`

df = pd.merge(df, labeled, how = 'left', on = 'post')

In [5]:
# check for nulls

df.isnull().sum()

subreddit            0
author               0
date                 0
post                 0
covid_related        0
suicidal             0
alc_abuse            0
loneliness           0
stress               0
n_words              0
n_sentences          0
lemmatized           0
vectors              0
label            99208
dtype: int64

In [6]:
# mark unreviewed posts with -1 (not 99) so they can be told apart from the reviewed labels;
# -1 is also the value scikit-learn's semi-supervised estimators treat as "unlabeled"
# only the merged 'label' column contains nulls, so only that column is filled

df['label'] = df['label'].fillna(-1)

In [7]:
df.isnull().sum()

subreddit        0
author           0
date             0
post             0
covid_related    0
suicidal         0
alc_abuse        0
loneliness       0
stress           0
n_words          0
n_sentences      0
lemmatized       0
vectors          0
label            0
dtype: int64

In [8]:
# recode the review value 9 as 0 in the label column only; a frame-wide replace would
# also turn genuine 9s in numeric columns such as n_words / n_sentences into 0
# NOTE(review): presumably 9 was an "unsure" code used by the reviewers — confirm

df['label'] = df['label'].replace({9: 0})

In [9]:
# partition the frame: rows still carrying the -1 placeholder vs. rows with a reviewed label

is_unlabeled = df['label'].eq(-1)

unlabel = df[is_unlabeled]
label = df[~is_unlabeled]

In [10]:
## pilot set used to try the model out before the full run
# every reviewed post plus a reproducible draw of 500 unreviewed posts

sampled = unlabel.sample(n = 500, random_state = 2020)

pilot_parts = [label, sampled]
df = pd.concat(pilot_parts, axis = 0)

print(df.shape)
df.head(5)

(3329, 14)


Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,label
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -...",0.0
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,0,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000...",0.0


In [11]:
# check the distribution of subreddits
# labeled dataset contains 500 from suicide, 500 from depression, 1428 from alcoholism, 390 from bipolarreddit

df.groupby('subreddit')['subreddit'].count()

subreddit
alcoholism       1432
anxiety            83
bipolarreddit     394
depression        705
healthanxiety      10
lonely             39
mentalhealth       54
suicidewatch      612
Name: subreddit, dtype: int64

In [12]:
# share of suicidal labels among the reviewed posts: suicidewatch and depression pooled,
# then each of the two on its own, then the whole reviewed set for comparison

for subs in (['suicidewatch', 'depression'], ['suicidewatch'], ['depression']):
    print(label.loc[label['subreddit'].isin(subs), ['label']].mean())
print()
print(label['label'].mean())

label    0.449751
dtype: float64
label    0.650099
dtype: float64
label    0.249004
dtype: float64

0.17497348886532343


In [13]:
# set up X and y

X = df['lemmatized']
y = df['label']

In [14]:
len(y)

3329

In [15]:
y.unique()

array([ 0.,  1., -1.])

### TF-IDF Vectorizer with Parameters:
#### max_features = 256, max_df = 0.8, min_df = 10, ngram_range = (1, 1)

In [16]:
# update stop_words for TF-IDF vectorizer

remove = ['-pron-', 'feel', 'know', 'want', 'life', 'go', 'think', 'make', 'people', 'really', 'even', 'much', 'now', 
          'pron', 'don', 'will', 'try', 'talk', 'friends', 'tell', 'just', 'like', 'time', 'want', 'well', 'thing', 'day',
          'friend', 'help', 'year', 'bad', 've', 'say', 'good', 'need', 'way', 'right', 'month', 'amp', 'x200b']

my_stop_words = text.ENGLISH_STOP_WORDS.union(remove)

In [17]:
# instantiate Tf-IDF

tvec = TfidfVectorizer(stop_words = my_stop_words, max_df = .80, min_df = 10, max_features = 256, ngram_range = (1, 1))

In [18]:
# fit/transform X then save it to a dataframe

tfidf_matrix = tvec.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old accessor on older installations
feature_names = (tvec.get_feature_names_out() if hasattr(tvec, 'get_feature_names_out')
                 else tvec.get_feature_names())

t = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)
t.head()

Unnamed: 0,10,20,30,aa,able,abuse,act,actually,addiction,advice,...,wish,withdrawal,wonder,work,world,worried,worry,write,wrong,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095629,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.278592,0.0,0.0,0.0,0.0,0.0,0.209062,...,0.0,0.2621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# fit a LabelSpreading model (default hyperparameters) on the pilot set and predict the same rows
# note: the reviewed rows scored afterwards are rows the model was fitted on, so those
# scores show how well the given labels are retained, not held-out accuracy

model = LabelSpreading().fit(t, y)
pred = np.asarray(model.predict(t))

In [20]:
# check the length of both label and prediction

print(len(y))
print(len(pred))

3329
3329


In [21]:
# create a dataframe just with label and prediction

d = {'label': y, 'pred': pred}

p = pd.DataFrame(d)

In [22]:
# create a separate dataframe to check the accuracy of the model with the labeled data

cf = p[p['label'] != -1]

In [23]:
# confusion matrix normalised over the true labels, so the four values are
# per-class rates rather than counts

rates = confusion_matrix(cf['label'], cf['pred'], normalize = 'true')
(tn, fp), (fn, tp) = rates

In [24]:
# check the values from confusion matrix

print(f'True Negative: {tn}')
print(f'False Positive: {fp}')
print(f'False Negative: {fn}')
print(f'True Positive: {tp}')

True Negative: 1.0
False Positive: 0.0
False Negative: 0.006060606060606061
True Positive: 0.9939393939393939


In [25]:
# classification report

print(classification_report(cf['label'], cf['pred'], target_names = ['non-suicidal', 'suicidal']))

              precision    recall  f1-score   support

non-suicidal       1.00      1.00      1.00      2334
    suicidal       1.00      0.99      1.00       495

    accuracy                           1.00      2829
   macro avg       1.00      1.00      1.00      2829
weighted avg       1.00      1.00      1.00      2829



In [26]:
# check pred for unlabeled to make sure it only contains 0 and 1

cf_u = p[p['label'] == -1]

set(cf_u['pred'])

{0.0, 1.0}

In [27]:
# cut the unreviewed rows into consecutive chunks of at most n rows (the last one may be shorter)

n = 5000

starts = range(0, len(unlabel), n)
list_df = [unlabel.iloc[start: start + n] for start in starts]

len(list_df)

20

In [28]:
# create an empty list to store the predictions for unlabeled data

y_pred = []

# iterate through sectioned dataframe

for chunk in list_df:

    # combine the labeled data and the current chunk of unlabeled data

    df = pd.concat([label, chunk])

    # set X and y

    X = df['lemmatized']
    y = df['label']

    # vectorized values to a dataframe; the vectorizer is re-fitted on every batch
    # get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement when present

    tfidf_matrix = tvec.fit_transform(X)
    feature_names = (tvec.get_feature_names_out() if hasattr(tvec, 'get_feature_names_out')
                     else tvec.get_feature_names())
    t = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)

    # instantiate the model, then fit and predict on the same batch

    model = LabelSpreading()
    model.fit(t, y)
    pred = np.asarray(model.predict(t))

    # keep only the predictions for rows that entered the batch unlabeled (-1);
    # they are appended in chunk order, which is the row order of `unlabel`

    y_pred.extend(pred[y.eq(-1).to_numpy()])

# every unlabeled row must have received exactly one prediction

assert len(y_pred) == len(unlabel), 'prediction count does not match the unlabeled rows'

In [29]:
# create a column to store prediction
# `unlabel` is a slice of the merged frame, so build a new frame with assign() instead of
# writing a column into the slice (which raises SettingWithCopyWarning)

unlabel = unlabel.assign(pred = y_pred)

In [30]:
# check the null values - 'pred' column should have 2829 rows of null

pd.concat([label, unlabel]).isnull().sum()

subreddit           0
author              0
date                0
post                0
covid_related       0
suicidal            0
alc_abuse           0
loneliness          0
stress              0
n_words             0
n_sentences         0
lemmatized          0
vectors             0
label               0
pred             2829
dtype: int64

In [31]:
# combine labeled and unlabeled dataframes

df = pd.concat([label, unlabel])

In [32]:
# fill null values in pred with label values (reviewed rows have no prediction)
# the result is assigned back explicitly: fillna(inplace = True) on a selected column
# acts on a temporary under pandas copy-on-write and would leave df unchanged

df['pred'] = df['pred'].fillna(df['label'])

In [33]:
# check for null

df.isnull().sum()

subreddit        0
author           0
date             0
post             0
covid_related    0
suicidal         0
alc_abuse        0
loneliness       0
stress           0
n_words          0
n_sentences      0
lemmatized       0
vectors          0
label            0
pred             0
dtype: int64

In [34]:
# drop label column

df.drop(columns = ['label'], axis = 1, inplace = True)

df.head()

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,pred
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -...",0.0
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,0,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000...",0.0


In [35]:
# rename column pred as label

df.rename(columns = {'pred': 'label'}, inplace = True)

In [36]:
# check

df.head(3)

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,label
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0


In [37]:
# change label column to int

df['label'] = df['label'].astype(int)

In [38]:
# check the label

df['label'].mean()

0.20530787851465646

In [39]:
# check each subreddit how they are labeled

df.groupby('subreddit')[['label']].mean()

Unnamed: 0_level_0,label
subreddit,Unnamed: 1_level_1
alcoholism,0.006285
anxiety,0.107574
bipolarreddit,0.074561
depression,0.217601
healthanxiety,0.055624
lonely,0.175487
mentalhealth,0.151621
suicidewatch,0.339623


In [40]:
# write the fully labelled frame to disk, leaving out the 'vectors' column

labeled = df.drop(columns = 'vectors')
labeled.to_csv('../data/cleaned/labeled.csv', index = False)

#### Try Scaling the data

In [63]:
# read in the dataset

df = pd.read_json('../data/cleaned/json.json')

rv1 = pd.read_csv('../data/cleaned/review1.csv')
rv2 = pd.read_csv('../data/cleaned/review2.csv')

In [64]:
# combine the reviewed dataset

label = pd.concat([rv1, rv2])

# set the index

label.set_index('id', inplace = True)

label.head(3)

Unnamed: 0_level_0,post,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1428,How do you guys feel less dead inside? I've go...,1
1429,i want to get help but i don’t know how my par...,1
1430,I can’t stop myself from loving this fictional...,0


In [65]:
# merge the labeled data to main dataframe

df = pd.merge(df, label, how = 'left', on = 'post')

In [66]:
# check for nulls

df.isnull().sum()

subreddit            0
author               0
date                 0
post                 0
covid_related        0
suicidal             0
alc_abuse            0
loneliness           0
stress               0
n_words              0
n_sentences          0
lemmatized           0
vectors              0
label            99208
dtype: int64

In [67]:
# mark unreviewed posts with -1 (not 99) so they can be told apart from the reviewed labels;
# -1 is also the value scikit-learn's semi-supervised estimators treat as "unlabeled"
# only the merged 'label' column contains nulls, so only that column is filled

df['label'] = df['label'].fillna(-1)

In [68]:
df.isnull().sum()

subreddit        0
author           0
date             0
post             0
covid_related    0
suicidal         0
alc_abuse        0
loneliness       0
stress           0
n_words          0
n_sentences      0
lemmatized       0
vectors          0
label            0
dtype: int64

In [69]:
# recode the review value 9 as 0 in the label column only; a frame-wide replace would
# also turn genuine 9s in numeric columns such as n_words / n_sentences into 0
# NOTE(review): presumably 9 was an "unsure" code used by the reviewers — confirm

df['label'] = df['label'].replace({9: 0})

In [70]:
# divide the datasets to labeled and not labeled

unlabel = df[df['label'] == -1]
label = df[df['label'] != -1]

In [71]:
## choose a small portion of samples including labeled data to test the model
# combine labeled data and sampled unlabeled data for test

sampled = unlabel.sample(n = 500, random_state = 2020)

df = pd.concat([label, sampled])

print(df.shape)
df.head()

(3329, 14)


Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,label
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -...",0.0
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,0,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000...",0.0


In [72]:
# check the distribution of subreddits
# labeled dataset contains 500 from suicide, 500 from depression, 1428 from alcoholism, 390 from bipolarreddit

df.groupby('subreddit')['subreddit'].count()

subreddit
alcoholism       1432
anxiety            83
bipolarreddit     394
depression        705
healthanxiety      10
lonely             39
mentalhealth       54
suicidewatch      612
Name: subreddit, dtype: int64

In [73]:
# most of the suicidal posts from labeled data came from suicide watch and depression - check

print(label[(label['subreddit'] == 'suicidewatch') | (label['subreddit'] == 'depression')][['label']].mean())
print(label[label['subreddit'] == 'suicidewatch'][['label']].mean())
print(label[label['subreddit'] == 'depression'][['label']].mean())
print()
print(label['label'].mean())

label    0.449751
dtype: float64
label    0.650099
dtype: float64
label    0.249004
dtype: float64

0.17497348886532343


In [74]:
# set up X and y

X = df['lemmatized']
y = df['label']

In [75]:
len(y)

3329

In [76]:
y.unique()

array([ 0.,  1., -1.])

In [77]:
# instantiate Tf-IDF

tvec = TfidfVectorizer(stop_words = my_stop_words, max_df = .80, min_df = 10, max_features = 256, ngram_range = (1, 1))

In [78]:
# fit/transform X then save it to a dataframe

tfidf_matrix = tvec.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old accessor on older installations
feature_names = (tvec.get_feature_names_out() if hasattr(tvec, 'get_feature_names_out')
                 else tvec.get_feature_names())

t = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)
t.head()

Unnamed: 0,10,20,30,aa,able,abuse,act,actually,addiction,advice,...,wish,withdrawal,wonder,work,world,worried,worry,write,wrong,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095629,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.278592,0.0,0.0,0.0,0.0,0.0,0.209062,...,0.0,0.2621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# instantiate standard scaler and fit/transform

sc = StandardScaler()
t = sc.fit_transform(t)

In [80]:
# fit a LabelSpreading model (default hyperparameters) on the scaled pilot set and predict the same rows
# note: the reviewed rows scored afterwards are rows the model was fitted on, so those
# scores show how well the given labels are retained, not held-out accuracy

model = LabelSpreading().fit(t, y)
pred = np.asarray(model.predict(t))

In [81]:
# check the length of both label and prediction

print(len(y))
print(len(pred))

3329
3329


In [82]:
# create a dataframe just with label and prediction

d = {'label': y, 'pred': pred}

p = pd.DataFrame(d)

In [83]:
# create a separate dataframe to check the accuracy of the model with the labeled data

cf = p[p['label'] != -1]

In [84]:
# get the values from confusion matrix

tn, fp, fn, tp = confusion_matrix(cf['label'], cf['pred'], normalize = 'true').ravel()

In [85]:
# check the values from confusion matrix

print(f'True Negative: {tn}')
print(f'False Positive: {fp}')
print(f'False Negative: {fn}')
print(f'True Positive: {tp}')

True Negative: 1.0
False Positive: 0.0
False Negative: 0.00404040404040404
True Positive: 0.9959595959595959


In [86]:
# classification report

print(classification_report(cf['label'], cf['pred'], target_names = ['non-suicidal', 'suicidal']))

              precision    recall  f1-score   support

non-suicidal       1.00      1.00      1.00      2334
    suicidal       1.00      1.00      1.00       495

    accuracy                           1.00      2829
   macro avg       1.00      1.00      1.00      2829
weighted avg       1.00      1.00      1.00      2829



In [87]:
# check pred for unlabeled to make sure it only contains 0 and 1

cf_u = p[p['label'] == -1]

set(cf_u['pred'])

{0.0, 1.0}

In [88]:
# split dataframe into relatively same size

n = 5000

list_df = [unlabel[i: (i + n)] for i in range(0, unlabel.shape[0], n)]

len(list_df)

20

In [90]:
# create an empty list to store the predictions for unlabeled data

y_pred = []

# iterate through sectioned dataframe

for chunk in list_df:

    # combine the labeled data and the current chunk of unlabeled data

    df = pd.concat([label, chunk])

    # set X and y

    X = df['lemmatized']
    y = df['label']

    # vectorized values to a dataframe; the vectorizer is re-fitted on every batch
    # get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement when present

    tfidf_matrix = tvec.fit_transform(X)
    feature_names = (tvec.get_feature_names_out() if hasattr(tvec, 'get_feature_names_out')
                     else tvec.get_feature_names())
    t = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)

    # standardise the TF-IDF columns; the scaler is re-fitted on every batch as well

    sc = StandardScaler()
    t = sc.fit_transform(t)

    # instantiate the model, then fit and predict on the same batch

    model = LabelSpreading()
    model.fit(t, y)
    pred = np.asarray(model.predict(t))

    # keep only the predictions for rows that entered the batch unlabeled (-1);
    # they are appended in chunk order, which is the row order of `unlabel`

    y_pred.extend(pred[y.eq(-1).to_numpy()])

# every unlabeled row must have received exactly one prediction

assert len(y_pred) == len(unlabel), 'prediction count does not match the unlabeled rows'

In [91]:
# create a column to store prediction
# `unlabel` is a slice of the merged frame, so build a new frame with assign() instead of
# writing a column into the slice (which raises SettingWithCopyWarning)

unlabel = unlabel.assign(pred = y_pred)

In [92]:
# check the null values - 'pred' column should have 2829 rows of null

pd.concat([label, unlabel]).isnull().sum()

subreddit           0
author              0
date                0
post                0
covid_related       0
suicidal            0
alc_abuse           0
loneliness          0
stress              0
n_words             0
n_sentences         0
lemmatized          0
vectors             0
label               0
pred             2829
dtype: int64

In [93]:
# combine labeled and unlabeled dataframes

df = pd.concat([label, unlabel])

In [94]:
# fill null values in pred with label values (reviewed rows have no prediction)
# the result is assigned back explicitly: fillna(inplace = True) on a selected column
# acts on a temporary under pandas copy-on-write and would leave df unchanged

df['pred'] = df['pred'].fillna(df['label'])

In [95]:
# check for null

df.isnull().sum()

subreddit        0
author           0
date             0
post             0
covid_related    0
suicidal         0
alc_abuse        0
loneliness       0
stress           0
n_words          0
n_sentences      0
lemmatized       0
vectors          0
label            0
pred             0
dtype: int64

In [96]:
# drop label column

df.drop(columns = ['label'], axis = 1, inplace = True)

df.head()

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,pred
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -...",0.0
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,0,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000...",0.0


In [97]:
# rename column pred as label

df.rename(columns = {'pred': 'label'}, inplace = True)

In [98]:
# check

df.head(3)

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,label
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0


In [99]:
# change label column to int

df['label'] = df['label'].astype(int)

In [100]:
# check the label

df['label'].mean()

0.006438840812646393

In [101]:
# check each subreddit how they are labeled

df.groupby('subreddit')[['label']].mean()

Unnamed: 0_level_0,label
subreddit,Unnamed: 1_level_1
alcoholism,0.006285
anxiety,0.000189
bipolarreddit,0.024123
depression,0.005101
healthanxiety,0.0
lonely,0.001257
mentalhealth,0.00074
suicidewatch,0.018588


### TF-IDF Vectorizer with Parameters:
#### max_features = 256, max_df = 0.8, min_df = 10, ngram_range = (2, 2)

In [102]:
# read in the dataset

df = pd.read_json('../data/cleaned/json.json')

rv1 = pd.read_csv('../data/cleaned/review1.csv')
rv2 = pd.read_csv('../data/cleaned/review2.csv')

In [103]:
# combine the reviewed dataset

label = pd.concat([rv1, rv2])

# set the index

label.set_index('id', inplace = True)

label.head(3)

Unnamed: 0_level_0,post,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1428,How do you guys feel less dead inside? I've go...,1
1429,i want to get help but i don’t know how my par...,1
1430,I can’t stop myself from loving this fictional...,0


In [104]:
# merge the labeled data to main dataframe

df = pd.merge(df, label, how = 'left', on = 'post')

In [105]:
# check for nulls

df.isnull().sum()

subreddit            0
author               0
date                 0
post                 0
covid_related        0
suicidal             0
alc_abuse            0
loneliness           0
stress               0
n_words              0
n_sentences          0
lemmatized           0
vectors              0
label            99208
dtype: int64

In [106]:
# mark unreviewed posts with -1 (not 99) so they can be told apart from the reviewed labels;
# -1 is also the value scikit-learn's semi-supervised estimators treat as "unlabeled"
# only the merged 'label' column contains nulls, so only that column is filled

df['label'] = df['label'].fillna(-1)

In [107]:
df.isnull().sum()

subreddit        0
author           0
date             0
post             0
covid_related    0
suicidal         0
alc_abuse        0
loneliness       0
stress           0
n_words          0
n_sentences      0
lemmatized       0
vectors          0
label            0
dtype: int64

In [108]:
# recode the review value 9 as 0 in the label column only; a frame-wide replace would
# also turn genuine 9s in numeric columns such as n_words / n_sentences into 0
# NOTE(review): presumably 9 was an "unsure" code used by the reviewers — confirm

df['label'] = df['label'].replace({9: 0})

In [109]:
# divide the datasets to labeled and not labeled

unlabel = df[df['label'] == -1]
label = df[df['label'] != -1]

In [110]:
## choose a small portion of samples including labeled data to test the model
# combine labeled data and sampled unlabeled data for test

sampled = unlabel.sample(n = 500, random_state = 2020)

df = pd.concat([label, sampled])

print(df.shape)
df.head()

(3329, 14)


Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,label
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -...",0.0
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,0,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000...",0.0


In [111]:
# check the distribution of subreddits
# labeled dataset contains 500 from suicide, 500 from depression, 1428 from alcoholism, 390 from bipolarreddit

df.groupby('subreddit')['subreddit'].count()

subreddit
alcoholism       1432
anxiety            83
bipolarreddit     394
depression        705
healthanxiety      10
lonely             39
mentalhealth       54
suicidewatch      612
Name: subreddit, dtype: int64

In [112]:
# most of the suicidal posts from labeled data came from suicide watch and depression - check

print(label[(label['subreddit'] == 'suicidewatch') | (label['subreddit'] == 'depression')][['label']].mean())
print(label[label['subreddit'] == 'suicidewatch'][['label']].mean())
print(label[label['subreddit'] == 'depression'][['label']].mean())
print()
print(label['label'].mean())

label    0.449751
dtype: float64
label    0.650099
dtype: float64
label    0.249004
dtype: float64

0.17497348886532343


In [113]:
# set up X and y

X = df['lemmatized']
y = df['label']

In [114]:
len(y)

3329

In [115]:
y.unique()

array([ 0.,  1., -1.])

In [116]:
# instantiate Tf-IDF

tvec = TfidfVectorizer(stop_words = my_stop_words, max_df = .80, min_df = 10, max_features = 256, ngram_range = (2, 2))

In [117]:
# fit/transform X then save it to a dataframe

tfidf_matrix = tvec.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old accessor on older installations
feature_names = (tvec.get_feature_names_out() if hasattr(tvec, 'get_feature_names_out')
                 else tvec.get_feature_names())

t = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)
t.head()

Unnamed: 0,10 ago,10 hour,100 mg,20 old,24 hour,aa meeting,admit problem,advice appreciate,ago start,alcohol abuse,...,weight gain,withdrawal symptom,work drink,work hard,work home,work hour,work week,work work,worth live,www reddit
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.358126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:
# fit a LabelSpreading model (default hyperparameters) on the bigram pilot set and predict the same rows
# note: the reviewed rows scored afterwards are rows the model was fitted on, so those
# scores show how well the given labels are retained, not held-out accuracy

model = LabelSpreading().fit(t, y)
pred = np.asarray(model.predict(t))

In [119]:
# check the length of both label and prediction

print(len(y))
print(len(pred))

3329
3329


In [120]:
# create a dataframe just with label and prediction

d = {'label': y, 'pred': pred}

p = pd.DataFrame(d)

In [121]:
# create a separate dataframe to check the accuracy of the model with the labeled data

cf = p[p['label'] != -1]

In [122]:
# get the values from confusion matrix

tn, fp, fn, tp = confusion_matrix(cf['label'], cf['pred'], normalize = 'true').ravel()

In [123]:
# check the values from confusion matrix

print(f'True Negative: {tn}')
print(f'False Positive: {fp}')
print(f'False Negative: {fn}')
print(f'True Positive: {tp}')

True Negative: 0.9875749785775493
False Positive: 0.012425021422450729
False Negative: 0.49696969696969695
True Positive: 0.503030303030303


In [124]:
# classification report

print(classification_report(cf['label'], cf['pred'], target_names = ['non-suicidal', 'suicidal']))

              precision    recall  f1-score   support

non-suicidal       0.90      0.99      0.94      2334
    suicidal       0.90      0.50      0.64       495

    accuracy                           0.90      2829
   macro avg       0.90      0.75      0.79      2829
weighted avg       0.90      0.90      0.89      2829



In [125]:
# check pred for unlabeled to make sure it only contains 0 and 1

cf_u = p[p['label'] == -1]

set(cf_u['pred'])

{0.0, 1.0}

In [126]:
# split dataframe into relatively same size

n = 5000

list_df = [unlabel[i: (i + n)] for i in range(0, unlabel.shape[0], n)]

len(list_df)

20

In [127]:
# create an empty list to store the predictions for unlabeled data

y_pred = []

# iterate through sectioned dataframe

for chunk in list_df:

    # combine the labeled data and the current chunk of unlabeled data

    df = pd.concat([label, chunk])

    # set X and y

    X = df['lemmatized']
    y = df['label']

    # vectorized values to a dataframe; the vectorizer is re-fitted on every batch
    # get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement when present

    tfidf_matrix = tvec.fit_transform(X)
    feature_names = (tvec.get_feature_names_out() if hasattr(tvec, 'get_feature_names_out')
                     else tvec.get_feature_names())
    t = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)

    # standardise the TF-IDF columns; the scaler is re-fitted on every batch as well
    # NOTE(review): the pilot fit earlier in this bigram section used unscaled TF-IDF
    # features, so this full run does not match the configuration that was evaluated —
    # confirm whether scaling is intended here

    sc = StandardScaler()
    t = sc.fit_transform(t)

    # instantiate the model, then fit and predict on the same batch

    model = LabelSpreading()
    model.fit(t, y)
    pred = np.asarray(model.predict(t))

    # keep only the predictions for rows that entered the batch unlabeled (-1);
    # they are appended in chunk order, which is the row order of `unlabel`

    y_pred.extend(pred[y.eq(-1).to_numpy()])

# every unlabeled row must have received exactly one prediction

assert len(y_pred) == len(unlabel), 'prediction count does not match the unlabeled rows'

In [128]:
# attach the collected predictions to the unlabeled rows as a 'pred' column

unlabel = unlabel.assign(pred = y_pred)

In [129]:
# count nulls per column after stacking both frames - only 'pred' should
# have any (2829 rows), since the labeled frame has no 'pred' column

stacked = pd.concat([label, unlabel])

stacked.isna().sum()

subreddit           0
author              0
date                0
post                0
covid_related       0
suicidal            0
alc_abuse           0
loneliness          0
stress              0
n_words             0
n_sentences         0
lemmatized          0
vectors             0
label               0
pred             2829
dtype: int64

In [130]:
# stack the labeled rows on top of the unlabeled rows

df = pd.concat(objs = [label, unlabel])

In [131]:
# fill null values in pred with label values
# (assigned back to the column: an in-place fillna on df['pred'] acts on a
# temporary selection and does not reliably update df on newer pandas)

df['pred'] = df['pred'].fillna(df['label'])

In [132]:
# confirm no column has missing values left

df.isna().sum()

subreddit        0
author           0
date             0
post             0
covid_related    0
suicidal         0
alc_abuse        0
loneliness       0
stress           0
n_words          0
n_sentences      0
lemmatized       0
vectors          0
label            0
pred             0
dtype: int64

In [133]:
# remove the 'label' column and preview the result

df = df.drop(columns = ['label'])

df.head()

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,pred
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -...",0.0
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,0,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000...",0.0


In [134]:
# 'pred' takes over the name 'label'

df = df.rename(columns = {'pred': 'label'})

In [135]:
# preview the first three rows

df.iloc[:3]

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,label
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0


In [136]:
# cast the label column to int

df = df.astype({'label': int})

In [137]:
# mean of the label column

df.loc[:, 'label'].mean()

0.04649293883591246

In [138]:
# mean label per subreddit

df.groupby('subreddit').agg({'label': 'mean'})

Unnamed: 0_level_0,label
subreddit,Unnamed: 1_level_1
alcoholism,0.006285
anxiety,0.018998
bipolarreddit,0.042398
depression,0.051061
healthanxiety,0.009477
lonely,0.037084
mentalhealth,0.037073
suicidewatch,0.075378


><font size = 4><font color = "purple">TF-IDF with unigrams on unscaled data seems to be the best model.</font></font>