In [49]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# nlp vectorizers

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
import gensim

# model and metrics

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

# ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [98]:
# read in labeled data

rv1 = pd.read_csv('../data/cleaned/review1.csv')
rv2 = pd.read_csv('../data/cleaned/review2.csv')

labeled = pd.concat([rv1, rv2])

labeled.head()

Unnamed: 0,id,post,label
0,1428,How do you guys feel less dead inside? I've go...,1
1,1429,i want to get help but i don’t know how my par...,1
2,1430,I can’t stop myself from loving this fictional...,0
3,1431,There's no point in continuing I lost my job l...,1
4,1432,My friends keep finding my reddit accounts. I ...,0


In [99]:
# set the index

labeled.set_index('id', inplace = True)

labeled.head(3)

Unnamed: 0_level_0,post,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1428,How do you guys feel less dead inside? I've go...,1
1429,i want to get help but i don’t know how my par...,1
1430,I can’t stop myself from loving this fictional...,0


In [100]:
# read in the main dataframe

df = pd.read_json('../data/cleaned/json.json')

df.head()

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668..."
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026..."
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181..."
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -..."
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,9,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000..."


In [101]:
# merge the labeled data to main dataframe

df = pd.merge(df, labeled, how = 'left', on = 'post')

In [102]:
# check for nulls

df.isnull().sum()

subreddit            0
author               0
date                 0
post                 0
covid_related        0
suicidal             0
alc_abuse            0
loneliness           0
stress               0
n_words              0
n_sentences          0
lemmatized           0
vectors              0
label            99208
dtype: int64

In [103]:
# fill null values with 99 for easy distinction

df.fillna(-1, inplace = True)

In [104]:
df.isnull().sum()

subreddit        0
author           0
date             0
post             0
covid_related    0
suicidal         0
alc_abuse        0
loneliness       0
stress           0
n_words          0
n_sentences      0
lemmatized       0
vectors          0
label            0
dtype: int64

In [105]:
df.replace({9: 0}, inplace = True)

In [106]:
# divide the datasets to labeled and not labeled

unlabel = df[df['label'] == -1]
label = df[df['label'] != -1]

## Unlabeled Samples - 500

In [107]:
## choose a small portion of samples including labeled data to test the model
# combine labeled data and sampled unlabeled data for test

sampled = unlabel.sample(n = 500, random_state = 2020)

df = pd.concat([label, sampled])

print(df.shape)
df.head()

(3329, 14)


Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,label
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -...",0.0
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,0,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000...",0.0


In [108]:
# check the distribution of subreddits
# labeled dataset contains 500 from suicide, 500 from depression, 1428 from alcoholism, 390 from bipolarreddit

df.groupby('subreddit')['subreddit'].count()

subreddit
alcoholism       1432
anxiety            83
bipolarreddit     394
depression        705
healthanxiety      10
lonely             39
mentalhealth       54
suicidewatch      612
Name: subreddit, dtype: int64

In [109]:
# most of the suicidal posts from labeled data came from suicide watch and depression - check

print(label[(label['subreddit'] == 'suicidewatch') | (label['subreddit'] == 'depression')][['label']].mean())
print(label[label['subreddit'] == 'suicidewatch'][['label']].mean())
print(label[label['subreddit'] == 'depression'][['label']].mean())
print()
print(label['label'].mean())

label    0.449751
dtype: float64
label    0.650099
dtype: float64
label    0.249004
dtype: float64

0.17497348886532343


In [110]:
# set up X and y

X = df['lemmatized']
y = df['label']

In [111]:
len(y)

3329

In [112]:
y.unique()

array([ 0.,  1., -1.])

### TF-IDF Vectorizer with Parameters:
#### max_features = 256, max_df = 0.8, min_df = 10, ngram_range = (1, 1)

In [17]:
# update stop_words for TF-IDF vectorizer

remove = ['-pron-', 'feel', 'know', 'want', 'life', 'go', 'think', 'make', 'people', 'really', 'even', 'much', 'now', 
          'pron', 'don', 'will', 'try', 'talk', 'friends', 'tell', 'just', 'like', 'time', 'want', 'well', 'thing', 'day',
          'friend', 'help', 'year', 'bad', 've', 'say', 'good', 'need', 'way', 'right', 'month', 'amp', 'x200b']

my_stop_words = text.ENGLISH_STOP_WORDS.union(remove)

In [18]:
# instantiate Tf-IDF

tvec = TfidfVectorizer(stop_words = my_stop_words, max_df = .80, min_df = 10, max_features = 256, ngram_range = (1, 1))

In [19]:
# fit/transform X then save it to a dataframe

t = pd.DataFrame(tvec.fit_transform(X).toarray(),
                 columns = tvec.get_feature_names())
t.head()

Unnamed: 0,10,20,30,aa,able,abuse,act,actually,addiction,advice,...,wish,withdrawal,wonder,work,world,worried,worry,write,wrong,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095629,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.278592,0.0,0.0,0.0,0.0,0.0,0.209062,...,0.0,0.2621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# instantiate LabelPropagation model, fit, and predict

model = LabelPropagation()
model.fit(t, y)
pred = np.array(model.predict(t))

In [21]:
# check the length of both label and prediction

print(len(y))
print(len(pred))

3329
3329


In [22]:
# create a dataframe just with label and prediction

d = {'label': y, 'pred': pred}

p = pd.DataFrame(d)

In [23]:
# create a separate dataframe to check the accuracy of the model with the labeled data

cf = p[p['label'] != -1]

In [24]:
# get the values from confusion matrix

tn, fp, fn, tp = confusion_matrix(cf['label'], cf['pred'], normalize = 'true').ravel()

In [25]:
# check the values from confusion matrix

print(f'True Negative: {tn}')
print(f'False Positive: {fp}')
print(f'False Negative: {fn}')
print(f'True Positive: {tp}')

True Negative: 1.0
False Positive: 0.0
False Negative: 0.006060606060606061
True Positive: 0.9939393939393939


In [26]:
# classification report

print(classification_report(cf['label'], cf['pred'], target_names = ['non-suicidal', 'suicidal']))

              precision    recall  f1-score   support

non-suicidal       1.00      1.00      1.00      2334
    suicidal       1.00      0.99      1.00       495

    accuracy                           1.00      2829
   macro avg       1.00      1.00      1.00      2829
weighted avg       1.00      1.00      1.00      2829



In [27]:
# check pred for unlabeled to make sure it only contains 0 and 1

cf_u = p[p['label'] == -1]

set(cf_u['pred'])

{0.0, 1.0}

### TF-IDF Vectorizer with Parameters:
#### max_features = 256, max_df = 0.8, min_df = 10, ngram_range = (2, 2)

In [28]:
# instantiate Tf-IDF

tvec = TfidfVectorizer(stop_words = my_stop_words, max_df = .80, min_df = 10, max_features = 256, ngram_range = (2, 2))

In [29]:
# fit/transform X then save it to a dataframe

t = pd.DataFrame(tvec.fit_transform(X).toarray(),
                 columns = tvec.get_feature_names())
t.head()

Unnamed: 0,10 ago,10 hour,100 mg,20 old,24 hour,aa meeting,admit problem,advice appreciate,ago start,alcohol abuse,...,weight gain,withdrawal symptom,work drink,work hard,work home,work hour,work week,work work,worth live,www reddit
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.358126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# instantiate LabelPropagation model, fit, and predict

model = LabelPropagation()
model.fit(t, y)
pred = np.array(model.predict(t))

In [31]:
# check the length of both label and prediction

print(len(y))
print(len(pred))

3329
3329


In [32]:
# create a dataframe just with label and prediction

d = {'label': y, 'pred': pred}

p = pd.DataFrame(d)

In [33]:
# create a separate dataframe to check the accuracy of the model with the labeled data

cf = p[p['label'] != -1]

In [34]:
# get the values from confusion matrix

tn, fp, fn, tp = confusion_matrix(cf['label'], cf['pred'], normalize = 'true').ravel()

In [35]:
# check the values from confusion matrix

print(f'True Negative: {tn}')
print(f'False Positive: {fp}')
print(f'False Negative: {fn}')
print(f'True Positive: {tp}')

True Negative: 0.9875749785775493
False Positive: 0.012425021422450729
False Negative: 0.49696969696969695
True Positive: 0.503030303030303


In [36]:
# classification report

print(classification_report(cf['label'], cf['pred'], target_names = ['non-suicidal', 'suicidal']))

              precision    recall  f1-score   support

non-suicidal       0.90      0.99      0.94      2334
    suicidal       0.90      0.50      0.64       495

    accuracy                           0.90      2829
   macro avg       0.90      0.75      0.79      2829
weighted avg       0.90      0.90      0.89      2829



In [37]:
# check pred for unlabeled to make sure it only contains 0 and 1

cf_u = p[p['label'] == -1]

set(cf_u['pred'])

{0.0, 1.0}

><font size = 4><font color = "purple">
Unigram works much much better than 2-gram, so we will use the parameter 1-gram for TF-IDF Vectorizer.</font></font>

## Implement seudo labeling for entire dataset
### by sections due to its size

In [38]:
# split dataframe into relatively same size

n = 5000

list_df = [unlabel[i: (i + n)] for i in range(0, unlabel.shape[0], n)]

len(list_df)

20

In [39]:
# check the shape of each dataset

[j.shape for j in list_df]

[(5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (5000, 14),
 (4208, 14)]

In [40]:
# instantiate tf-idf vectorizer

tvec = TfidfVectorizer(stop_words = my_stop_words, max_features = 256, max_df = 0.8, min_df = 10, ngram_range= (1, 1))

In [46]:
# combine labeled and sections of unlabeled dataframe
# fit/predict - and get classification report to compare the f1 score

def gamma(df1, df_list, gamma):
    
    for i in range(len(df_list)):
        
        df = pd.concat([df1, df_list[i]])
        
        X = df['lemmatized']
        y = df['label']
        
        t = pd.DataFrame(tvec.fit_transform(X).toarray(), columns = tvec.get_feature_names())
        
        model = LabelPropagation(gamma = gamma)
        
        model.fit(t, y)
        
        pred = np.array(model.predict(t))
        
        dct = {'label': y, 'pred': pred}
        
        values = pd.DataFrame(dct)
        
        scoring = values[values['label'] != -1]

        return print(classification_report(scoring['label'], scoring['pred'], target_names = ['non-suicidal', 'suicidal']))

In [47]:
# with gamma = 10

gamma(label, list_df, 10)



              precision    recall  f1-score   support

non-suicidal       1.00      1.00      1.00      2334
    suicidal       1.00      0.99      1.00       495

    accuracy                           1.00      2829
   macro avg       1.00      1.00      1.00      2829
weighted avg       1.00      1.00      1.00      2829



><font size = 4><font color = 'purple'>Gamma does not seem to impact the scores. Implement pseudo labeling.</font></font>

In [54]:
# create an empty list to store the predictions for unlabeled data

y_pred = []

# iterate through sectioned dataframe

for i in range(len(list_df)):
    
    # combine the labeled data and unlabeled data
    
    df = pd.concat([label, list_df[i]])
    
    # set X and y
    
    X = df['lemmatized']
    y = df['label']
    
    # vectorized values to a dataframe
    
    t = pd.DataFrame(tvec.fit_transform(X).toarray(), columns = tvec.get_feature_names())
    
    # instantiate the model
    
    model = LabelPropagation()
    
    # fit and predict
    
    model.fit(t, y)
    pred = np.array(model.predict(t))
    
    # extract predicted values just for the unlabeled
    
    dct = {'label': y, 'pred': pred}
    values = pd.DataFrame(dct)
    non_scoring = values[values['label'] == -1]

    # extend the prediction to the list
    
    y_pred.extend(non_scoring['pred'])

In [56]:
# create a column to store prediction

unlabel['pred'] = y_pred

In [58]:
# check the null values - 'pred' column should have 2829 rows of null

pd.concat([label, unlabel]).isnull().sum()

subreddit           0
author              0
date                0
post                0
covid_related       0
suicidal            0
alc_abuse           0
loneliness          0
stress              0
n_words             0
n_sentences         0
lemmatized          0
vectors             0
label               0
pred             2829
dtype: int64

In [59]:
# combine labeled and unlabeled dataframes

df = pd.concat([label, unlabel])

In [70]:
# fill null values in pred with label values

df['pred'].fillna(df['label'], inplace = True)

In [71]:
# check for null

df.isnull().sum()

subreddit        0
author           0
date             0
post             0
covid_related    0
suicidal         0
alc_abuse        0
loneliness       0
stress           0
n_words          0
n_sentences      0
lemmatized       0
vectors          0
label            0
pred             0
dtype: int64

In [74]:
# drop label column

df.drop(columns = ['label'], axis = 1, inplace = True)

df.head()

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,pred
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0
3,alcoholism,SauceoffSauceOn,2020-01-01,I am an Alcoholic. How do I quit? I have been ...,0,0,1,0,1,111,14,-pron- be an alcoholic how do -pron- quit -pro...,"[-0.0117096035, 0.1865714192, -0.2576603591, -...",0.0
4,alcoholism,ben42187,2020-01-01,Funniest Thing about Alcoholism With every oth...,0,0,1,0,0,136,0,funniest thing about alcoholism with every oth...,"[-0.0443742387, 0.1812106818, -0.1554362029000...",0.0


In [78]:
# rename column pred as label

df.rename(columns = {'pred': 'label'}, inplace = True)

In [80]:
# check

df.head(3)

Unnamed: 0,subreddit,author,date,post,covid_related,suicidal,alc_abuse,loneliness,stress,n_words,n_sentences,lemmatized,vectors,label
0,alcoholism,glorybellpirate,2020-01-01,Day 1 of sobriety Feeling anxious and letting ...,0,0,1,0,0,24,3,day 1 of sobriety feel anxious and let the fee...,"[-0.026765203100000003, 0.2515704036, -0.15668...",0.0
1,alcoholism,EhndlessSl0th,2020-01-01,"Started the New Year with a bang. Hey, I'm new...",0,1,1,0,1,577,41,start the new year with a bang hey -pron- be n...,"[0.0102483444, 0.18354494870000002, -0.2263026...",0.0
2,alcoholism,the_kinky_penguin,2020-01-01,Why can't I get drunk anymore I've been a heav...,0,0,1,0,0,64,5,why ca not -pron- get drunk anymore -pron- hav...,"[-0.09739924970000001, 0.1839587241, -0.227181...",0.0


In [82]:
# change label column to int

df['label'] = df['label'].astype(int)

In [83]:
# check the label

df['label'].mean()

0.055274067250115154

In [86]:
# store labeled dataset witout spacy vectors in a new dataframe

labeled = df.drop(columns = ['vectors'], axis = 1)

In [87]:
# check each subreddit how they are labeled

labeled.groupby('subreddit')[['label']].mean()

Unnamed: 0_level_0,label
subreddit,Unnamed: 1_level_1
alcoholism,0.006285
anxiety,0.02466
bipolarreddit,0.039474
depression,0.056504
healthanxiety,0.009477
lonely,0.0489
mentalhealth,0.034779
suicidewatch,0.100598


## Word2Vec

In [117]:
## import libraries
# faster looping

from itertools import islice

# create own warning sign

import logging

# nlp

import nltk

In [97]:
# create word2vec model

file_path = '/Users/juhee/Desktop/GA/08-week/8.05-lesson-word-vectors/lexvec.enwiki+newscrawl.300d.W.pos.vectors'

w2v = gensim.models.KeyedVectors.load_word2vec_format(file_path)

In [113]:
# precompute l2 normalized vectors 

w2v.init_sims(replace = True)

# check the vocabulary index between 13030 and 13048 in w2v model

list(islice(w2v.vocab, 13030, 13048))

['bal',
 'harley',
 'proponents',
 'escalating',
 'madeleine',
 'crushing',
 'yielded',
 'understandable',
 'agnes',
 'victorious',
 'rockefeller',
 'deeds',
 'jude',
 'doomed',
 'sundays',
 'rejecting',
 'prep',
 'concession']

In [114]:
# average word vectors

def word_averaging(model, words):
    
    # store all words from posts as a set to filter duplicates and average in a list
    
    all_words, mean = set(), []
    
    for word in words:
        
        # word is in an array form (vector arrays), then append it to mean
        
        if isinstance(word, np.ndarray):
            mean.append(word)
        
        # word is in model vocabulary, append it to mean, and add it to the words set
        
        elif word in model.vocab:
            mean.append(model.syn0norm[model.vocab[word].index])
            all_words.add(model.vocab[word].index)
    
    # word cannot be computated to vectoris, generate a warning and fill it with 0
    
    if not mean:
        logging.warning('cannot compute similarity with no input %s', words)
        return np.zeros(model.vector_size, )
    
    # scale vectors to unit length
    
    mean = gensim.matutils.unitvec(np.array(mean).mean(axis = 0)).astype(np.float32)
    
    return mean

def word_averaging_list(model, text_list):
    
    # stack the arrays vertically
    
    return np.vstack([word_averaging(model, post) for post in text_list])

In [118]:
# create a function to tokenize

def w2v_tokenize_text(text):
    
    # store tokes in a list
    
    tokens = []
    
    # iterate through every sentence in a post
    
    for sent in nltk.sent_tokenize(text.lower(), language = 'english'):
        
        # iterate through every word in a sentence
        
        for word in nltk.word_tokenize(sent, language = 'english'):
            
            # discard 1 letter words
            
            if len(word) < 2:
                continue
            
            # add word to the token list
            
            tokens.append(word)
    
    return tokens

In [119]:
# tokenize

tokenized = df.apply(lambda r: w2v_tokenize_text(r['post']), axis = 1).values

In [120]:
tokenized[0]

['day',
 'of',
 'sobriety',
 'feeling',
 'anxious',
 'and',
 'letting',
 'the',
 'feelings',
 'flow',
 'why',
 'is',
 'alcohol',
 'everywhere',
 'on',
 'tv',
 'breathe',
 'through',
 'it']

In [121]:
# get the word average

word_average = word_averaging_list(w2v, tokenized)

In [122]:
# get information of vectors

print(len(word_average))
print(len(word_average[0]))

3329
300


In [123]:
# store word_average in a dataframe

w2vecs = pd.DataFrame(word_average, columns = list(range(1, 301)))

In [124]:
w2vecs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,0.037460,0.034029,0.055597,-0.088117,-0.058507,0.005221,-0.106107,-0.059396,0.026664,0.032608,...,0.004527,-0.081843,-0.004714,-0.122824,-0.052387,0.063062,0.056496,-0.050070,-0.037649,-0.038785
1,0.035795,0.055317,0.068639,-0.036306,-0.072369,0.051879,-0.086426,-0.074786,0.000019,0.040920,...,0.010527,-0.012978,-0.032673,-0.137554,-0.038872,0.092861,0.045020,-0.032196,-0.007265,-0.039375
2,0.046500,-0.021264,0.074944,-0.025710,-0.038560,0.065254,-0.095385,-0.117162,0.001046,0.068352,...,-0.009813,-0.051927,-0.042510,-0.111192,-0.032969,0.086183,0.038650,-0.065420,-0.031204,0.027661
3,0.080897,-0.036834,0.100167,0.018957,-0.043922,0.044533,-0.082013,-0.142459,0.035959,0.053978,...,0.037125,0.006417,-0.057249,-0.074064,-0.022859,0.052665,0.005002,-0.068471,0.006779,-0.003934
4,0.040934,0.053291,0.078219,-0.061615,-0.055173,0.040084,-0.112820,-0.092787,0.006901,0.079071,...,-0.014055,0.003156,-0.016104,-0.124517,-0.042584,0.092943,0.050748,-0.068932,-0.005761,-0.037822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3324,0.065093,0.021342,0.093285,-0.054928,-0.046333,0.056814,-0.093443,-0.123148,0.030101,0.043364,...,0.036998,-0.031662,-0.037633,-0.121989,-0.023170,0.061828,0.028451,-0.062842,0.019890,-0.021757
3325,0.024629,0.011509,0.067459,-0.048470,-0.029087,0.053946,-0.097083,-0.137825,0.027765,0.041543,...,0.025892,-0.008239,-0.043675,-0.097586,-0.015809,0.087455,0.005613,-0.064735,-0.022789,-0.010430
3326,0.048005,0.014318,0.075801,-0.024941,-0.050677,0.064399,-0.105592,-0.124935,0.022253,0.062350,...,0.038320,-0.003675,-0.036363,-0.097034,-0.023825,0.076075,0.013838,-0.076001,0.013714,-0.008262
3327,0.047735,0.015429,0.102364,0.036355,-0.081131,0.010826,-0.052715,-0.071956,-0.019834,-0.031022,...,0.015134,0.017768,-0.051100,-0.081129,-0.034281,0.073929,0.030706,-0.056229,-0.020722,0.000609


In [126]:
# instantiate LabelPropagation model, fit, and predict

model = LabelPropagation()
model.fit(w2vecs, y)
pred = np.array(model.predict(w2vecs))

In [127]:
# check the length of both label and prediction

print(len(y))
print(len(pred))

3329
3329


In [128]:
# create a dataframe just with label and prediction

d = {'label': y, 'pred': pred}

p = pd.DataFrame(d)

In [129]:
# create a separate dataframe to check the accuracy of the model with the labeled data

cf = p[p['label'] != -1]

In [130]:
# get the values from confusion matrix

tn, fp, fn, tp = confusion_matrix(cf['label'], cf['pred'], normalize = 'true').ravel()

In [131]:
# check the values from confusion matrix

print(f'True Negative: {tn}')
print(f'False Positive: {fp}')
print(f'False Negative: {fn}')
print(f'True Positive: {tp}')

True Negative: 1.0
False Positive: 0.0
False Negative: 0.9818181818181818
True Positive: 0.01818181818181818


In [132]:
# classification report

print(classification_report(cf['label'], cf['pred'], target_names = ['non-suicidal', 'suicidal']))

              precision    recall  f1-score   support

non-suicidal       0.83      1.00      0.91      2334
    suicidal       1.00      0.02      0.04       495

    accuracy                           0.83      2829
   macro avg       0.91      0.51      0.47      2829
weighted avg       0.86      0.83      0.75      2829



In [133]:
# check pred for unlabeled to make sure it only contains 0 and 1

cf_u = p[p['label'] == -1]

set(cf_u['pred'])

{0.0}