In [14]:
!pip install wordninja
!pip install scattertext && python -m spacy.en.download
!pip install sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100)
sns.set_style("darkgrid")

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

import wordninja

%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
display(HTML("<style>.container { width:98% !important; }</style>"))

/usr/bin/python3: Error while finding module specification for 'spacy.en.download' (ModuleNotFoundError: No module named 'spacy.en')


In [15]:
model_data = pd.read_csv('../data/fin_combined_data.csv', keep_default_na=False)

In [16]:
model_data.head(3)

Unnamed: 0,title,selftext,is_suicide,url
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,1,https://www.reddit.com/r/depression/comments/d...
1,Regular Check-In Post,Welcome to /r/depression's check-in post - a p...,1,https://www.reddit.com/r/depression/comments/e...
2,I hate it so much when you try and express you...,I've been feeling really depressed and lonely ...,1,https://www.reddit.com/r/depression/comments/f...


In [17]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2516 entries, 0 to 2515
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       2516 non-null   object
 1   selftext    2516 non-null   object
 2   is_suicide  2516 non-null   int64 
 3   url         2516 non-null   object
dtypes: int64(1), object(3)
memory usage: 78.8+ KB


In [18]:
model_data['is_suicide'].mean()
#baseline accuracy of 60.25%

0.6025437201907791

In [19]:
def multi_modelling(columns_list, model):
    for i in columns_list:
        X = model_data[i]
        y = model_data['is_suicide']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
        
        cvec = CountVectorizer()
        cvec.fit(X_train)
        
        X_train = pd.DataFrame(cvec.transform(X_train).todense(),
                               columns=cvec.get_feature_names())
        X_test = pd.DataFrame(cvec.transform(X_test).todense(),
                               columns=cvec.get_feature_names())
        
        nb = MultinomialNB()
        nb.fit(X_train,y_train)
        
        pred = nb.predict(X_test)
        
        tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
        
        
        nb.predict_proba(X_test)
        pred_proba = [i[1] for i in nb.predict_proba(X_test)] 
        auc = roc_auc_score(y_test, pred_proba)

        classi_dict = (classification_report(y_test,pred, output_dict=True))

        model_results = {}
        model_results['series used (X)'] = i
        model_results['model'] = model
        model_results['AUC Score'] = auc
        model_results['precision']= classi_dict['weighted avg']['precision']
        model_results['recall (sensitivity)']= classi_dict['weighted avg']['recall']
        model_results['confusion matrix']={"TP": tp,"FP":fp, "TN": tn, "FN": fn}
        model_results['train accuracy'] = nb.score(X_train, y_train)
        model_results['test accuracy'] = nb.score(X_test, y_test)
        model_results['baseline accuracy']=0.5166
        model_results['specificity']= tn/(tn+fp)  
        model_results['f1-score']= classi_dict['weighted avg']['f1-score']

        model_results
        df_list.append(model_results) 

    pd.set_option("display.max_colwidth", 50)
    return (pd.DataFrame(df_list)).round(2)

In [20]:
columns_list = ['title', 'selftext']
model = "CountVec + MultinomialNB"
df_list=[]
multi_modelling(columns_list, model)

Unnamed: 0,series used (X),model,AUC Score,precision,recall (sensitivity),confusion matrix,train accuracy,test accuracy,baseline accuracy,specificity,f1-score
0,title,CountVec + MultinomialNB,0.96,0.89,0.89,"{'TP': 363, 'FP': 54, 'TN': 196, 'FN': 16}",0.96,0.89,0.52,0.78,0.89
1,selftext,CountVec + MultinomialNB,0.99,0.94,0.94,"{'TP': 377, 'FP': 38, 'TN': 212, 'FN': 2}",0.96,0.94,0.52,0.85,0.94


In [21]:
X = model_data["title"]
y = model_data['is_suicide']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

tvec_optimised = TfidfVectorizer(max_df= 0.5, max_features=70, min_df=2, ngram_range=(1, 3),stop_words = 'english')
X_train_tvec = tvec_optimised.fit_transform(X_train).todense()
X_test_tvec = tvec_optimised.transform(X_test).todense()

nb = MultinomialNB()
nb.fit(X_train_tvec, y_train)
accuracy = nb.score(X_test_tvec, y_test)

pred_proba = [i[1] for i in nb.predict_proba(X_test_tvec)] 
auc = roc_auc_score(y_test, pred_proba)

print("ACCURACY: {}\nAUC SCORE: {}".format(accuracy, auc))


ACCURACY: 0.7376788553259142
AUC SCORE: 0.8325751978891821


In [22]:
X = model_data["selftext"]
y = model_data['is_suicide']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

tvec_optimised = TfidfVectorizer(max_df= 0.5, max_features=70, min_df=2, ngram_range=(1, 3),stop_words = 'english')
X_train_tvec = tvec_optimised.fit_transform(X_train).todense()
X_test_tvec = tvec_optimised.transform(X_test).todense()

nb = MultinomialNB()
nb.fit(X_train_tvec, y_train)
accuracy = nb.score(X_test_tvec, y_test)

pred_proba = [i[1] for i in nb.predict_proba(X_test_tvec)] 
auc = roc_auc_score(y_test, pred_proba)

print("ACCURACY: {}\nAUC SCORE: {}".format(accuracy, auc) )

ACCURACY: 0.821939586645469
AUC SCORE: 0.9560105540897098


In [23]:
model_data2 = pd.read_csv('../data/testdata.csv', keep_default_na=False)

In [24]:
model_data2['is_suicide'].mean()
#baseline accuracy of 65.18%

0.6518010291595198

In [25]:
X = model_data2["title"]
y = model_data2['is_suicide']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

tvec_optimised = TfidfVectorizer(max_df= 0.5, max_features=70, min_df=2, ngram_range=(1, 3),stop_words = 'english')
X_train_tvec = tvec_optimised.fit_transform(X_train).todense()
X_test_tvec = tvec_optimised.transform(X_test).todense()

nb = MultinomialNB()
nb.fit(X_train_tvec, y_train)
accuracy = nb.score(X_test_tvec, y_test)

pred_proba = [i[1] for i in nb.predict_proba(X_test_tvec)] 
auc = roc_auc_score(y_test, pred_proba)

print("ACCURACY: {}\nAUC SCORE: {}".format(accuracy, auc))

ACCURACY: 0.8082191780821918
AUC SCORE: 0.8661506707946337


In [26]:
X = model_data2["selftext"]
y = model_data2['is_suicide']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

tvec_optimised = TfidfVectorizer(max_df= 0.5, max_features=70, min_df=2, ngram_range=(1, 3),stop_words = 'english')
X_train_tvec = tvec_optimised.fit_transform(X_train).todense()
X_test_tvec = tvec_optimised.transform(X_test).todense()

nb = MultinomialNB()
nb.fit(X_train_tvec, y_train)
accuracy = nb.score(X_test_tvec, y_test)

pred_proba = [i[1] for i in nb.predict_proba(X_test_tvec)] 
auc = roc_auc_score(y_test, pred_proba)

print("ACCURACY: {}\nAUC SCORE: {}".format(accuracy, auc) )

ACCURACY: 0.8356164383561644
AUC SCORE: 0.9330237358101136
