In [1]:
# Import the libraries we'll use below.
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style
from sklearn import preprocessing

import pickle
from google.colab import drive

import tensorflow as tf
from tensorflow import keras
from keras import metrics
tf.get_logger().setLevel('INFO')

from sklearn.model_selection import train_test_split

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.util import ngrams

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Mount the google drive to collab
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Load Cleaned Data**

In [3]:
file_path = '/content/drive/MyDrive/Cleaned Suicide Detection 2023-07-08.pkl'

df = pd.read_pickle(file_path)

**Split Cleaned Data into Train, Validation and Test**

In [4]:
with open("/content/drive/MyDrive/train_indices.pkl", "rb") as handle:
    train_indices = pickle.load(handle)
with open("/content/drive/MyDrive/val_indices.pkl", "rb") as handle:
    val_indices = pickle.load(handle)
with open("/content/drive/MyDrive/test_indices.pkl", "rb") as handle:
    test_indices = pickle.load(handle)

df_train = df.loc[train_indices]
df_val = df.loc[val_indices]
df_test = df.loc[test_indices]

print(df_test.shape)
print(df_train.shape)
print(df_val.shape)

(46415, 7)
(139243, 7)
(46415, 7)


**Feature Engineering - VADER Sentiment**

In [5]:
sid = SentimentIntensityAnalyzer()

In [6]:
df_train['text_sentiment_score'] = df_train['text_cleaned'].apply(lambda text_cleaned: sid.polarity_scores(text_cleaned))
df_train['text_sentiment_prob'] = df_train['text_sentiment_score'].apply(lambda score_dict: score_dict['compound'])
df_train['text_sentiment'] = df_train['text_sentiment_prob'].apply(lambda c: 'pos' if c >=0 else 'neg')

In [7]:
df_val['text_sentiment_score'] = df_val['text_cleaned'].apply(lambda text_cleaned: sid.polarity_scores(text_cleaned))
df_val['text_sentiment_prob'] = df_val['text_sentiment_score'].apply(lambda score_dict: score_dict['compound'])
df_val['text_sentiment'] = df_val['text_sentiment_prob'].apply(lambda c: 'pos' if c >=0 else 'neg')

In [8]:
df_test['text_sentiment_score'] = df_test['text_cleaned'].apply(lambda text_cleaned: sid.polarity_scores(text_cleaned))
df_test['text_sentiment_prob'] = df_test['text_sentiment_score'].apply(lambda score_dict: score_dict['compound'])
df_test['text_sentiment'] = df_test['text_sentiment_prob'].apply(lambda c: 'pos' if c >=0 else 'neg')

**Baseline Model - Sentiment Binary**

In [9]:
print('Train_Sentiment_Baseline Accuracy:', len(df_train[(df_train["text_sentiment"]=="neg") &
             (df_train["class"]=="suicide")]) / len(df_train))

print('Val_Sentiment_Baseline Accuracy:', len(df_val[(df_val["text_sentiment"]=="neg") &
             (df_val["class"]=="suicide")]) / len(df_val))

print('Test_Sentiment_Baseline Accuracy:', len(df_test[(df_test["text_sentiment"]=="neg") &
             (df_test["class"]=="suicide")]) / len(df_test))

Train_Sentiment_Baseline Accuracy: 0.3726147813534612
Val_Sentiment_Baseline Accuracy: 0.37257352149089734
Test_Sentiment_Baseline Accuracy: 0.37425401271140796


In [10]:
df_train.head()

Unnamed: 0,text,class,text_cleaned,len_text,len_text_cleaned,pos_tags,chunk_chink,text_sentiment_score,text_sentiment_prob,text_sentiment
64811,Those salmonlings from the Splatoon 3 trailer ...,non-suicide,salmon ling platoon trailer want pet right,94,42,"[(salmon, NN), (ling, VBG), (platoon, NN), (3,...","[[(salmon, NN)], [(ling, VBG)], [(platoon, NN)...","{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...",0.0772,pos
26585,Is anyone else’s reddit down or just me? I’ve ...,non-suicide,anyone geddit trouble logging geddit christ se...,136,65,"[(anyone, NN), (else, RB), (geddit, NN), (trou...","[[(anyone, NN)], (else, RB), [(geddit, NN)], [...","{'neg': 0.435, 'neu': 0.565, 'pos': 0.0, 'comp...",-0.6597,neg
87417,I have so many other feelings than this... But...,suicide,feeling silence think let suicide day born kne...,1788,834,"[(many, JJ), (feeling, VBG), (silence, NN), (c...","[(many, JJ), [(feeling, VBG)], [(silence, NN)]...","{'neg': 0.395, 'neu': 0.458, 'pos': 0.147, 'co...",-0.9938,neg
212624,Autism is a perfectly valid reason to kill you...,suicide,autism valid reason kill autistic woman nobody...,525,268,"[(autism, NN), (perfectly, RB), (valid, JJ), (...","[[(autism, NN)], (perfectly, RB), [(valid, JJ)...","{'neg': 0.393, 'neu': 0.364, 'pos': 0.243, 'co...",-0.9231,neg
56128,i wish i didn’t have cowardice tendencies so i...,suicide,wish cowardice tendency end life think little ...,379,182,"[(wish, JJ), (cowardice, NN), (tendency, NN), ...","[[(wish, JJ), (cowardice, NN)], [(tendency, NN...","{'neg': 0.371, 'neu': 0.456, 'pos': 0.172, 'co...",-0.7044,neg


**Logistic Regression Model - Sentiment Feature**

Split X and Y

In [11]:
x_train_sent = df_train['text_sentiment_prob'].values.reshape(-1,1)
y_train = df_train['class'].values.reshape(-1,1)

In [12]:
x_val_sent = df_val['text_sentiment_prob'].values.reshape(-1,1)
y_val = df_val['class'].values.reshape(-1,1)

In [13]:
x_test_sent = df_test['text_sentiment_prob'].values.reshape(-1,1)
y_test = df_val['class'].values.reshape(-1,1)

In [14]:
print(x_train_sent.shape)
print(y_train.shape)

(139243, 1)
(139243, 1)


In [15]:
print(x_val_sent.shape)
print(y_val.shape)

(46415, 1)
(46415, 1)


In [16]:
# Model training
logreg = LogisticRegression()
logreg.fit(x_train_sent, y_train)

  y = column_or_1d(y, warn=True)


In [17]:
# Predict on the test set
y_pred = logreg.predict(x_test_sent)
probabilities = logreg.predict_proba(x_test_sent)

# Evaluation Test accuracy
accuracy = accuracy_score(y_test, y_pred)
classification = classification_report(y_test, y_pred)

print("LogisticRegression Accuracy:", accuracy)
print("LogisticRegression Classification Report:")
print(classification)

print("LogisticRegression Probabilities:")

print(probabilities.shape)
print(probabilities[0][0], probabilities[0][1])
print(probabilities[1][0], probabilities[1][1])

LogisticRegression Accuracy: 0.5032640310244533
LogisticRegression Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.50      0.48      0.49     23185
     suicide       0.50      0.53      0.52     23230

    accuracy                           0.50     46415
   macro avg       0.50      0.50      0.50     46415
weighted avg       0.50      0.50      0.50     46415

LogisticRegression Probabilities:
(46415, 2)
0.2736855093500312 0.7263144906499688
0.7498389596304638 0.25016104036953624


In [18]:
df = pd.DataFrame(probabilities)
df.to_csv("/content/drive/MyDrive/sentiment_probabilities_test.csv")

In [19]:
# Predict on the val set
y_pred = logreg.predict(x_val_sent)
probabilities = logreg.predict_proba(x_val_sent)

# Evaluation Test accuracy
accuracy = accuracy_score(y_val, y_pred)
classification = classification_report(y_val, y_pred)

print("LogisticRegression Accuracy:", accuracy)
print("LogisticRegression Classification Report:")
print(classification)

print("LogisticRegression Probabilities:")

print(probabilities.shape)
print(probabilities[0][0], probabilities[0][1])
print(probabilities[1][0], probabilities[1][1])

LogisticRegression Accuracy: 0.6912205106107939
LogisticRegression Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.70      0.67      0.68     23185
     suicide       0.68      0.72      0.70     23230

    accuracy                           0.69     46415
   macro avg       0.69      0.69      0.69     46415
weighted avg       0.69      0.69      0.69     46415

LogisticRegression Probabilities:
(46415, 2)
0.30197677551407465 0.6980232244859254
0.29683563318498574 0.7031643668150143


In [20]:
##COVERT TO CSV. COMMENT OUT ONE AND RESET AND RUN ALL
df = pd.DataFrame(probabilities)
df.to_csv("/content/drive/MyDrive/sentiment_probabilities_val.csv")
