In [121]:
# Import the libraries we'll use below.
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style
from sklearn import preprocessing

import pickle
from google.colab import drive

import tensorflow as tf
from tensorflow import keras
from keras import metrics
tf.get_logger().setLevel('INFO')

from sklearn.model_selection import train_test_split

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.util import ngrams

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Mount the google drive to collab
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Load Cleaned Data**

In [3]:
file_path = '/content/drive/MyDrive/Cleaned Suicide Detection 2023-07-08.pkl'

df = pd.read_pickle(file_path)

**Split Cleaned Data into Train, Validation and Test**

In [4]:
df_test = df.sample(frac=0.20, random_state=3)
df_train = df.drop(df_test.index)
df_val = df_train.sample(frac=0.25, random_state=3)
df_train = df.drop(df_val.index)

print(df_test.shape)
print(df_train.shape)
print(df_val.shape)

(46415, 7)
(185659, 7)
(46414, 7)


**Feature Engineering - VADER Sentiment**

In [5]:
sid = SentimentIntensityAnalyzer()

In [6]:
df_train['text_sentiment_score'] = df_train['text_cleaned'].apply(lambda text_cleaned: sid.polarity_scores(text_cleaned))
df_train['text_sentiment_prob'] = df_train['text_sentiment_score'].apply(lambda score_dict: score_dict['compound'])
df_train['text_sentiment'] = df_train['text_sentiment_prob'].apply(lambda c: 'pos' if c >=0 else 'neg')

In [7]:
df_val['text_sentiment_score'] = df_val['text_cleaned'].apply(lambda text_cleaned: sid.polarity_scores(text_cleaned))
df_val['text_sentiment_prob'] = df_val['text_sentiment_score'].apply(lambda score_dict: score_dict['compound'])
df_val['text_sentiment'] = df_val['text_sentiment_prob'].apply(lambda c: 'pos' if c >=0 else 'neg')

In [51]:
df_test['text_sentiment_score'] = df_test['text_cleaned'].apply(lambda text_cleaned: sid.polarity_scores(text_cleaned))
df_test['text_sentiment_prob'] = df_test['text_sentiment_score'].apply(lambda score_dict: score_dict['compound'])
df_test['text_sentiment'] = df_test['text_sentiment_prob'].apply(lambda c: 'pos' if c >=0 else 'neg')

**Feature Engineering - Bi-Grams**

In [8]:
def extract_bigrams(text):
    return list(ngrams(text.split(), 2))

In [54]:
df_train['bi_grams'] = df_train['text_cleaned'].apply(lambda text_cleaned: extract_bigrams(text_cleaned))
df_val['bi_grams'] = df_val['text_cleaned'].apply(lambda text_cleaned: extract_bigrams(text_cleaned))
df_test['bi_grams'] = df_test['text_cleaned'].apply(lambda text_cleaned: extract_bigrams(text_cleaned))

**Baseline Model - Sentiment Binary**

In [52]:
print('Train_Sentiment_Baseline Accuracy:', len(df_train[(df_train["text_sentiment"]=="neg") &
             (df_train["class"]=="suicide")]) / len(df_train))

print('Val_Sentiment_Baseline Accuracy:', len(df_val[(df_val["text_sentiment"]=="neg") &
             (df_val["class"]=="suicide")]) / len(df_val))

print('Test_Sentiment_Baseline Accuracy:', len(df_test[(df_test["text_sentiment"]=="neg") &
             (df_test["class"]=="suicide")]) / len(df_test))

Train_Sentiment_Baseline Accuracy: 0.37181068518089616
Val_Sentiment_Baseline Accuracy: 0.37742922394105227
Test_Sentiment_Baseline Accuracy: 0.3682214801249596


**Logistic Regression Model - Sentiment Feature**

Split X and Y

In [96]:
#Split X and Y
x_train_sent = df_train['text_sentiment_prob'].values.reshape(-1,1)
x_train_bigram = df_train['bi_grams'].values.reshape(-1,1)
y_train = df_train['class'].values.reshape(-1,1)

In [97]:
x_val_sent = df_val['text_sentiment_prob'].values.reshape(-1,1)
x_val_bigram = df_val['bi_grams'].values.reshape(-1,1)
y_val = df_val['class'].values.reshape(-1,1)

In [98]:
x_test_sent = df_test['text_sentiment_prob'].values.reshape(-1,1)
x_test_bigram = df_test['bi_grams'].values.reshape(-1,1)
y_test = df_val['class'].values.reshape(-1,1)

In [99]:
print(x_train_sent.shape)
print(y_train.shape)

(185659, 1)
(185659, 1)


In [104]:
# Model training
logreg = LogisticRegression()
logreg.fit(x_train_sent, y_train)

# Predict on the test set
y_pred = logreg.predict(x_test_sent)
y_pred = y_pred[1:]
probabilities = logreg.predict_proba(x_test_sent)



# Evaluation Test accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

print("LogisticRegression Accuracy:", accuracy)
print("LogisticRegression Classification Report:")
print(classification_report)


print("LogisticRegression Probabilities:")

print(probabilities.shape)
print(probabilities[0][0], probabilities[0][1])
print(probabilities[1][0], probabilities[1][1])

  y = column_or_1d(y, warn=True)


LogisticRegression Accuracy: 0.49594949799629423
LogisticRegression Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.49      0.48      0.49     23050
     suicide       0.50      0.51      0.50     23364

    accuracy                           0.50     46414
   macro avg       0.50      0.50      0.50     46414
weighted avg       0.50      0.50      0.50     46414

LogisticRegression Probabilities:
(46415, 2)
0.41641630141848174 0.5835836985815183
0.6091468906909148 0.3908531093090852
