In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split



In [3]:
# Load the text dataset (per the original note: generated from the wrangler file).
data = pd.read_csv('text_dataset.csv') #Dataset generated from the wrangler file
# Shuffle and split into training and test frames. random_state pins the
# shuffle so the split is reproducible; no test_size is given, so
# scikit-learn's default split ratio applies.
train, test = train_test_split(data, random_state=745) #Split the dataset into training and test

In [4]:
# Number of training rows per label, to check class balance.
train['label'].value_counts()

suicide          87069
non-suicide      87060
anxious           7809
normal            7459
stressed          6436
lonely            6384
not depressed     5958
depressed         1734
Name: label, dtype: int64

In [5]:
# Remove the target column from the held-out frame so it only carries the text.
# Reassigning instead of using inplace=True keeps the cell re-runnable:
# errors='ignore' turns a second run into a no-op rather than a KeyError.
test = test.drop(columns='label', errors='ignore')
test.sample(5)

Unnamed: 0,text
250372,"['turn', '21', 'sooni', ""'ll"", 'tri']"
199500,"['want', 'murderedi', '’', 'know', 'els', 'wor..."
41767,"['garden', 'look', 'estim', 'grade', 'im', 'su..."
126212,"['nobodi', 'care', 'til', 'ur', 'dead', '&', '..."
5632,"['rihanna', 'anti', 'us', 'certif', 'albumssin..."


In [6]:
def changing_label_suicide(row):
    """Return 1 when ``row['label']`` equals 'suicide', otherwise 0."""
    return 1 if row['label'] == 'suicide' else 0

def changing_label_lonely(row):
    """Return 1 when ``row['label']`` equals 'lonely', otherwise 0."""
    return 1 if row['label'] == 'lonely' else 0

def changing_label_stressed(row):
    """Return 1 when ``row['label']`` equals 'stressed', otherwise 0."""
    return 1 if row['label'] == 'stressed' else 0

def changing_label_normal(row):
    """Return 1 when ``row['label']`` equals 'normal', otherwise 0."""
    return 1 if row['label'] == 'normal' else 0

def changing_label_non_suicide(row):
    """Return 1 when ``row['label']`` equals 'non-suicide', otherwise 0."""
    return 1 if row['label'] == 'non-suicide' else 0

def changing_label_not_depressed(row):
    """Return 1 when ``row['label']`` equals 'not depressed', otherwise 0."""
    return 1 if row['label'] == 'not depressed' else 0

def changing_label_depressed(row):
    """Return 1 when ``row['label']`` equals 'depressed', otherwise 0."""
    return 1 if row['label'] == 'depressed' else 0

def changing_label_anxious(row):
    """Return 1 when ``row['label']`` equals 'anxious', otherwise 0."""
    return 1 if row['label'] == 'anxious' else 0

In [7]:
# One 0/1 indicator column per label value: 1 where the row's 'label' matches
# the column name, 0 otherwise. The list order is the order in which the new
# columns are appended to the frame.
indicator_labels = [
    'suicide', 'lonely', 'stressed', 'normal',
    'non-suicide', 'not depressed', 'depressed', 'anxious',
]

# A vectorised comparison per label replaces eight row-by-row apply() passes.
# assign() builds a new frame instead of writing into the split result, and
# re-running the cell simply recomputes the same columns.
train = train.assign(**{
    name: (train['label'] == name).astype('int64')
    for name in indicator_labels
})

train.sample(5)

Unnamed: 0,text,label,suicide,lonely,stressed,normal,non-suicide,not depressed,depressed,anxious
75188,"['ca', ""n't"", 'push', 'anymor', '?', ""'m"", 'ti...",suicide,1,0,0,0,0,0,0,0
30128,"['pick', 'someon', 'els', 'pleas', 'dont', 'br...",normal,0,0,0,1,0,0,0,0
134125,"['crimea', 'still', 'recogn', 'ukrainian', 'de...",non-suicide,0,0,0,0,1,0,0,0
101668,"['know', 'els', 'lookin', 'fine', 'u/ethan_kei...",non-suicide,0,0,0,0,1,0,0,0
139098,"['want', 'die', 'everyon', 'keep', 'lie', 'muc...",suicide,1,0,0,0,0,0,0,0


In [8]:
# Label groups ("layers") whose indicator columns are inspected below.
layer_1 = ['suicide', 'non-suicide']
layer_2 = ['depressed', 'not depressed']
layer_3 = ['normal', 'lonely', 'stressed', 'anxious']

# Print the 0/1 counts of every indicator column, one layer after another.
for layer in (layer_1, layer_2, layer_3):
    for column in layer:
        print(train[column].value_counts())

0    122840
1     87069
Name: suicide, dtype: int64
0    122849
1     87060
Name: non-suicide, dtype: int64
0    208175
1      1734
Name: depressed, dtype: int64
0    203951
1      5958
Name: not depressed, dtype: int64
0    202450
1      7459
Name: normal, dtype: int64
0    203525
1      6384
Name: lonely, dtype: int64
0    203473
1      6436
Name: stressed, dtype: int64
0    202100
1      7809
Name: anxious, dtype: int64


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Turn the text column into TF-IDF features over word tokens, capped at
# 10,000 vocabulary terms. The vectorizer is fitted on the training text only
# and then applied unchanged to the test text.
n = len(train)
vec = TfidfVectorizer(max_features=10000, analyzer="word")
trn_term_doc = vec.fit_transform(train['text'])
test_term_doc = vec.transform(test['text'])


In [10]:
# Short aliases for the train / test TF-IDF matrices.
x, test_x = trn_term_doc, test_term_doc

In [11]:
from sklearn.linear_model import LogisticRegression
# Code from: https://www.kaggle.com/code/jhoward/nb-svm-strong-linear-baseline

def pr(y_i, y, features=None):
    """Add-one-smoothed column sums over the rows whose label equals ``y_i``.

    Computes ``(column sums of the selected rows + 1) / (number of selected
    rows + 1)``.

    Parameters
    ----------
    y_i
        Label value to select.
    y
        Array of labels supporting elementwise ``==``; the comparison result
        is used as a boolean row mask on ``features``.
    features : optional
        Matrix whose rows are selected and summed. When omitted, the
        notebook-level ``x`` is used, so existing two-argument calls behave
        as before.

    Returns
    -------
    The smoothed per-column values, in whatever array/matrix type
    ``features[mask].sum(0)`` produces.
    """
    if features is None:
        features = x  # fall back to the global training matrix
    mask = y == y_i
    return (features[mask].sum(0) + 1) / (mask.sum() + 1)

def get_mdl(y):
    """Fit a logistic regression on ratio-scaled features for one target.

    ``y`` must expose ``.values`` (e.g. a pandas Series). The log of
    ``pr(1, y) / pr(0, y)`` is computed per feature, the global matrix ``x``
    is multiplied elementwise by it, and a ``LogisticRegression(C=4,
    max_iter=500)`` is fitted on the result.

    Returns
    -------
    tuple
        ``(fitted_model, log_ratio)``; the same ``log_ratio`` must be applied
        to any matrix passed to the model for prediction.
    """
    targets = y.values
    log_ratio = np.log(pr(1, targets) / pr(0, targets))
    classifier = LogisticRegression(C=4, max_iter=500)
    classifier.fit(x.multiply(log_ratio), targets)
    return classifier, log_ratio

# Layer 1: fit one binary model per label and score the test matrix.
# Column k of preds_1 holds column 1 of predict_proba for layer_1[k];
# array_1[k] keeps the matching [model, ratio] pair for later reuse.
layer_1 = ['suicide', 'non-suicide']
array_1 = []
preds_1 = np.zeros((len(test), len(layer_1)))

for col, label in enumerate(layer_1):
    print('fit', label)
    model, ratio = get_mdl(train[label])
    array_1.append([model, ratio])
    scaled_test = test_x.multiply(ratio)
    preds_1[:, col] = model.predict_proba(scaled_test)[:, 1]


fit suicide
fit non-suicide


In [12]:
def fit_layer(labels):
    """Fit one binary model per label and score the test matrix.

    For every name in ``labels`` this prints a progress line, fits a model on
    ``train[name]`` via ``get_mdl``, and stores column 1 of ``predict_proba``
    on the ratio-scaled ``test_x``.

    Returns
    -------
    tuple
        ``(preds, fitted)`` where ``preds`` has shape
        ``(len(test), len(labels))`` and ``fitted`` is a list of
        ``[model, ratio]`` pairs in the same order as ``labels``.
    """
    preds = np.zeros((len(test), len(labels)))
    fitted = []
    for col, label in enumerate(labels):
        print('fit', label)
        m, r = get_mdl(train[label])
        fitted.append([m, r])
        preds[:, col] = m.predict_proba(test_x.multiply(r))[:, 1]
    return preds, fitted


# Layers 2 and 3 share the same fit-and-score routine instead of two pasted loops.
layer_2 = ['depressed', 'not depressed']
preds_2, array_2 = fit_layer(layer_2)

layer_3 = ['normal', 'lonely', 'stressed', 'anxious']
preds_3, array_3 = fit_layer(layer_3)


fit depressed
fit not depressed
fit normal
fit lonely
fit stressed
fit anxious


In [13]:
def predict_suicide_risk(txt):
    """Score one piece of text against every label in ``layer_1``.

    Parameters
    ----------
    txt : str
        A single document; it is vectorised with ``vec`` as a one-item list.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per ``layer_1`` label, holding column 1 of
        that label's ``predict_proba`` output.
    """
    vtxt = vec.transform([txt])
    # Size by label count, not len(txt): txt is one string, so its length is a
    # character count, and an empty string would leave no row to return.
    likely_label = np.zeros(len(layer_1))

    for i, _ in enumerate(layer_1):
        m, r = array_1[i]
        likely_label[i] = m.predict_proba(vtxt.multiply(r))[0, 1]
    return likely_label
  
def get_label_suicide(predsx):
    """Print and return the ``layer_1`` label with the highest score.

    Parameters
    ----------
    predsx : numpy.ndarray
        Scores, one per ``layer_1`` label (anything exposing ``.tolist()``).

    Returns
    -------
    str
        The label whose score is the largest value above 0; ties keep the
        earliest label. Falls back to "non-suicide" when no score exceeds 0.
    """
    label = "non-suicide"
    best = 0  # named `best` so the builtin max() is not shadowed
    for i, score in enumerate(predsx.tolist()):
        if score > best:
            best = score
            label = layer_1[i]
    print(label, best)
    # Return the label as well: callers assign the result of this function.
    return label

def predict_depression_risk(txt):
    """Score one piece of text against every label in ``layer_2``.

    Parameters
    ----------
    txt : str
        A single document; it is vectorised with ``vec`` as a one-item list.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per ``layer_2`` label, holding column 1 of
        that label's ``predict_proba`` output.
    """
    vtxt = vec.transform([txt])
    # Size by label count, not len(txt): txt is one string, so its length is a
    # character count, and an empty string would leave no row to return.
    likely_label = np.zeros(len(layer_2))

    for i, _ in enumerate(layer_2):
        m, r = array_2[i]
        likely_label[i] = m.predict_proba(vtxt.multiply(r))[0, 1]
    return likely_label

def get_label_depression(predsx):
    """Print and return the ``layer_2`` label with the highest score.

    Parameters
    ----------
    predsx : numpy.ndarray
        Scores, one per ``layer_2`` label (anything exposing ``.tolist()``).

    Returns
    -------
    str
        The label whose score is the largest value above 0; ties keep the
        earliest label. Falls back to "not depressed" when no score exceeds 0.
    """
    label = "not depressed"
    best = 0  # named `best` so the builtin max() is not shadowed
    for i, score in enumerate(predsx.tolist()):
        if score > best:
            best = score
            label = layer_2[i]
    print(label, best)
    # Return the label as well: callers assign the result of this function.
    return label

def predict_other_risks(txt):
    """Score one piece of text against every label in ``layer_3``.

    Parameters
    ----------
    txt : str
        A single document; it is vectorised with ``vec`` as a one-item list.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per ``layer_3`` label, holding column 1 of
        that label's ``predict_proba`` output.
    """
    vtxt = vec.transform([txt])
    # Size by label count, not len(txt): txt is one string, so its length is a
    # character count, and an empty string would leave no row to return.
    likely_label = np.zeros(len(layer_3))

    for i, _ in enumerate(layer_3):
        m, r = array_3[i]
        likely_label[i] = m.predict_proba(vtxt.multiply(r))[0, 1]
    return likely_label

def other_labels(predsx):
    """Print and return the ``layer_3`` label with the highest score.

    Parameters
    ----------
    predsx : numpy.ndarray
        Scores, one per ``layer_3`` label (anything exposing ``.tolist()``).

    Returns
    -------
    str
        The label whose score is the largest value above 0; ties keep the
        earliest label. Falls back to "normal" when no score exceeds 0.
    """
    label = "normal"
    best = 0  # named `best` so the builtin max() is not shadowed
    for i, score in enumerate(predsx.tolist()):
        if score > best:
            best = score
            label = layer_3[i]
    print(label, best)
    # Return the label as well: callers assign the result of this function.
    return label


def classify(txt):
    """Run the three label layers on ``txt`` and report the top label of each.

    The layers are scored in a fixed order (suicide, depression, others), and
    each layer's label helper prints its own result line. Nothing is returned.
    """
    get_label_suicide(predict_suicide_risk(txt))
    get_label_depression(predict_depression_risk(txt))
    other_labels(predict_other_risks(txt))


In [14]:
# Smoke test: classify() prints one "<label> <score>" line per layer.
text = "I want to die"
classify(text)

suicide 0.9629493190071304
not depressed 0.0013556514712671297
lonely 0.003678939621142827
