In [None]:
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, metrics, svm, ensemble # naive_bayes, decomposition, 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
df_suicide = pd.read_csv('r_suicide.csv')
df_suicide.head()

Unnamed: 0.1,Unnamed: 0,Title,Body
0,0,Holy crap I’m back,Is that what this account will be? Pouring my ...
1,1,Telling someone not to kill themselves seems a...,"Also, stop telling people ""Think about your fa..."
2,2,My life is meaningless,\n\n\n\nI just want to kill myself. I cannot g...
3,3,I think this is my last option,It's not that I want to die. It's that I have ...
4,4,I don't think I can do this,I have midterms and the last thing I wanna do ...


In [3]:
df_depression = pd.read_csv('r_depression.csv')
df_depression.head()

Unnamed: 0.1,Unnamed: 0,Title,Body
0,0,Not sure if I'm being annoying and overbearing...,This semester a friend of mine has skipped sev...
1,1,Why can't I just be homeless and die in the cold,I dont want to work I dont want to get up I do...
2,2,I’m better,I’d officially better and I’m ready to leave t...
3,3,i have therapy and i’m not going,i’ve got a therapy appointment in 1 hour and i...
4,4,This one girl has actually driven me into depr...,This girl and I had known each other for a whi...


### dataset preparation

In [5]:
df_suicide = df_suicide[['Title', 'Body']]
df_depression = df_depression[['Title', 'Body']]

# add labels as the "target" to predict
df_suicide['label'] = 's'
df_depression['label'] = 'd'

df = df_suicide.append(df_depression)
df.dropna(inplace = True)
df.head()

Unnamed: 0,Title,Body,label
0,Holy crap I’m back,Is that what this account will be? Pouring my ...,s
1,Telling someone not to kill themselves seems a...,"Also, stop telling people ""Think about your fa...",s
2,My life is meaningless,\n\n\n\nI just want to kill myself. I cannot g...,s
3,I think this is my last option,It's not that I want to die. It's that I have ...,s
4,I don't think I can do this,I have midterms and the last thing I wanna do ...,s


In [8]:
# split into training and test sets
df['Text'] = df['Title'] + ' ' + df['Body']

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['Text'], df['label'],
                                                                      test_size = 0.2, random_state=0)

In [9]:
# label encode the target variable
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

### feature engineering

In [10]:
# 1. Count vectors
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['Text'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [12]:
# 2. word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df['Text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# 3. n-gram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df['Text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [15]:
# 4. Word embeddings - tbc

### model training

In [16]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [17]:
accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print("SVM, Count Vectors: ", accuracy)

# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM, TF-IDF Vectors: ", accuracy)



SVM, Count Vectors:  0.6078431372549019
SVM, N-Gram Vectors:  0.5010141987829615
SVM, TF-IDF Vectors:  0.5010141987829615


In [18]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, NGram TF-IDF: ", accuracy)



LR, Count Vectors:  0.6768086544962812
LR, WordLevel TF-IDF:  0.7126436781609196
LR, NGram TF-IDF:  0.6876267748478702


In [19]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("RF, NGRAM TF-IDF: ", accuracy)



RF, Count Vectors:  0.632183908045977




RF, WordLevel TF-IDF:  0.6227180527383367




RF, NGRAM TF-IDF:  0.6484110885733604
