In [56]:
import pandas as pd
import os
import nltk
import re
import sklearn
import numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [57]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

def processing_text(series_to_process):
    new_list = []
    tokenizer = RegexpTokenizer(r'(\w+)')
    lemmatizer = WordNetLemmatizer()
    for i in range(len(series_to_process)):
        #TOKENISED ITEM(LONG STRING) IN A LIST
        dirty_string = (series_to_process)[i].lower()
        words_only = tokenizer.tokenize(dirty_string) #WORDS_ONLY IS A LIST THAT DOESN'T HAVE PUNCTUATION
        #LEMMATISE THE ITEMS IN WORDS_ONLY
        words_only_lem = [lemmatizer.lemmatize(i) for i in words_only]
        #REMOVING STOP WORDS FROM THE LEMMATIZED LIST
        words_without_stop = [i for i in words_only_lem if i not in stopwords.words("english")]
        #RETURN SEPERATED WORDS INTO LONG STRING
        long_string_clean = " ".join(word for word in words_without_stop)
        new_list.append(long_string_clean)
    return new_list

def processing_label(series_to_process):
    new_list = []
    for i in range(len(series_to_process)):
        if series_to_process[i] in ("suicide", "depression"):
            new_list.append(1)
        else:
            new_list.append(0)
    return new_list

In [58]:
class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)

In [59]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')


### Consolidate Depression and Suicide files into one dataframe with 2 columnns (label, text)
### Ensured 50/50 breakdown between the 2.
### Need a third
df = pd.DataFrame(columns=["label", "text"])
i = 0
for file in os.listdir("./data/slighly less uncleaned/Suicide"):
    text = open(("data/slighly less uncleaned/Suicide/" + file)).read()
    df.loc[i] = ["suicide"] + [text]
    i += 1

j = 0
while j < i:
    file = os.listdir("./data/slighly less uncleaned/Depression")[j]
    print(file)
    text = open(("data/slighly less uncleaned/Depression/" + file)).read()
    df.loc[i + j] = ["depression"] + [text]
    j += 1

k = 0
while k < j:
    file = os.listdir("./data/slighly less uncleaned/Recovery")[k]
    print(file)
    text = open(("data/slighly less uncleaned/Recovery/" + file)).read()
    df.loc[i + j + k] = ["recovery"] + [text]
    k += 1

## We know that this dataset is collected from actual surveys of participants.
## thus we can safely disregard syntactic or semantic structures of the text.

text_values = df["text"]
print(df)


## we do some simple normalization of the text data.

i = 0
while i < 3\
        *j:
    text = df.loc[i]['text']
    text = normalize_document(text)
    df.loc[i]['text'] = text
    i += 1

df["text_clean"] = processing_text(df["text"])


Depression002.txt
Depression003.txt
Depression004.txt
Depression005.txt
Depression006.txt
Depression007.txt
Depression008.txt
Depression009.txt
Depression010.txt
Depression011.txt
Depression012.txt
Depression013.txt
Depression014.txt
Depression015.txt
Depression016.txt
Depression017.txt
Depression018.txt
Depression019.txt
Depression020.txt
Depression021.txt
Depression022.txt
Depression023.txt
Depression024.txt
Depression025.txt
Depression026.txt
Depression027.txt
Depression028.txt
Depression030.txt
Depression031.txt
Depression032.txt
Depression033.txt
Depression034.txt
Depression035.txt
Depression036.txt
Depression037.txt
Depression038.txt
Depression039.txt
Depression040.txt
Depression041.txt
Depression042.txt
Depression043.txt
Depression044.txt
Depression045.txt
Depression046.txt
Depression047.txt
Depression048.txt
Depression049.txt
Depression050.txt
Depression051.txt
Depression052.txt
Depression054.txt
Depression055.txt
Depression057.txt
Depression058.txt
Depression059.txt
Depression

Credits to: https://github.com/hesamuel/goodbye_world/blob/master/code/03_Modelling.ipynb

In [69]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100)
sns.set_style("white")

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)



In [70]:

df["is_bad"] = processing_label(df["label"])
print(df)

everyone supported far although wasnt really suicidal didnt care would suddenly died talking saved life putting back track ive depressed really long time roller coaster ev...   
910  im feeling better spiraled depression recently im beginning see light didnt see psychologist dad put still want see himher though well treat hadnt wouldnt making thread number things help think bi...   
911  happy good day woke early today oversleeping past two weeks calorie counter account got first time weeks junk food finished yesterday drank remainder juice today hot day degrees first time stayed ...   
912  simple effective weight loss tips weight loss tip decide much weight need lose people start weight loss journey claiming xyz kilos overweight isnt healthy way project aim weight loss calculate des...   
913  recovering know depressed suicidal past month combination financial problems social phobia yesterday finally got back money stolen months ago portion alone enough pay bills moms debts repair da

## Baseline

In [71]:
df["is_bad"].mean()

0.6666666666666666

## Finding a Production Model

In [72]:
# DEFINING A FUNCTION THAT WILL RUN MULTIPLE MODELS AND GRIDSEARCH FOR BEST PARAMETERS

def gridsearch_multi(steps_titles, steps_list, pipe_params):
    
    #DEFINING X and y
    X = df["text_clean"]
    y = df['is_bad']
    #TRAIN-TEST SPLIT
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    # DATAFRAME TO DISPLAY RESULTS
    gs_results = pd.DataFrame(columns=['model','AUC Score', 'precision', 'recall (sensitivity)', 
                                       'best_params', 'best score', 'confusion matrix', 
                                       'train_accuracy','test_accuracy','baseline_accuracy',
                                       'specificity', 'f1-score'])

    # FOR LOOP THROUGH STEPS LIST
    for i in range(len(steps_list)):
        # INSTATIATE PIPELINE
        pipe = Pipeline(steps=steps_list[i])
        # INSTANTIATE GRIDSEARCHCV WITH PARAMETER ARGUMENT
        gs = GridSearchCV(pipe, pipe_params[i], cv=3) 
        gs.fit(X_train, y_train)
        
        #GETTING PREDICTIONS FROM MODEL
        pred = gs.predict(X_test)
        
        # DEFINE CONFUSION MATRIX ELEMENTS
        tn, fp, fn, tp = confusion_matrix(y_test, gs.predict(X_test)).ravel() 
        
        #CREATING A DICTIONARY FROM THE CLASSIFICATION REPORT(WE'LL DRAW SOME METRICS FROM HERE)
        classi_dict = (classification_report(y_test,pred, output_dict=True))
        
        #CALCULATING AREA UNDER THE CURVE
        gs.predict_proba(X_test)
        pred_proba = [i[1] for i in gs.predict_proba(X_test)] 
        auc = roc_auc_score(y_test, pred_proba)
        
        #DEFINE DATAFRAME COLUMNS
        model_results = {}
        model_results['model'] = steps_titles[i]
        model_results['AUC Score'] = auc
        model_results['precision']= classi_dict['weighted avg']['precision']
        model_results['recall (sensitivity)']= classi_dict['weighted avg']['recall']
        model_results['best params'] = gs.best_params_
        model_results['best score'] = gs.best_score_
        model_results['confusion matrix']={"TP": tp,"FP":fp, "TN": tn, "FN": fn}
        model_results['train accuracy'] = gs.score(X_train, y_train)
        model_results['test accuracy'] = gs.score(X_test, y_test)
        model_results['baseline accuracy'] = 2/3
        
        model_results['specificity']= tn/(tn+fp)  
        model_results['f1-score']= classi_dict['weighted avg']['f1-score']


        #APPEND RESULTS TO A NICE DATAFRAME
        df_list.append(model_results) 
        pd.set_option("display.max_colwidth", 200)
    return (pd.DataFrame(df_list)).round(2)

In [68]:
#USING THE FUNCTION WITH COUNT VECTORIZOR

# EMPTY LIST THAT WILL HOLD RESULTS
df_list=[]

# LIST OF MODELS
steps_titles = ['cvec+ multi_nb','cvec + ss + knn','cvec + ss + logreg']

# CODE FOR PIPELINE TO INSTATIATE MODELS
steps_list = [ 
    [('cv', CountVectorizer()),('multi_nb', MultinomialNB())],
    [('cv', CountVectorizer()),('scaler', StandardScaler(with_mean=False)),('knn', KNeighborsClassifier())], 
    [('cv', CountVectorizer()),('scaler', StandardScaler(with_mean=False)),('logreg', LogisticRegression())]
]

# LIST OF PARAMETER DICTIONARIES
pipe_params = [
    {'cv__stop_words':['english'], 'cv__ngram_range':[(1,1),(1,2)],'cv__max_features': [20, 30, 50],'cv__min_df': [2, 3],'cv__max_df': [.2, .25, .3]},
    {'cv__stop_words':['english'], 'cv__ngram_range':[(1,1),(1,2)],'cv__max_features': [20, 30, 50],'cv__min_df': [2, 3],'cv__max_df': [.2, .25, .3]},
    {'cv__stop_words':['english'], 'cv__ngram_range':[(1,1),(1,2)],'cv__max_features': [20, 30, 50],'cv__min_df': [2, 3],'cv__max_df': [.2, .25, .3]}
]

#RUNNING THE FUNCTION
gridsearch_multi(steps_titles, steps_list, pipe_params)



Unnamed: 0,AUC Score,baseline accuracy,best params,best score,confusion matrix,f1-score,model,precision,recall (sensitivity),specificity,test accuracy,train accuracy
0,0.8,0.67,"{'cv__max_df': 0.25, 'cv__max_features': 50, 'cv__min_df': 3, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.71,"{'TP': 131, 'FP': 33, 'TN': 45, 'FN': 26}",0.75,cvec+ multi_nb,0.74,0.75,0.58,0.75,0.74
1,0.58,0.67,"{'cv__max_df': 0.3, 'cv__max_features': 30, 'cv__min_df': 2, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.68,"{'TP': 120, 'FP': 52, 'TN': 26, 'FN': 37}",0.61,cvec + ss + knn,0.6,0.62,0.33,0.62,0.78
2,0.78,0.67,"{'cv__max_df': 0.3, 'cv__max_features': 50, 'cv__min_df': 2, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.71,"{'TP': 143, 'FP': 41, 'TN': 37, 'FN': 14}",0.75,cvec + ss + logreg,0.76,0.77,0.47,0.77,0.77


In [73]:
#USING THE FUNCTION WITH TFID VECTORIZOR

# LIST OF MODELS
steps_titles = ['tvec + multi_nb','tvec + ss + knn','tvec + ss + logreg']

# CODE FOR PIPELINE TO INSTATIATE MODELS
steps_list = [ 
    [('tv', TfidfVectorizer()),('multi_nb', MultinomialNB())],
    [('tv', TfidfVectorizer()),('scaler', StandardScaler(with_mean=False)),('knn', KNeighborsClassifier())], 
    [('tv', TfidfVectorizer()),('scaler', StandardScaler(with_mean=False)),('logreg', LogisticRegression())]
]

# LIST OF PARAMETER DICTIONARIES
pipe_params = [
    {'tv__stop_words':['english'], 'tv__ngram_range':[(1,1),(1,2)],'tv__max_features': [20, 30, 50],'tv__min_df': [2, 3],'tv__max_df': [.2, .25, .3]},
    {'tv__stop_words':['english'], 'tv__ngram_range':[(1,1),(1,2)],'tv__max_features': [20, 30, 50],'tv__min_df': [2, 3],'tv__max_df': [.2, .25, .3]},
    {'tv__stop_words':['english'], 'tv__ngram_range':[(1,1),(1,2)],'tv__max_features': [20, 30, 50],'tv__min_df': [2, 3],'tv__max_df': [.2, .25, .3]}
]   

#RUNNING THE FUNCTION
gridsearch_multi(steps_titles, steps_list, pipe_params)

Unnamed: 0,AUC Score,baseline accuracy,best params,best score,confusion matrix,f1-score,model,precision,recall (sensitivity),specificity,test accuracy,train accuracy
0,0.8,0.67,"{'cv__max_df': 0.25, 'cv__max_features': 50, 'cv__min_df': 3, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.71,"{'TP': 131, 'FP': 33, 'TN': 45, 'FN': 26}",0.75,cvec+ multi_nb,0.74,0.75,0.58,0.75,0.74
1,0.58,0.67,"{'cv__max_df': 0.3, 'cv__max_features': 30, 'cv__min_df': 2, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.68,"{'TP': 120, 'FP': 52, 'TN': 26, 'FN': 37}",0.61,cvec + ss + knn,0.6,0.62,0.33,0.62,0.78
2,0.78,0.67,"{'cv__max_df': 0.3, 'cv__max_features': 50, 'cv__min_df': 2, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.71,"{'TP': 143, 'FP': 41, 'TN': 37, 'FN': 14}",0.75,cvec + ss + logreg,0.76,0.77,0.47,0.77,0.77
3,0.82,0.67,"{'tv__max_df': 0.2, 'tv__max_features': 50, 'tv__min_df': 3, 'tv__ngram_range': (1, 2), 'tv__stop_words': 'english'}",0.7,"{'TP': 154, 'FP': 64, 'TN': 14, 'FN': 3}",0.65,tvec + multi_nb,0.75,0.71,0.18,0.71,0.72
4,0.63,0.67,"{'tv__max_df': 0.25, 'tv__max_features': 30, 'tv__min_df': 3, 'tv__ngram_range': (1, 1), 'tv__stop_words': 'english'}",0.68,"{'TP': 120, 'FP': 44, 'TN': 34, 'FN': 37}",0.65,tvec + ss + knn,0.65,0.66,0.44,0.66,0.77
5,0.79,0.67,"{'tv__max_df': 0.25, 'tv__max_features': 50, 'tv__min_df': 2, 'tv__ngram_range': (1, 1), 'tv__stop_words': 'english'}",0.71,"{'TP': 140, 'FP': 42, 'TN': 36, 'FN': 17}",0.73,tvec + ss + logreg,0.74,0.75,0.46,0.75,0.74


In [74]:
#USING THE FUNCTION WITH HASHING VECTORIZOR

# LIST OF MODELS
steps_titles = ['hvec + multi_nb','hvec + ss + knn','hvec + ss + logreg']

# CODE FOR PIPELINE TO INSTATIATE MODELS
steps_list = [ 
    [('hv', HashingVectorizer(alternate_sign=False)),('multi_nb', MultinomialNB())],
    [('hv', HashingVectorizer(alternate_sign=False)),('scaler', StandardScaler(with_mean=False)),('knn', KNeighborsClassifier())], 
    [('hv', HashingVectorizer(alternate_sign=False)),('scaler', StandardScaler(with_mean=False)),('logreg', LogisticRegression())]
]

# LIST OF PARAMETER DICTIONARIES
pipe_params = [
    {'hv__stop_words':['english'], 'hv__ngram_range':[(1,1),(1,2)]},
    {'hv__stop_words':['english'], 'hv__ngram_range':[(1,1),(1,2)]},
    {'hv__stop_words':['english'], 'hv__ngram_range':[(1,1),(1,2)]}
]   

#RUNNING THE FUNCTION
gridsearch_multi(steps_titles, steps_list, pipe_params)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,AUC Score,baseline accuracy,best params,best score,confusion matrix,f1-score,model,precision,recall (sensitivity),specificity,test accuracy,train accuracy
0,0.8,0.67,"{'cv__max_df': 0.25, 'cv__max_features': 50, 'cv__min_df': 3, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.71,"{'TP': 131, 'FP': 33, 'TN': 45, 'FN': 26}",0.75,cvec+ multi_nb,0.74,0.75,0.58,0.75,0.74
1,0.58,0.67,"{'cv__max_df': 0.3, 'cv__max_features': 30, 'cv__min_df': 2, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.68,"{'TP': 120, 'FP': 52, 'TN': 26, 'FN': 37}",0.61,cvec + ss + knn,0.6,0.62,0.33,0.62,0.78
2,0.78,0.67,"{'cv__max_df': 0.3, 'cv__max_features': 50, 'cv__min_df': 2, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}",0.71,"{'TP': 143, 'FP': 41, 'TN': 37, 'FN': 14}",0.75,cvec + ss + logreg,0.76,0.77,0.47,0.77,0.77
3,0.82,0.67,"{'tv__max_df': 0.2, 'tv__max_features': 50, 'tv__min_df': 3, 'tv__ngram_range': (1, 2), 'tv__stop_words': 'english'}",0.7,"{'TP': 154, 'FP': 64, 'TN': 14, 'FN': 3}",0.65,tvec + multi_nb,0.75,0.71,0.18,0.71,0.72
4,0.63,0.67,"{'tv__max_df': 0.25, 'tv__max_features': 30, 'tv__min_df': 3, 'tv__ngram_range': (1, 1), 'tv__stop_words': 'english'}",0.68,"{'TP': 120, 'FP': 44, 'TN': 34, 'FN': 37}",0.65,tvec + ss + knn,0.65,0.66,0.44,0.66,0.77
5,0.79,0.67,"{'tv__max_df': 0.25, 'tv__max_features': 50, 'tv__min_df': 2, 'tv__ngram_range': (1, 1), 'tv__stop_words': 'english'}",0.71,"{'TP': 140, 'FP': 42, 'TN': 36, 'FN': 17}",0.73,tvec + ss + logreg,0.74,0.75,0.46,0.75,0.74
6,0.7,0.67,"{'hv__ngram_range': (1, 1), 'hv__stop_words': 'english'}",0.67,"{'TP': 157, 'FP': 78, 'TN': 0, 'FN': 0}",0.54,hvec + multi_nb,0.45,0.67,0.0,0.67,0.67
7,0.51,0.67,"{'hv__ngram_range': (1, 1), 'hv__stop_words': 'english'}",0.67,"{'TP': 157, 'FP': 78, 'TN': 0, 'FN': 0}",0.54,hvec + ss + knn,0.45,0.67,0.0,0.67,0.68
8,0.88,0.67,"{'hv__ngram_range': (1, 1), 'hv__stop_words': 'english'}",0.77,"{'TP': 151, 'FP': 40, 'TN': 38, 'FN': 6}",0.79,hvec + ss + logreg,0.81,0.8,0.49,0.8,1.0


## Production Model

In [97]:

#CHECKING SCORES OF THE OPTIMISED MODEL USING TEST DATA
#DEFINING X and y
X = df["text_clean"]
y = df['is_bad']
#TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

tvec_optimised = TfidfVectorizer(max_df= 0.5, max_features=70, min_df=2, ngram_range=(1, 3),stop_words = 'english')
X_train_tvec = tvec_optimised.fit_transform(X_train).todense()
X_test_tvec = tvec_optimised.transform(X_test).todense()

#SAVE TO PICKLE
import pickle
pickle.dump(tvec_optimised, open("vectorizer.pickle", "wb"))

#FINDING THE ACCURACY SCORE ON THE TEST DATA

log_reg = LogisticRegression()
log_reg.fit(X_train_tvec, y_train)
accuracy = log_reg.score(X_test_tvec, y_test)

#CALCULATING AREA UNDER THE CURVE

pred_proba = [i[1] for i in log_reg.predict_proba(X_test_tvec)] 
auc = roc_auc_score(y_test, pred_proba)

print("ACCURACY: {}\nAUC SCORE: {}".format(accuracy, auc) )

ACCURACY: 0.8
AUC SCORE: 0.8534215253960478


In [115]:
test = ["I think it's overfitted"]
vectorizer = pickle.load(open("vectorizer.pickle", "rb"))
new_X = vectorizer.transform(test).todense()
result = log_reg.predict(new_X)
print(result)

[1]
