<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Importing-dataset-of-labeled-diaster-messages" data-toc-modified-id="Importing-dataset-of-labeled-diaster-messages-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Importing dataset of labeled diaster messages</a></span></li><li><span><a href="#Processing-Messages" data-toc-modified-id="Processing-Messages-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Processing Messages</a></span></li><li><span><a href="#Modelling-Function" data-toc-modified-id="Modelling-Function-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Modelling Function</a></span></li><li><span><a href="#Modeling-Tests" data-toc-modified-id="Modeling-Tests-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Modeling Tests</a></span></li><li><span><a href="#Final-Model" data-toc-modified-id="Final-Model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Final Model</a></span></li><li><span><a href="#Coefficients" data-toc-modified-id="Coefficients-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Coefficients</a></span></li></ul></div>

# Imports 

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import regex as re
import unidecode
from nltk.corpus import stopwords

#For Modeling
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler


# Importing dataset of labeled diaster messages 

In [2]:
df = pd.read_csv('..//datasets/disaster_response_messages_training.csv', header = 0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
df.head()

Unnamed: 0,id,split,message,original,genre,related,PII,request,offer,aid_related,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,train,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,train,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0
2,12,train,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,14,train,Information about the National Palace-,Informtion au nivaux palais nationl,direct,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,15,train,Storm at sacred heart of jesus,Cyclone Coeur sacr de jesus,direct,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


# Processing Messages

In [10]:
stops = stopwords.words('english')

In [14]:
def message_to_words(raw_message):
    
     # remove accents
    unaccented = unidecode.unidecode(raw_message)
    
    # remove all non-letter characters
    letters_only = re.sub("[^a-zA-Z]", " ", unaccented)
    
    # lowercase 
    words = letters_only.lower().split()
    
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens_lem = [lemmatizer.lemmatize(i) for i in words]
    
    # stop words
    meaningful_words = [w for w in words if not w in stops]
    
    # return as a string 
    return(" ".join(meaningful_words))

In [15]:
total_message = df.shape[0]
clean_message = []

print("Cleaning and parsing the message...")

j = 0
for message in df['message']:
    clean_message.append(message_to_words(message))
    
    # If the index is divisible by 100, print a message
    if (j+1) % 100 == 0:
        print(f'Comment {j+1} of {total_message}.')
    
    j += 1
    
    if j == total_message:
        print('Done.')

Cleaning and parsing the message...
Comment 100 of 21046.
Comment 200 of 21046.
Comment 300 of 21046.
Comment 400 of 21046.
Comment 500 of 21046.
Comment 600 of 21046.
Comment 700 of 21046.
Comment 800 of 21046.
Comment 900 of 21046.
Comment 1000 of 21046.
Comment 1100 of 21046.
Comment 1200 of 21046.
Comment 1300 of 21046.
Comment 1400 of 21046.
Comment 1500 of 21046.
Comment 1600 of 21046.
Comment 1700 of 21046.
Comment 1800 of 21046.
Comment 1900 of 21046.
Comment 2000 of 21046.
Comment 2100 of 21046.
Comment 2200 of 21046.
Comment 2300 of 21046.
Comment 2400 of 21046.
Comment 2500 of 21046.
Comment 2600 of 21046.
Comment 2700 of 21046.
Comment 2800 of 21046.
Comment 2900 of 21046.
Comment 3000 of 21046.
Comment 3100 of 21046.
Comment 3200 of 21046.
Comment 3300 of 21046.
Comment 3400 of 21046.
Comment 3500 of 21046.
Comment 3600 of 21046.
Comment 3700 of 21046.
Comment 3800 of 21046.
Comment 3900 of 21046.
Comment 4000 of 21046.
Comment 4100 of 21046.
Comment 4200 of 21046.
Comment

In [18]:
df = df.assign(cleaned_message = clean_message)
df.head(3)

Unnamed: 0,id,split,message,original,genre,related,PII,request,offer,aid_related,...,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report,cleaned_message
0,2,train,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,weather update cold front cuba could pass haiti
1,7,train,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,0,1,...,0,1,0,1,0,0,0,0,0,hurricane
2,12,train,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,says west side haiti rest country today tonight


# Modelling Function

In [19]:
##Taken from NYC I Example 

def text_to_model(X_column, model, vectorizer, params, verbose = 1):
    
    X = df[X_column]                                    #creates X and y
    y = df['direct_report']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = .25)    #train test split
    
    pipe = Pipeline([                               #pipeline to run with gridsearch and test hyperparameters
        ('vectorizer', vectorizer),                 #for both the model and vectorizer.
        ('model', model)])                          #this will be done for many vectorizer-model combinations
        
    grid = GridSearchCV(pipe, param_grid=params, cv=5, verbose=verbose)  
    
    grid.fit(X_train, y_train)                       #fitting the grid model to X_train and y_train and 
                                                     #running a 5 fold cross validation
    score_dict = {}
    
    score_dict['X'] = X_column                               #this dict will be converted to dataframe to store 
    score_dict['Vectorizer'] = vectorizer                    #the performance of each gridsearch and return the 
    score_dict['Model'] = model                              #best parameters and score to compare to other models
    score_dict['train_score'] = grid.score(X_train, y_train)
    score_dict['test_score'] = grid.score(X_test, y_test)
    score_dict['best_params'] = grid.best_params_
    
    try:
        return pd.DataFrame(score_dict)
    except:
        return score_dict

# Modeling Tests

In [20]:
# Text Selected - Tokenized messsages
# Running a gridsearch on Logistic Regression and Count Vectorizer 

logreg = LogisticRegression()
cv = CountVectorizer()

params = {
          'vectorizer__max_features':[100, 500, 1000, 1500, 5000],
          'vectorizer__ngram_range':[(1,1), (1,2), (1,3)],
          'vectorizer__min_df':[1, 2, 3, 4],
          'vectorizer__max_df':[1.0],
          'model__penalty':['l2'],
          'model__max_iter': [1500]
         }

count_vect_logreg = text_to_model('cleaned_message', model = logreg, vectorizer=cv, params=params)
count_vect_logreg

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  4.2min finished


Unnamed: 0,X,Vectorizer,Model,train_score,test_score,best_params
model__max_iter,cleaned_message,"CountVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.873163,0.86355,1500
model__penalty,cleaned_message,"CountVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.873163,0.86355,l2
vectorizer__max_df,cleaned_message,"CountVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.873163,0.86355,1
vectorizer__max_features,cleaned_message,"CountVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.873163,0.86355,1000
vectorizer__min_df,cleaned_message,"CountVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.873163,0.86355,1
vectorizer__ngram_range,cleaned_message,"CountVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.873163,0.86355,"(1, 3)"


In [21]:
# Text Selected - Tokenized Message
# Gridsearching Hyperparameters for TF-IDF Vectorizer

logreg = LogisticRegression()
tfidf = TfidfVectorizer()

params = {
          'vectorizer__max_features':[1000, 1500, 5000, 50000],
          'vectorizer__ngram_range':[(1,1), (1,2), (1,3)],
          'vectorizer__min_df':[1, 2, 3, 4],
          'vectorizer__max_df':[1.0],
          'model__penalty':['l2'],
          'model__max_iter': [1500]
         }

tfidf_vect_logreg = text_to_model(X_column='cleaned_message', model = logreg, vectorizer=tfidf, params=params)
tfidf_vect_logreg

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  3.6min finished


Unnamed: 0,X,Vectorizer,Model,train_score,test_score,best_params
model__max_iter,cleaned_message,"TfidfVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.887164,0.870391,1500
model__penalty,cleaned_message,"TfidfVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.887164,0.870391,l2
vectorizer__max_df,cleaned_message,"TfidfVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.887164,0.870391,1
vectorizer__max_features,cleaned_message,"TfidfVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.887164,0.870391,50000
vectorizer__min_df,cleaned_message,"TfidfVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.887164,0.870391,3
vectorizer__ngram_range,cleaned_message,"TfidfVectorizer(analyzer='word', binary=False,...","LogisticRegression(C=1.0, class_weight=None, d...",0.887164,0.870391,"(1, 2)"


# Final Model

In [22]:
X = df['cleaned_message']                                   
y = df['direct_report']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = .2) 

In [23]:
#After grid searching I choose to fit and create predictions with my best fit Logistic Regression model
vec = TfidfVectorizer({ 'max_features':[50000],
                       'ngram_range':(1,2),
                       'min_df':[3],
                       'max_df':[1.0]})

# Scale data with vectorizer
X_train_vec = vec.fit_transform(X_train)
X_train_vec = pd.DataFrame(X_train_vec.toarray(), columns=vec.get_feature_names())

# Transform the test set
X_test_vec = vec.transform(X_test)

In [24]:
# Instantiate model
model = LogisticRegression(solver='lbfgs', penalty = 'l2', max_iter = 1500)

# Fit on training data.
model.fit(X_train_vec, y_train)

# Get scores
print('CV score:', cross_val_score(model, X_train_vec, y_train, cv=5).mean())
print('Training Score:', model.score(X_train_vec, y_train))
print('Testing Score:', model.score(X_test_vec, y_test))

CV score: 0.8601802142776014
Training Score: 0.8825136612021858
Testing Score: 0.8669833729216152


In [25]:
pred = model.predict(X_test_vec)

In [26]:
cm = confusion_matrix(y_test, pred)

In [27]:
cm_df = pd.DataFrame(cm, columns=['predneg', 'pred pos'], index=['actual neg', 'actual pos'])
cm_df

Unnamed: 0,predneg,pred pos
actual neg,3335,128
actual pos,432,315


In [28]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

accuracy = (tp + tn) / (tp + fn + fp + tn) * 100
misclassification = (100 - accuracy)
sensitivity = tp / (tp + fn) * 100
specificity = tn / (tn + fp) * 100
precision = tp / (tp + fp) * 100
roc_auc = roc_auc_score(y_test, pred)

print(f'Accuracy: {round(accuracy, 4)}%')
print(f'Misclassification rate: {round(misclassification, 4)}%')
print(f'Sensitivity: {round(sensitivity, 4)}%')
print(f'Specificity: {round(specificity, 4)}%')
print(f'Precision: {round(precision, 4)}%')
print(f'ROC AUC: {round(roc_auc, 4)}')

Accuracy: 86.6983%
Misclassification rate: 13.3017%
Sensitivity: 42.1687%
Specificity: 96.3038%
Precision: 71.1061%
ROC AUC: 0.6924


# Coefficients 

In [29]:
#Create dataframe with coefs and e^coefs for each word

coefs = list(zip(vec.get_feature_names(), model.coef_[0].T))
coefs = pd.DataFrame(coefs, columns = ['word','coef'])
coefs['e^coef'] = np.exp(coefs['coef'])

In [33]:
coefs.sort_values(by='e^coef', ascending=False).head(30)

Unnamed: 0,word,coef,e^coef
15732,need,3.97514,53.257554
24916,us,3.909328,49.865451
8736,food,3.879859,48.417376
10393,help,3.861895,47.555405
10820,hungry,3.852896,47.129349
540,aid,3.464018,31.945077
23387,tents,3.457433,31.735403
23380,tent,3.442254,31.257317
20494,sandy,3.167935,23.758382
25568,water,3.164311,23.672426


In [32]:
coefs.sort_values(by='e^coef').head(20)

Unnamed: 0,word,coef,e^coef
12175,job,-3.166459,0.042153
11438,information,-2.393985,0.091265
20379,said,-2.215541,0.109094
11278,including,-1.900485,0.149496
11694,international,-1.862094,0.155347
16093,notes,-1.743793,0.174856
15844,news,-1.737536,0.175953
9632,government,-1.717573,0.179501
11287,incomplete,-1.700484,0.182595
5128,country,-1.635778,0.194801
