# Project 5 - Leveraging Social Media to Map Natural Disasters
## Modeling and Evaluation

In [1]:
# imports
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup             
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import regex as re
import matplotlib.pyplot as plt
import seaborn as sns

## Read in Data

In [2]:
df = pd.read_csv('./datasets/clean_df.csv')

In [3]:
df

Unnamed: 0,index,is_retweet,likes,replies,retweets,text,tweet_url,type,target,latitude,longitude
0,0,0.0,1,0.0,0.0,praying for yall in texas! #hurricaneharvey,/RedSoxNation52/status/901232720713469952,hurricane,0,29.622699,-95.243400
1,1,0.0,1,0.0,3.0,fyi #hurricaneharvey,/nataliereyy/status/901232720088637446,hurricane,0,30.067612,-95.696869
2,2,0.0,1,0.0,0.0,my prayers goes to everyone in texas being aff...,/sothiachhoeum2/status/901232719312670720,hurricane,0,29.898720,-95.837471
3,3,0.0,11,2.0,1.0,#hurricaneharvey is coming we are bunkering do...,/tayslade/status/901232707455389696,hurricane,0,29.795760,-95.285158
4,4,0.0,0,0.0,0.0,jim cantore is wearing a modded out baseball h...,/RebeccaBennitt/status/901232706247417856,hurricane,0,29.453960,-94.826653
5,5,0.0,2,0.0,0.0,nothing you can do mr potus will keep #fakenew...,/jmattbarber/status/901232704800276480,hurricane,0,29.453942,-95.338017
6,7,0.0,0,0.0,0.0,praying for se texas #hurricaneharvey means bu...,/dcollison/status/901232692620034048,hurricane,0,29.378336,-94.969912
7,8,0.0,0,0.0,0.0,and yet #hurricaneharvey is not #katrina #trum...,/forgedbytrials_/status/901232690921385984,hurricane,0,30.002333,-95.767803
8,9,0.0,4,0.0,1.0,category #hurricaneharvey,/ASimendinger/status/901232687939280897,hurricane,0,29.797657,-95.655378
9,10,0.0,1,0.0,1.0,well thats a pretty blunt warning #hurricaneha...,/Luciani680NEWS/status/901232682734157824,hurricane,0,29.880248,-94.934300


In [4]:
df.isnull().sum()

index         0
is_retweet    0
likes         0
replies       0
retweets      0
text          0
tweet_url     0
type          0
target        0
latitude      0
longitude     0
dtype: int64

# Model

In [12]:
cust_list = ['flood', 'floods', 'flooding', 'hurricaneharvey', 'tornado', 'noreaster', 'mudslide', 'montecito', 'mudslides',
             'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 
              "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 
              'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 
              'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
              'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 
              'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 
              'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
              'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 
              'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 
              'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 
              'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 
              'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 
              'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
              'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
              'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
              'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
              'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
              "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', 
              "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [13]:
# making X, y and train/test split
X = df["text"]
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

The modeling process is going to include using GridSearchCV and Pipelines to search through many parameters of different modeling techniques at once. In order to make this process flow better, we are going to make a custom function that will run the gridsearch on the established parameters for each model.

In [41]:
def pipe_searcher(pipe, params):
    gs = GridSearchCV(estimator=pipe, param_grid=params, cv=2, verbose=1, n_jobs=-1, scoring="recall")
    gs.fit(X_train, y_train)
    print(f'CrossVal Score: {gs.best_score_}')
    print(f'Training Score: {gs.score(X_train, y_train)}')
    print(f'Testing Score: {gs.score(X_test, y_test)}')
    print(gs.best_params_)
    return gs

In [15]:
# Instantiate a pipe for logistic regression and CountVectorizer
lr_pipe = Pipeline([('cvec', CountVectorizer(stop_words=cust_list)), ('lr', LogisticRegression())])
lr_params = {
    'cvec__max_features': [10_000, 20_000, 40_000, None],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_df': [0.5, 0.7, 0.8, 1.0],
    'lr__C' : [10, 20, 40],
    'lr__penalty': ['l1', 'l2']
}

In [16]:
lr_model = pipe_searcher(lr_pipe, lr_params)

Fitting 2 folds for each of 192 candidates, totalling 384 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:  2.2min finished


CrossVal Score: 0.9902017963373382
Training Score: 0.9999416773591508
Testing Score: 0.9891532540237928
{'cvec__max_df': 0.8, 'cvec__max_features': None, 'cvec__ngram_range': (1, 2), 'lr__C': 10, 'lr__penalty': 'l1'}


In [31]:
# Instantiate a pipe for CountVectorizer and Decision Tree
dt_pipe = Pipeline([('cvec', CountVectorizer()), ('dt', DecisionTreeClassifier(random_state=42))])
dt_params = {
    'cvec__max_features': [25_000, 40_000, 35_000],
    'cvec__ngram_range': [(1,1), (1,2), (2, 2)],
    'cvec__stop_words' : [cust_list],
    'dt__min_samples_leaf' : [2, 10, 25, 50],
    'dt__max_depth': [50, 100, 200]
}

In [87]:
dt_model = pipe_searcher(pipe=dt_pipe, params=dt_params)

Fitting 2 folds for each of 216 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  3.6min finished


CrossVal Score: 0.987162280445819
Training Score: 0.9931143140573029
Testing Score: 0.9842464554524768
{'cvec__max_features': 25000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': None, 'dt__max_depth': 50, 'dt__min_samples_leaf': 2}


In [42]:
# Instantiate a pipe for CountVectorizer and random forest model
rf_pipe = Pipeline([('cvec', CountVectorizer()), ('rf', RandomForestClassifier(random_state=42))])
rf_params = {
    'cvec__max_features': [25_000, 40_000, 35_000],
    'cvec__ngram_range': [(1,1), (1,2), (2, 2)],
#     'cvec__stop_words' : [cust_list],
    'rf__n_estimators' : [20, 30, 50],
    'rf__min_samples_leaf': [10, 50, 75],
}

In [43]:
rf_model = pipe_searcher(pipe=rf_pipe, params=rf_params)

Fitting 2 folds for each of 81 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:  1.1min finished


CrossVal Score: 0.07934557828924653
Training Score: 0.14839491217443973
Testing Score: 0.1088929219600726
{'cvec__max_features': 35000, 'cvec__ngram_range': (1, 2), 'rf__min_samples_leaf': 10, 'rf__n_estimators': 50}


In [63]:
# Instantiate a pipe for CountVectorizer and adaboost model
ada_pipe = Pipeline([('cvec', CountVectorizer()), ('ada', AdaBoostClassifier(random_state=42))])
ada_params = {
    'cvec__max_features': [32_000, 33_000, 34_000, None],
    'cvec__ngram_range': [(1,1), (1,2), (2, 2)],
#     'cvec__stop_words' : [None, 'english'],
    'ada__n_estimators' : [100, 150, 200],
}

In [64]:
ada_model = pipe_searcher(pipe=ada_pipe, params=ada_params)

Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.2min finished


CrossVal Score: 0.918831712559597
Training Score: 0.9691096305269533
Testing Score: 0.9038112522686026
{'ada__n_estimators': 200, 'cvec__max_features': None, 'cvec__ngram_range': (1, 2)}


### Final Model

In order to have access to some of the model attributes, we are going to create a new standalone version of our best model parameters

In [56]:
# pulling up best parameters
ada_model.best_params_

{'ada__n_estimators': 150,
 'cvec__max_features': 33000,
 'cvec__ngram_range': (1, 2)}

In [57]:
# fitting best cvec params
cvec = CountVectorizer(stop_words=cust_list, max_features=33_000, ngram_range=(1, 2))
X_train_vec = cvec.fit_transform(X_train)
X_test_vec = cvec.transform(X_test)

# fitting log reg
final_ada_model = AdaBoostClassifier(n_estimators=150)
final_ada_model.fit(X_train_vec, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=150, random_state=None)

## Model Analysis

Due to the nature of this project, we want to minimize cases where someone is tweeting that they are in need of help, but this fact is not recognized by the model. The metric we can use for this evaluation is recall, or sensitivity. 

### Confusion Matrix

In [58]:
# making predictions from the logreg model
preds = final_ada_model.predict(X_test_vec)

# creating confusion matrix and setting into df
cf = pd.DataFrame(confusion_matrix(y_test, preds),
             columns=["Predicted Negatives", "Predicted Positives"],
             index=["Actual Negatives", "Actual Positives"])

In [59]:
y_test.value_counts()

0    5165
1     551
Name: target, dtype: int64

In [60]:
# making predictions from the logreg model
train_preds = final_ada_model.predict(X_train_vec)

# creating confusion matrix and setting into df
train_cf = pd.DataFrame(confusion_matrix(y_train, train_preds),
             columns=["Predicted Negatives", "Predicted Positives"],
             index=["Actual Negatives", "Actual Positives"])

In [61]:
# checking df for results
cf

Unnamed: 0,Predicted Negatives,Predicted Positives
Actual Negatives,5161,4
Actual Positives,58,493


In [62]:
train_cf

Unnamed: 0,Predicted Negatives,Predicted Positives
Actual Negatives,15480,15
Actual Positives,94,1557


### Coefficients

In [29]:
# making a dataframe of the vectorized words
vec_df = pd.DataFrame(X_train_vec.toarray(),
                                columns=cvec.get_feature_names())

In [31]:
# making a dataframe of the coefficients, attached to the words
coef_df = pd.DataFrame(final_lr_model.coef_.T, columns = ['coef'])
coef_df.index = vec_df.columns
coef_df = coef_df.sort_values(by = 'coef',ascending = False)
coef_df.head(5)

Unnamed: 0,coef
rescue,13.199816
lost,12.746
fire,12.676695
fires,12.425871
death,12.202326
