# Multiclass Classification Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

## Read in data

In [4]:
df = pd.read_csv("../Data/all_tweets_clean.csv")

In [8]:
# Remove NA's in disaster column
df.dropna(subset=["disaster"], inplace = True)

In [14]:
# Read in training data
df_train = pd.read_csv("../Data/df3_brian - df3_brian.csv")
df_train.shape

(1485, 4)

## Look at how many values with requesting help

In [9]:
df.groupby("disaster")["requesting_help"].value_counts(normalize = True)

disaster   requesting_help
fire       0.0                0.932271
           1.0                0.067729
floods     0.0                0.680585
           1.0                0.319415
hurricane  0.0                0.783730
           1.0                0.216270
Name: requesting_help, dtype: float64

## Split df into training and new data

In [15]:
#df_train = df[df["requesting_help"].notnull()]
#df_train.shape

In [16]:
df_new = df[df["requesting_help"].isnull()]
df_new.shape

(58863, 4)

## Set up X and Y variables

In [17]:
X = df_train["text"]
y = df_train["requesting_help"]

In [18]:
# check for class sizes
y.value_counts(normalize = True)

0    0.836364
1    0.163636
Name: requesting_help, dtype: float64

In [19]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

In [97]:
# for future if we want to add more stop words
# ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)))


# set up pipeline
pipe = Pipeline([
            ('tvec', TfidfVectorizer(ngram_range= (1,2), stop_words = "english" )),
            ("bag", BaggingClassifier(random_state = 42))

])

# param options
params = {
    "bag__n_estimators": [100], # default is 10
   # "bag__max_features": [ 1, 5, 10 , 30] # default is 1
}

# run gridsearch
gs = GridSearchCV(pipe, params, cv=5, n_jobs= 3)

In [98]:
# run model
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                          

In [99]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('tvec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('bag',
                 BaggingClassifier(base_estimator=None, bootstrap=True,
                                   bootstrap_features=False, max_featur

In [100]:
model = gs.best_estimator_

In [101]:
model.score(X_train, y_train)

1.0

In [102]:
model.score(X_test, y_test)

0.8306451612903226

### Generate predictions on test data set

In [103]:
preds = gs.predict(X_test)

## Review Confusion Matrix

In [104]:
# Generate confusion matrix.
# Documentation here: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# tn, fp  positive = asking for help
# fn, tp  negative = not asking for help
confusion_matrix(y_test, # True values.
                 preds)  # Predicted values.

array([[296,  15],
       [ 48,  13]])

## Make Predictions on New Data

In [105]:
X_new = df_new["text"]

In [106]:
df_new.head()

Unnamed: 0,text,requesting_help,disaster,languages
485,he can t nuke the hurricane from poland,0,hurricane,en
505,y all when i just left to go grab dinner like ...,0,hurricane,en
506,is about to fuck shit up like,0,hurricane,en
507,me after shopping at publix today for food i n...,0,hurricane,en
508,cat thats the projection of the category at la...,0,hurricane,en


In [107]:
df_new.loc[:, "requesting_help"] = gs.predict(X_new).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [108]:
df_new.groupby("disaster")["requesting_help"].value_counts(normalize = True)

disaster   requesting_help
fire       0                  0.987535
           1                  0.012465
floods     0                  0.923684
           1                  0.076316
hurricane  0                  0.975475
           1                  0.024525
Name: requesting_help, dtype: float64

In [109]:
df_new[df_new["requesting_help"] == 1]

Unnamed: 0,text,requesting_help,disaster,languages
522,got all the supplies today but it was crazy ou...,1,hurricane,en
539,me too preparing for and going to pick during ...,1,hurricane,en
562,kudos to for helping south floridian in prepar...,1,hurricane,en
601,everyone at walmart gotta get supplies for the...,1,hurricane,en
781,dear i was able to get supplies at milam s was...,1,hurricane,en
...,...,...,...,...
59998,farmers insurance doesnt like the industry the...,1,fire,en
60026,the massive thomas fire has been devastating f...,1,fire,en
60110,the way for you to support animal rescue in ve...,1,fire,en
60155,the battles on to save the crops that werent d...,1,fire,en
