# Binary Classification Model: Tweets Requesting Help

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

## Read in data

In [28]:
# Load the full cleaned tweet set; later cells read its "text", "disaster" and "requesting_help" columns.
df = pd.read_csv("../Data/all_tweets_clean.csv")

In [29]:
# Keep only rows that carry a disaster label; rebinding (instead of inplace=True)
# lets this cell be re-run safely.
df = df.dropna(subset=["disaster"])

In [30]:
# Read in training data from its own file (it is not sliced out of df);
# later cells use its "text" and "requesting_help" columns.
df_train = pd.read_csv("../Data/df8_SUPER.csv")
# Display (rows, columns) as a quick sanity check on the load.
df_train.shape

(1006, 4)

## Look at the share of tweets requesting help, by disaster

In [31]:
# Exclude flood tweets; only the remaining disaster types are analysed below.
is_flood = df["disaster"] == "floods"
df = df[~is_flood]

In [32]:
# Proportion of each requesting_help value within every disaster type.
(
    df.groupby("disaster")["requesting_help"]
    .value_counts(normalize=True)
)

disaster   requesting_help
fire       0.0                0.932271
           1.0                0.067729
hurricane  0.0                0.783730
           1.0                0.216270
Name: requesting_help, dtype: float64

## Split df into training and new data

In [6]:
# NOTE(review): disabled on purpose - df_train is read from "../Data/df8_SUPER.csv" in an earlier cell.
# Kept for reference: this alternative takes the labelled rows of df as the training data.
#df_train = df[df["requesting_help"].notnull()]
#df_train.shape

In [7]:
# Rows without a requesting_help label are the "new" tweets the trained model will score.
# .copy() makes df_new an independent frame: predictions are written into it later, and
# writing into a bare slice of df raises SettingWithCopyWarning.
df_new = df[df["requesting_help"].isnull()].copy()
# Display (rows, columns) of the unlabeled set.
df_new.shape

(58863, 4)

In [9]:
# NOTE(review): the row count shown here (58483) differs from the previous cell (58863) and
# execution count In [8] is missing - presumably a since-deleted cell filtered df_new.
# TODO: restore that step so Restart & Run All reproduces this shape.
df_new.shape

(58483, 4)

## Set up X and Y variables

In [10]:
# Features: the raw tweet text. Target: the requesting_help label.
X, y = df_train["text"], df_train["requesting_help"]

In [11]:
# Check class sizes as proportions; the majority-class share is the accuracy
# a trivial "always predict the majority" model would reach.
y.value_counts(normalize = True)

0    0.77336
1    0.22664
Name: requesting_help, dtype: float64

In [12]:
# Hold out a test set (default split size). Stratifying on y keeps the class ratio
# the same in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [13]:
# Future option - extend the stop-word list (needs `from sklearn.feature_extraction import text`):
# ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)))

# Pipeline: TF-IDF over unigrams and bigrams with English stop words removed,
# followed by a bagging ensemble (default base estimator, fixed seed).
pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer(ngram_range=(1, 2), stop_words="english")),
    ("bag", BaggingClassifier(random_state=42)),
])

# Hyper-parameter grid; a single candidate for now.
params = {
    "bag__n_estimators": [100],  # library default is 10
    # "bag__max_features": [1, 5, 10, 30]  # library default is 1.0, i.e. all features
}

# 5-fold cross-validated grid search over the pipeline, using 3 parallel workers.
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=5, n_jobs=3)

In [14]:
# Run the grid search on the training split only; the test split stays unseen
# until the scoring cells below.
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                          

In [15]:
# Display the winning pipeline and its full parameter set.
gs.best_estimator_

Pipeline(memory=None,
         steps=[('tvec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('bag',
                 BaggingClassifier(base_estimator=None, bootstrap=True,
                                   bootstrap_features=False, max_featur

In [16]:
# Keep a direct handle on the best pipeline found by the grid search.
model = gs.best_estimator_

In [17]:
# Score on the training split. A perfect 1.0 here against ~0.89 on the test split
# indicates the ensemble is overfitting the training tweets.
model.score(X_train, y_train)

1.0

In [18]:
# Score on the held-out test split. Compare against the ~0.77 majority-class share
# shown in the class-size check, not against 0.
model.score(X_test, y_test)

0.8888888888888888

### Generate predictions on test data set

In [19]:
# Predicted labels for the test split, used by the confusion matrix below.
# NOTE(review): this goes through gs while the scoring cells use model (gs.best_estimator_);
# presumably equivalent under the default refit - confirm.
preds = gs.predict(X_test)

## Review Confusion Matrix

In [20]:
# Confusion matrix on the test split: rows are the true class, columns the predicted class.
# With labels ordered 0 (not asking for help) then 1 (asking for help), the layout is
#   [[tn, fp],
#    [fn, tp]]
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
confusion_matrix(y_true=y_test, y_pred=preds)

array([[188,   7],
       [ 21,  36]])

## Make Predictions on New Data

In [21]:
# Text of the unlabeled tweets - the same feature column the model was trained on.
X_new = df_new["text"]

In [22]:
# Peek at the unlabeled rows before scoring; requesting_help is still empty here.
df_new.head()

Unnamed: 0,text,requesting_help,disaster,languages
485,he can t nuke the hurricane from poland,,hurricane,en
505,y all when i just left to go grab dinner like ...,,hurricane,en
506,is about to fuck shit up like,,hurricane,en
507,me after shopping at publix today for food i n...,,hurricane,en
508,cat thats the projection of the category at la...,,hurricane,en


In [23]:
# Attach the model's predicted label (as int) to every unlabeled tweet.
# assign() returns a new frame instead of writing through .loc into a slice of df, which
# avoids SettingWithCopyWarning and replaces the all-NaN float column outright, so the
# result is int regardless of how the pandas version handles in-place .loc[:, col] writes.
df_new = df_new.assign(requesting_help=gs.predict(X_new).astype(int))

In [25]:
# Share of predicted requesting_help values within each disaster type,
# for comparison with the labelled proportions computed earlier.
(
    df_new.groupby("disaster")["requesting_help"]
    .value_counts(normalize=True)
)

disaster   requesting_help
fire       0                  0.917659
           1                  0.082341
hurricane  0                  0.832686
           1                  0.167314
Name: requesting_help, dtype: float64

In [26]:
# Show the tweets the model flagged as requesting help, for manual inspection.
flagged = df_new["requesting_help"] == 1
df_new[flagged]

Unnamed: 0,text,requesting_help,disaster,languages
522,got all the supplies today but it was crazy ou...,1,hurricane,en
539,me too preparing for and going to pick during ...,1,hurricane,en
544,i m live guys may be the last one for a while ...,1,hurricane,en
570,as another hurricane approaches the will join ...,1,hurricane,en
573,so passed puerto rico and is headed for s prec...,1,hurricane,en
...,...,...,...,...
60228,mt gov jerry brown requested a major disaster ...,1,fire,en
60253,more artwork to benefit relief efforts in my h...,1,fire,en
60265,i wrote to president trump today urging the ad...,1,fire,en
60307,weve got last minute foodie gift ideas that al...,1,fire,en
