# Multiclass Classification Model

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

## Read in data

In [2]:
df = pd.read_csv("../Data/all_tweets_clean.csv")

In [3]:
df.shape

(60352, 4)

In [7]:
df.isnull().sum()

text                   0
requesting_help    58863
disaster               0
languages              0
dtype: int64

In [5]:
# Remove NA's in disaster column
df.dropna(subset=["disaster"], inplace = True)

## Look at how many values with requesting help

In [6]:
df.groupby("disaster")["requesting_help"].value_counts(normalize = True)

disaster   requesting_help
fire       0.0                0.932271
           1.0                0.067729
floods     0.0                0.680585
           1.0                0.319415
hurricane  0.0                0.783730
           1.0                0.216270
Name: requesting_help, dtype: float64

## Split df into training and new data

In [8]:
df_train = df[df["requesting_help"].notnull()]
df_train.shape

(1485, 4)

In [9]:
df_new = df[df["requesting_help"].isnull()]
df_new.shape

(58863, 4)

## Set up X and Y variables

In [10]:
X = df_train["text"]
y = df_train["requesting_help"]

In [11]:
# check for class sizes
y.value_counts(normalize = True)

0.0    0.800673
1.0    0.199327
Name: requesting_help, dtype: float64

In [12]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

In [13]:
# for future if we want to add more stop words
# ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)))


# set up pipeline
pipe = Pipeline([
            ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = "english" )),
            ("bag", BaggingClassifier(random_state = 42))

])

# param options
params = {}

# run gridsearch
gs = GridSearchCV(pipe, params, cv=5, n_jobs= 3)

In [14]:
X_train.shape

(1113,)

In [15]:
y_train.shape

(1113,)

In [16]:
# run model
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                          

In [17]:
gs.score(X_train, y_train)

0.9712488769092543

In [18]:
gs.score(X_test, y_test)

0.782258064516129

### Generate predictions on test data set

In [27]:
preds = gs.predict(X_test)

# Minimize false negatives

In [28]:
# Generate confusion matrix.
# Documentation here: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# tn, fp  positive = asking for help
# fn, tp  negative = not asking for help
confusion_matrix(y_test, # True values.
                 preds)  # Predicted values.

array([[269,  29],
       [ 52,  22]])

## Make Predictions on New Data

In [19]:
X_new = df_new["text"]

In [20]:
df_new.head()

Unnamed: 0,text,requesting_help,disaster,languages
485,he can t nuke the hurricane from poland,,hurricane,en
505,y all when i just left to go grab dinner like ...,,hurricane,en
506,is about to fuck shit up like,,hurricane,en
507,me after shopping at publix today for food i n...,,hurricane,en
508,cat thats the projection of the category at la...,,hurricane,en


In [21]:
df_new.loc[:, "requesting_help"] = gs.predict(X_new).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [22]:
df_new.groupby("disaster")["requesting_help"].value_counts(normalize = True)

disaster   requesting_help
fire       0                  0.976731
           1                  0.023269
floods     0                  0.836842
           1                  0.163158
hurricane  0                  0.958754
           1                  0.041246
Name: requesting_help, dtype: float64

In [23]:
df_new[df_new["requesting_help"] == 1]

Unnamed: 0,text,requesting_help,disaster,languages
522,got all the supplies today but it was crazy ou...,1,hurricane,en
531,water water everywhere can only get max answer to,1,hurricane,en
539,me too preparing for and going to pick during ...,1,hurricane,en
586,the speedway gas station on rouse is price gau...,1,hurricane,en
589,items are flying off the shelves here at walma...,1,hurricane,en
...,...,...,...,...
59800,carpinteria post fire meeting scheduled for fr...,1,fire,en
59896,residents still seeking services are encourage...,1,fire,en
59945,these adorable animals were just flown into th...,1,fire,en
59949,heres a real time dashboard available that mon...,1,fire,en
