# Multiclass Classification Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

## Read in data

In [9]:
df = pd.read_csv("../Data/all_tweets_clean.csv")

In [10]:
df.shape

(60352, 4)

In [11]:
df.isnull().sum()

text                   0
requesting_help    58867
disaster               4
languages              0
dtype: int64

In [12]:
# Remove NA's in disaster column
df.dropna(subset=["disaster"], inplace = True)

In [13]:
df.reset_index(drop=True,inplace=True)

In [15]:
df[df['requesting_help'].notnull()]

Unnamed: 0,text,requesting_help,disaster,languages
0,offically tropical storm dorian where is it go...,0.0,hurricane,en
1,tropical storm dorian projected path spaghetti...,0.0,hurricane,en
2,to become a this week a system located hundred...,0.0,hurricane,en
3,update tropical storm dorian strengthens expec...,0.0,hurricane,en
4,live tonight pm est d railed episode pga tour...,0.0,hurricane,en
...,...,...,...,...
31463,live assistance center for kincade victims opens,1.0,fire,en
31464,residents are lining up at the local assistanc...,1.0,fire,en
31465,our feature this week is castle rock fire depa...,0.0,fire,en
31466,pg e transparency a federal judge is ordering ...,0.0,fire,en


## Look at how many values with requesting help

In [16]:
df.groupby("disaster")["requesting_help"].value_counts()

disaster   requesting_help
fire       0.0                468
           1.0                 34
floods     0.0                326
           1.0                153
hurricane  0.0                395
           1.0                109
Name: requesting_help, dtype: int64

In [17]:
df = pd.get_dummies(df, columns=['disaster'],drop_first=True)

In [12]:
test_df = df[df['requesting_help'].isnull()==True]

In [13]:
train_df = df[df['requesting_help'].isnull()==False]

In [21]:
target = train_df['requesting_help']

In [21]:
target = df['requesting_help']

In [18]:
cvec = CountVectorizer(stop_words='english', min_df=5, max_df=1.0)

In [19]:
term_mat = cvec.fit_transform(df['text'])

In [19]:
term_mat = cvec.fit_transform(train_df['text'])

In [20]:
term_df = pd.DataFrame(term_mat.toarray(), columns=cvec.get_feature_names())

In [22]:
term_df.insert(0, 'requesting_help', target.values)

## Split df into training and new data

In [23]:
X = term_df.drop('requesting_help',axis=1)
y = term_df['requesting_help']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [28]:
X_train = term_df[term_df['requesting_help'].notnull()].drop('requesting_help', axis=1)

In [32]:
X_test = term_df[term_df['requesting_help'].isnull()].drop('requesting_help', axis=1)

In [33]:
y_train = term_df[term_df['requesting_help'].notnull()]['requesting_help']

In [34]:
y_test = term_df[term_df['requesting_help'].isnull()]['requesting_help']

In [35]:
# Instantiate logistic regression model.
lr = LogisticRegression(solver='liblinear')

In [36]:
# Fit model to training data.
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
# Evaluate model on training data.
lr.score(X_train, y_train)

0.9771043771043771

In [33]:
lr.score(X_test, y_test)

0.8010752688172043

In [38]:
# predicting y_test
preds = lr.predict(X_test)

In [41]:
prac = pd.DataFrame({'predictions': preds})

In [43]:
prac['predictions'].value_counts()

0.0    57075
1.0     1788
Name: predictions, dtype: int64

In [44]:
prac.shape

(58863, 1)

In [45]:
df.shape

(60348, 5)

In [36]:
y_test

192     0.0
468     0.0
327     0.0
679     1.0
1152    0.0
       ... 
124     0.0
81      1.0
537     0.0
357     0.0
1277    0.0
Name: requesting_help, Length: 372, dtype: float64

In [None]:
preds

In [37]:
# Generate confusion matrix.
# Documentation here: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

# tn, fp  positive = asking for help
# fn, tp  negative = not asking for help

confusion_matrix(y_test, # True values.
                 preds)  # Predicted values.

array([[274,  24],
       [ 50,  24]])

In [40]:
test_df['requesting_help'] = lr.predict(test_df.drop('requesting_help',axis=1))

ValueError: could not convert string to float: 'he can t nuke the hurricane from poland'

## Set up X and Y variables

In [None]:
X = df_train["text"]
y = df_train["requesting_help"]

In [None]:
# check for class sizes
y.value_counts(normalize = True)

In [None]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

In [None]:
# for future if we want to add more stop words
# ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)))


# set up pipeline
pipe = Pipeline([
            ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = "english" )),
            ("svc", SVC(gamma = "scale"))

])

# param options
params = {}

# run gridsearch
gs = GridSearchCV(pipe, params, cv=5, n_jobs= 3)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
# run model
gs.fit(X_train, y_train)

In [None]:
gs.score(X_train, y_train)

In [None]:
gs.score(X_test, y_test)

## Make Predictions

In [None]:
X_new = df_new["text"]

In [None]:
df_new.head()

In [None]:
df_new.loc[:, "requesting_help"] = gs.predict(X_new).astype(int)

In [None]:
df_new.groupby("disaster")["requesting_help"].value_counts(normalize = True)

In [None]:
df_new[df_new["requesting_help"] == 1]