# ML Pipeline Preparation

### Importing libraries and loading data from database
1. Importing Python libraries
2. Loading dataset from database with read_sql_table
3. Defining feature and target variables X and Y

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os
import pickle
from sqlalchemy import create_engine
import re
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, fbeta_score, classification_report
from scipy.stats import hmean
from scipy.stats.mstats import gmean

# Download the NLTK resources requested by name below (tokenizer models,
# the WordNet corpus and the perceptron part-of-speech tagger).
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/havishamadhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/havishamadhu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/havishamadhu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# load data from database
cwd = os.getcwd()
# The database is expected in the `data` directory that sits next to the
# `models` directory this notebook is run from. Build the location with
# os.path instead of string replacement, so that a `/models` fragment elsewhere
# in the path is never rewritten and a separator is always present.
if os.path.basename(cwd) == 'models':
    dbwd = os.path.join(os.path.dirname(cwd), 'data', '')
else:
    # Not running from `models`: fall back to the current working directory.
    dbwd = os.path.join(cwd, '')
db_path = dbwd + 'DisasterResponse.db'
# SQLite silently creates an empty database when the file is missing, which
# would only surface later as a confusing "table not found" error.
if not os.path.isfile(db_path):
    raise FileNotFoundError('Database file not found: {}'.format(db_path))
engine = create_engine('sqlite:///' + db_path)
df = pd.read_sql_table('df', engine)
# Feature: the raw message text. Targets: every column from position 4 onward.
X = df['message']
Y = df.iloc[:, 4:]

### Tokenization function to process the text data

In [3]:
def tokenize(text):
    """Normalize a message into a list of lower-case, lemmatized tokens.

    Every URL found in ``text`` is replaced by the literal token
    ``urlplaceholder``; the text is then split with ``word_tokenize`` and each
    token is lemmatized, lower-cased and stripped of surrounding whitespace.

    Args:
        text (str): Raw message text.

    Returns:
        list of str: The cleaned tokens, in their original order.
    """
    # Raw string: the pattern contains "\(" and "\)", which are invalid escape
    # sequences in a normal string literal and trigger a warning on modern
    # Python. The compiled pattern is unchanged.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    for url in re.findall(url_regex, text):
        text = text.replace(url, "urlplaceholder")

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]

### Building a Machine Learning Pipeline

In [4]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer that flags messages containing a sentence that starts with a verb.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    produces a single boolean column with one row per input message.
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` starts with a verb or a retweet marker.

        Args:
            text (str): A single message.

        Returns:
            bool: True when the first token of some sentence is tagged ``VB``
            or ``VBP``, or is the retweet marker "RT" (in any letter case);
            False otherwise, including when no sentence yields any token.
        """
        for sentence in nltk.sent_tokenize(text):
            tokens = tokenize(sentence)
            # A sentence can tokenize to nothing; indexing the first tag of an
            # empty list would raise IndexError, so skip such sentences.
            if not tokens:
                continue
            first_word, first_tag = nltk.pos_tag(tokens)[0]
            # tokenize() lower-cases every token, so the marker has to be
            # compared case-insensitively or this branch could never match.
            if first_tag in ('VB', 'VBP') or first_word.lower() == 'rt':
                return True
        return False

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the transformer can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every message in ``X``.

        Args:
            X: Iterable of message strings.

        Returns:
            pandas.DataFrame: One boolean column, one row per message.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [5]:
# This machine learning pipeline takes the message column as input and outputs
# classification results for the other 36 categories in the dataset.

def model_pipeline():
    """Assemble the unfitted message-classification pipeline.

    The feature stage is a ``FeatureUnion`` of (a) token counts produced with
    the custom ``tokenize`` function and re-weighted by TF-IDF and (b) the
    ``StartingVerbExtractor`` flag. The final stage wraps an
    ``AdaBoostClassifier`` in a ``MultiOutputClassifier``.

    Returns:
        sklearn.pipeline.Pipeline: The assembled, unfitted pipeline.
    """
    text_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    features = FeatureUnion([
        ('text_pipeline', text_pipeline),
        ('starting_verb', StartingVerbExtractor()),
    ])

    classifier = MultiOutputClassifier(AdaBoostClassifier())

    return Pipeline([
        ('features', features),
        ('clf', classifier),
    ])

### Splitting data into training and test sets and training the pipeline

In [6]:
# A fixed seed makes the train/test split, and therefore every metric reported
# below, reproducible across "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)

model = model_pipeline()
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('text_pipeline',
                                                 Pipeline(memory=None,
                                                          steps=[('vect',
                                                                  CountVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.int64'>,
                                                                                  encoding='utf-8',
                                                                                  input='content',
                                                                                  low

### Testing the Model

In [7]:
# Predict the target columns for the held-out test messages
y_pred = model.predict(X_test)

In [9]:

# Share of correct predictions in each target column ...
per_category_accuracy = (y_pred == y_test).mean()
# ... averaged over all target columns.
overall_accuracy = per_category_accuracy.mean()

print('Average overall accuracy {0:.2f}% \n'.format(overall_accuracy*100))


Average overall accuracy 95.25% 



In [10]:
# Give the predictions the same column labels as the true targets so both can
# be looked up by category name.
y_pred_pd = pd.DataFrame(y_pred, columns = y_test.columns)
separator = '***********************************************************\n'
# Print one precision/recall/F1 report per target column.
for column, y_true_column in y_test.items():
    print(separator)
    print('Column Name: {}\n'.format(column))
    print(classification_report(y_true_column, y_pred_pd[column]))

***********************************************************

Column Name: request

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      5454
           1       0.77      0.54      0.64      1100

    accuracy                           0.90      6554
   macro avg       0.84      0.76      0.79      6554
weighted avg       0.89      0.90      0.89      6554

***********************************************************

Column Name: offer

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6529
           1       0.00      0.00      0.00        25

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      6554

***********************************************************

Column Name: aid_related

              precision    recall  f1-score   support

           0       0.74      0.87      0

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6495
           1       0.09      0.03      0.05        59

    accuracy                           0.99      6554
   macro avg       0.54      0.52      0.52      6554
weighted avg       0.98      0.99      0.99      6554

***********************************************************

Column Name: other_infrastructure

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      6272
           1       0.40      0.09      0.14       282

    accuracy                           0.95      6554
   macro avg       0.68      0.54      0.56      6554
weighted avg       0.94      0.95      0.94      6554

***********************************************************

Column Name: weather_related

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      4767
           1       0.85      0.66      0.74      1

### Exporting the model as a pickle file

In [12]:
# save the model to disk
filename = 'classifier.pickle'
# The context manager guarantees the file is flushed and closed, even if
# pickling raises part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
