# ML Pipeline Preparation

## 1. Import libraries


In [1]:
# import libraries
import os
import re
import pickle
import pandas as pd
from sqlalchemy import create_engine

import nltk
nltk.download(['punkt', 'wordnet', 'stopwords','averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier

import warnings
# Suppress every warning raised from here on.
# NOTE(review): this blanket filter also hides warnings that may matter
# (e.g. library deprecations) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Lift pandas' display limits so that all columns and rows are shown
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## 2. Load Data

In [2]:
def load_data(database_filepath):
    """
        Reads the DisasterMessages table and splits it into features and targets

        args:
            database_filepath: path to the SQLite database file

        returns:
            X: the `message` column
            Y: the category columns (every column except id, message,
               original and genre)
            category_names: list with the column names of Y
    """
    # Connect to the SQLite database and pull the whole table into memory
    db_engine = create_engine('sqlite:///'+ database_filepath)
    messages_df = pd.read_sql_table('DisasterMessages', db_engine)

    # Everything that is not an identifier/text/genre column is a category label
    non_category_columns = ['id', 'message', 'original','genre']
    category_names = messages_df.drop(non_category_columns, axis=1).columns.tolist()

    return messages_df.message, messages_df[category_names], category_names

In [3]:
# Define database file path
# NOTE(review): absolute, environment-specific path — adjust when running elsewhere
database_filepath = '/content/sample_data/data/DisasterResponse.db'

# Load the messages (X), their category labels (Y) and the category names
X, Y, category_names = load_data(database_filepath)

# NOTE: the labels printed below say "training"/"test", but X and Y are the
# full, unsplit features and targets; the train/test split is done in train().
print("Shape of training data: {}\nShape of test data: {}\nLength of category names: {}".format(X.shape, 
                                                                                                Y.shape, len(category_names)))

Shape of training data: (26216,)
Shape of test data: (26216, 36)
Length of category names: 36


In [4]:
# Display the first 5 messages (the model input)
X.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [5]:
# Display the category names (the target columns of Y)
category_names

['related',
 'request',
 'offer',
 'aid_related',
 'medical_help',
 'medical_products',
 'search_and_rescue',
 'security',
 'military',
 'child_alone',
 'water',
 'food',
 'shelter',
 'clothing',
 'money',
 'missing_people',
 'refugees',
 'death',
 'other_aid',
 'infrastructure_related',
 'transport',
 'buildings',
 'electricity',
 'tools',
 'hospitals',
 'shops',
 'aid_centers',
 'other_infrastructure',
 'weather_related',
 'floods',
 'storm',
 'fire',
 'earthquake',
 'cold',
 'other_weather',
 'direct_report']

## 3. Tokenization function to process text data

In [6]:
# def tokenize(text):
#     # Remove punctuation characters and convert text to lowercase
#     text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
#     # Return a tokenized copy of the text
#     tokens = word_tokenize(text)
    
#     # Initialize the WordNet Lemmatizer
#     lemmatizer = WordNetLemmatizer()
    
#     # lemmatize and remove stop words
#     lemmatized_tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stopwords.words("english")]
    
#     return lemmatized_tokens


# Matches http/https URLs. Kept as ONE raw string: splitting the literal with a
# backslash line continuation would embed the next line's indentation in the
# pattern and silently change what it matches.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def tokenize(text):
    """
        Normalizes a message into a list of lower-cased, lemmatized tokens

        Every URL is first replaced by the literal token "urlplaceholder"; the
        text is then split with word_tokenize and each token is lemmatized,
        lower-cased and stripped of surrounding whitespace. Punctuation tokens
        and stop words are not removed.

        args:
            text: the raw message string

        returns:
            list of cleaned tokens
    """
    # Substitute each match where it occurs. Replacing the matched strings one
    # by one with str.replace would corrupt a URL that has an earlier match as
    # its prefix (e.g. "http://a.com" followed later by "http://a.com/b").
    text = _URL_PATTERN.sub("urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(tok).lower().strip() for tok in word_tokenize(text)]

In [7]:
# Sanity check: show one raw message next to its tokenized form
print("Untokenized text:\n  {}\n\nTokenized:\n {}".format(X[3], tokenize(X[3])))

Untokenized text:
  UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.

Tokenized:
 ['un', 'report', 'leogane', '80-90', 'destroyed', '.', 'only', 'hospital', 'st.', 'croix', 'functioning', '.', 'needs', 'supply', 'desperately', '.']


## 4. Build a machine learning pipeline
This machine learning pipeline takes the `message` column as input and outputs classification results for the 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [8]:
def build_model(cvSearch=False):
    """
        Builds the text classification pipeline, optionally wrapped in a grid search

        args:
            cvSearch: when falsy (default) the plain pipeline is returned;
                      otherwise a GridSearchCV over that pipeline is returned

        returns:
            an unfitted Pipeline or GridSearchCV object
    """
    # Token counts -> TF-IDF weighting -> multi-output random forest
    text_clf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=10)))
    ])

    if not cvSearch:
        return text_clf

    # Hyperparameter values explored by the grid search
    search_space = {
        'tfidf__sublinear_tf': [True, False],
        'vect__ngram_range':((1, 1), (1, 2)),
        'clf__estimator__criterion' : ['gini'],
        'clf__estimator__n_estimators': [5, 10, 20, 30]
    }
    return GridSearchCV(text_clf, param_grid=search_space)

## 5. Function to train model

In [9]:
def train(X, Y, model, category_names, test_size=0.2, random_state=None):
    """
        Splits the data into train and test sets and fits the model on the train set

        args:
            X: the input messages
            Y: the category labels for each message
            model: an unfitted estimator exposing fit(X, Y)
            category_names: not used here; kept so existing callers keep working
            test_size: share of the data held out for testing (default 0.2)
            random_state: seed forwarded to train_test_split; pass an int to
                          get the same split on every run (default None keeps
                          the split random)

        returns:
            model: the fitted model (the same object that was passed in)
            X_test: held-out messages
            Y_test: held-out labels
    """
    # train test split
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # fit model on the training part only
    model.fit(X_train, Y_train)

    return model, X_test, Y_test

## 6. Function to evaluate model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [10]:
def evaluate_model(model, X_test, y_test, category_names):
    """
        Computes per-category accuracy, f1-score, precision and recall

        args:
            model: fitted estimator exposing predict(X)
            X_test: held-out messages
            y_test: DataFrame with the true labels, one column per category
            category_names: column names for the predictions, in the order
                            the model outputs them

        returns:
            DataFrame with one row per category and the alphabetically sorted
            columns accuracy, f1_<label>, precision_<label>, recall_<label>.
            Labels are read from classification_report in order and reading
            stops at the first 'accuracy' or '2' key, so label '2' and any
            key after it are not reported.
    """
    # predict on test data and label the prediction columns
    y_pred_df = pd.DataFrame(model.predict(X_test), columns=category_names)

    # store the classification metrics (accuracy, f1-score, precision, recall)
    class_metrics = {}

    for col in y_pred_df:
        # Accuracy does not depend on the class label: compute it once per
        # category instead of once per label inside the loop below.
        col_metrics = {
            'accuracy': accuracy_score(y_test[col].values, y_pred_df[col].values)
        }

        class_results = classification_report(y_test[col], y_pred_df[col], output_dict=True)
        for label, scores in class_results.items():
            if label == 'accuracy' or label == '2':
                break
            col_metrics['f1_' + label] = scores['f1-score']
            col_metrics['precision_' + label] = scores['precision']
            col_metrics['recall_' + label] = scores['recall']

        class_metrics[col] = col_metrics

    class_metrics_df = pd.DataFrame(class_metrics).transpose()
    return class_metrics_df[class_metrics_df.columns.sort_values()]

## 7. Function to export model as a pickle file

In [11]:
def save_model(model, model_filepath, classifier_name='classifier.pkl'):
    """
        Exports the final model as a pickle file

        args:
            model: the final trained model
            model_filepath: directory to save the model; created if missing
            classifier_name: file name of the pickle inside model_filepath
    """
    # Create the target directory when it does not exist yet ('' means the
    # current directory, which needs no creation).
    if model_filepath:
        os.makedirs(model_filepath, exist_ok=True)

    # The context manager closes (and therefore flushes) the file even when
    # pickling fails part-way through.
    with open(os.path.join(model_filepath, classifier_name), 'wb') as model_file:
        pickle.dump(model, model_file)

## 8. Run ML pipeline with untuned hyperparameters

In [12]:
# Define file paths
# NOTE(review): absolute, environment-specific paths — adjust when running elsewhere
database_filepath = '/content/sample_data/data/DisasterResponse.db'
model_filepath = '/content/sample_data/models'

# Load the messages (X), their category labels (Y) and the category names
X, Y, category_names = load_data(database_filepath)

# build the model pipeline (plain pipeline, no grid search)
untuned_model = build_model()

# Train pipeline; train() also returns the held-out split used for evaluation
print('Training model...')
untuned_randonForestmodel, X_test, Y_test = train(X, Y, untuned_model, category_names)
print("Model trained.")

# output model test results
print('\nEvaluating model...')
evaluation_results_untuned = evaluate_model(untuned_randonForestmodel, X_test, Y_test, category_names)

# Save model to file
print("\nSaving model...")
save_model(untuned_randonForestmodel, model_filepath, classifier_name='untunedRandomForestclassifier.pkl')
print("Model saved\n\n")

# Show the metrics for every category (one row per category)
print("=====Model Evalaution Results=====")
evaluation_results_untuned.head(len(evaluation_results_untuned))

Training model...
Model trained.

Evaluating model...

Saving model...
Model saved


=====Model Evalaution Results=====


Unnamed: 0,accuracy,f1_0,f1_1,precision_0,precision_1,recall_0,recall_1
related,0.792143,0.447933,0.872642,0.59944,0.822802,0.357561,0.92891
request,0.882151,0.933073,0.507177,0.887333,0.817481,0.983786,0.36763
offer,0.993898,0.99694,0.0,0.993898,0.0,1.0,0.0
aid_related,0.732838,0.791673,0.627691,0.724354,0.752709,0.872787,0.538286
medical_help,0.921625,0.958962,0.131078,0.925063,0.584906,0.995439,0.07381
medical_products,0.950229,0.974419,0.084211,0.951934,0.545455,0.997992,0.045627
search_and_rescue,0.972731,0.986134,0.182857,0.974138,0.666667,0.998429,0.10596
security,0.982838,0.991341,0.042553,0.982831,1.0,1.0,0.021739
military,0.969489,0.984487,0.08046,0.969819,0.777778,0.999606,0.042424
child_alone,1.0,1.0,,1.0,,1.0,


## 9. Model improvement with tuned hyperparameters
Use grid search to find better parameters. 

In [13]:
# Inspect the parameters of the untuned pipeline before choosing what to tune
untuned_randonForestmodel.get_params()

{'clf': MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                        ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features='auto',
                                                        max_leaf_nodes=None,
                                                        max_samples=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0

## 10. Test the improved model with tuned hyperparameters

In [14]:
# build the model pipeline with hyperparameters for tuning
tuned_model = build_model(True)

# Train pipeline
# NOTE: train() draws a new random train/test split on every call, so the
# metrics below are computed on a different test set than the untuned run.
print('Training model...')
tuned_randonForestmodel, X_test, Y_test = train(X, Y, tuned_model, category_names)
print("Model trained.")

# output model test results
print('\nEvaluating model...')
evaluation_results_tuned = evaluate_model(tuned_randonForestmodel, X_test, Y_test, category_names)

# Save model to file
print("\nSaving model...")
save_model(tuned_randonForestmodel, model_filepath, classifier_name='tunedRandomForestclassifier.pkl')
print("Model saved\n\n")

# Show the metrics for every category (one row per category)
print("=====Model Evalaution Results=====")
evaluation_results_tuned.head(len(evaluation_results_tuned))

Training model...
Model trained.

Evaluating model...

Saving model...
Model saved


=====Model Evalaution Results=====


Unnamed: 0,accuracy,f1_0,f1_1,precision_0,precision_1,recall_0,recall_1
related,0.803966,0.417647,0.882636,0.715726,0.813066,0.29485,0.965224
request,0.891114,0.937191,0.591267,0.894958,0.853306,0.983607,0.452355
offer,0.995233,0.997611,0.0,0.995233,0.0,1.0,0.0
aid_related,0.754005,0.808719,0.655449,0.738224,0.791613,0.894098,0.559253
medical_help,0.92029,0.9583,0.099138,0.922412,0.621622,0.997094,0.053864
medical_products,0.946606,0.972468,0.119497,0.947318,0.791667,0.99899,0.064626
search_and_rescue,0.973684,0.986646,0.103896,0.973644,1.0,1.0,0.054795
security,0.983219,0.991537,0.022222,0.983403,0.5,0.999806,0.011364
military,0.968917,0.984195,0.068571,0.969436,0.666667,0.999409,0.036145
child_alone,1.0,1.0,,1.0,,1.0,


## 11. Further model improvement with parts of speech tagging
We will add other features besides the TF-IDF and use part of speech tagging to help the model better characterise the context of the messages and extract relationships between the words, then update the pipeline and try Extreme Gradient Boosting classifier.

#### Parts of Speech (POS) Tagging

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """
        Transformer producing one boolean feature per message: whether any
        sentence of the message starts with a verb (POS tag VB or VBP) or
        with the token "rt"
    """

    def starting_verb(self, text):
        """
            Checks whether any sentence in the text starts with a verb

            args:
                text: a single message string

            returns:
                True if the first token of some sentence is tagged VB/VBP or
                is "rt" (any casing), False otherwise
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))

            # A sentence that yields no tokens has no first word; indexing
            # pos_tags[0] would raise IndexError, so skip it.
            if not pos_tags:
                continue

            first_word, first_tag = pos_tags[0]
            # tokenize() lower-cases every token, so comparing against the
            # upper-case 'RT' could never match; compare case-insensitively.
            if first_tag in ['VB', 'VBP'] or first_word.lower() == 'rt':
                return True
        return False

    def fit(self, x, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """
            Applies starting_verb to every message

            args:
                X: iterable of message strings

            returns:
                single-column DataFrame of booleans, one row per message
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

#### Update the pipeline model

In [16]:
def build_model_with_postag():
    """
        Builds a grid search over a pipeline that combines TF-IDF features
        with the StartingVerbExtractor feature and classifies with XGBoost

        returns:
            an unfitted GridSearchCV object
    """
    # Branch 1: token counts -> TF-IDF weighting
    tfidf_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer())
    ])

    # Both feature branches are computed from the same input and concatenated
    feature_union = FeatureUnion([
        ('text_pipeline', tfidf_branch),
        ('starting_verb', StartingVerbExtractor())
    ])

    # XGBClassifier wrapped in MultiOutputClassifier for the multi-column target
    full_pipeline = Pipeline([
        ('features', feature_union),
        ('clf', MultiOutputClassifier(XGBClassifier()))
    ])

    # Vectorizer settings explored by the grid search
    search_space = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0)
    }

    return GridSearchCV(full_pipeline, param_grid=search_space)

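Because a message can belong to several categories at once, this is a multi-label classification problem: the `XGBClassifier` is wrapped in `MultiOutputClassifier` so that every category column gets its own prediction. No objective function is set explicitly, so each classifier uses XGBoost's default.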

## 12. Final Model with GridSearch tuned parameters and part of speech tagging.

In [17]:
# build the grid-search pipeline with POS-tag feature and XGBoost classifier
xgb_tuned_model_with_postag = build_model_with_postag()

# Train pipeline
# NOTE: train() draws a new random train/test split on every call, so the
# metrics below are computed on a different test set than the earlier runs.
print('Training model...')
tuned_XGBClassifier, X_test, Y_test = train(X, Y, xgb_tuned_model_with_postag, category_names)
print("Model trained.")

# output model test results
print('\nEvaluating model...')
evaluation_results_tuned_postag = evaluate_model(tuned_XGBClassifier, X_test, Y_test, category_names)

# Save model to file
print("\nSaving model...")
save_model(tuned_XGBClassifier, model_filepath, classifier_name='tuned_XGBClassifier.pkl')
print("Model saved\n\n")

# Show the metrics for every category (one row per category)
print("=====Model Evalaution Results=====")
evaluation_results_tuned_postag.head(len(evaluation_results_tuned_postag))

Training model...
Model trained.

Evaluating model...

Saving model...
Model saved


=====Model Evalaution Results=====


Unnamed: 0,accuracy,f1_0,f1_1,precision_0,precision_1,recall_0,recall_1
related,0.790618,0.359159,0.875228,0.674944,0.801334,0.244681,0.964133
request,0.906369,0.945523,0.667119,0.912029,0.86014,0.981571,0.54485
offer,0.994661,0.997323,0.0,0.994661,0.0,1.0,0.0
aid_related,0.742754,0.803439,0.627862,0.724763,0.790278,0.901275,0.520824
medical_help,0.930206,0.963356,0.268,0.93581,0.650485,0.992573,0.168766
medical_products,0.960336,0.979483,0.405714,0.963516,0.78022,0.995988,0.274131
search_and_rescue,0.973494,0.986517,0.223464,0.976195,0.571429,0.997059,0.138889
security,0.981503,0.99066,0.058252,0.981679,0.75,0.999806,0.030303
military,0.969489,0.984378,0.349593,0.971665,0.767857,0.997428,0.226316
child_alone,1.0,1.0,,1.0,,1.0,
