# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [None]:
!pip install nltk
!pip install plotly

In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /home/dceads/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dceads/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dceads/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/dceads/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [1]:
# import libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)

import sys
import os
import re
from sqlalchemy import create_engine
import pickle
import nltk

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

from scipy.stats import gmean
# import relevant functions/modules from the sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
# load data from database
database_filepath = "../data/disaster_response_db.db"
engine = create_engine('sqlite:///' + database_filepath)
df = pd.read_sql_table('message_categories',engine)
df.head(3)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
df.describe()

Unnamed: 0,id,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
count,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0
mean,15224.82133,0.77365,0.170659,0.004501,0.414251,0.079493,0.050084,0.027617,0.017966,0.032804,0.0,0.063778,0.111497,0.088267,0.015449,0.023039,0.011367,0.033377,0.045545,0.131446,0.065037,0.045812,0.050847,0.020293,0.006065,0.010795,0.004577,0.011787,0.043904,0.278341,0.082202,0.093187,0.010757,0.093645,0.020217,0.052487,0.193584
std,8826.88914,0.435276,0.376218,0.06694,0.492602,0.270513,0.218122,0.163875,0.132831,0.178128,0.0,0.244361,0.314752,0.283688,0.123331,0.150031,0.106011,0.179621,0.2085,0.337894,0.246595,0.209081,0.219689,0.141003,0.077643,0.103338,0.067502,0.107927,0.204887,0.448191,0.274677,0.2907,0.103158,0.29134,0.140743,0.223011,0.395114
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7446.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15662.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,22924.25,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,30265.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
result2 = df.iloc[:,4:].apply(pd.value_counts)
result2


Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,6122,21742.0,26098.0,15356.0,24132.0,24903.0,25492.0,25745.0,25356.0,26216.0,24544.0,23293.0,23902.0,25811.0,25612.0,25918.0,25341.0,25022.0,22770.0,24511.0,25015.0,24883.0,25684.0,26057.0,25933.0,26096.0,25907.0,25065.0,18919.0,24061.0,23773.0,25934.0,23761.0,25686.0,24840.0,21141.0
1,19906,4474.0,118.0,10860.0,2084.0,1313.0,724.0,471.0,860.0,,1672.0,2923.0,2314.0,405.0,604.0,298.0,875.0,1194.0,3446.0,1705.0,1201.0,1333.0,532.0,159.0,283.0,120.0,309.0,1151.0,7297.0,2155.0,2443.0,282.0,2455.0,530.0,1376.0,5075.0
2,188,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Note that there is sparse data for some columns that will affect the classification report



In [5]:

#Remove child alone as it has all zeros only
df = df.drop(['child_alone'],axis=1)

# Given value 2 in the related field are neglible so it could be error. Replacing 2 with 1 to consider it a valid response.

df['related']=df['related'].map(lambda x: 1 if x == 2 else x)

# Extract X and y variables 

X = df['message']
y = df.iloc[:,4:]


### 2. Write functions for tokenization 

In [6]:
def average_classification_report (y_true, y_pred): 
    """A function that measures mean of f1, precision, recall for each class within multi-class prediction 
       Returns a dataframe with columns: 
       f1-score (average for all possible values of specific class)
       precision (average for all possible values of specific class)
       recall (average for all possible values of specific class)
       kindly keep in mind that some classes might be imbalanced and average values may mislead. 
    """
    #instantiating a dataframe
    report = pd.DataFrame ()
    
    for col in y_true.columns:
        #returning dictionary from classification report
        class_dict = classification_report (output_dict = True, y_true = y_true.loc [:,col], y_pred = y_pred.loc [:,col])
    
        #converting from dictionary to dataframe
        eval_df = pd.DataFrame (pd.DataFrame.from_dict (class_dict))
        
        #print (eval_df)
        
        #dropping unnecessary columns
        eval_df.drop(['macro avg', 'weighted avg'], axis =1, inplace = True)
        
        #dropping unnecessary row "support"
        eval_df.drop(index = 'support', inplace = True)
        
        #calculating mean values
        av_eval_df = pd.DataFrame (eval_df.transpose ().mean ())
        
        #transposing columns to rows and vice versa 
        av_eval_df = av_eval_df.transpose ()
    
        #appending result to report df
        report = report.append (av_eval_df, ignore_index = True)    
    
    #renaming indexes for convinience
    report.index = y_true.columns
    
    return report

In [7]:
def tokenize(text):
    """
    Tokenize the text
    
    Arguments:
        text -> text that needs to be tokenized
    Output:
        clean_tokens -> tokenized text
    """
   
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
   
    detected_urls = re.findall(url_regex, text)
    
    
    for detected_url in detected_urls:
        text = text.replace(detected_url, 'urlplaceholder')

    
    tokens = nltk.word_tokenize(text)
    
   
    lemmatizer = nltk.WordNetLemmatizer()

    
    clean_tokens = [lemmatizer.lemmatize(w).lower().strip() for w in tokens]
    return clean_tokens

In [8]:
def tokenize_stopwords(text):
    """
    Tokenize the text
    
    Arguments:
        text -> text that needs to be tokenized
    Output:
        tokens_without_sw -> tokenized text without stop words
    """
      
    # Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    stop_words = stopwords.words("english")
    
    #tokenize
    words = word_tokenize (text)
    
    #stemming
    stemmed = [PorterStemmer().stem(w) for w in words]
    
    #lemmatizing
    words_lemmed = [WordNetLemmatizer().lemmatize(w) for w in stemmed if w not in stop_words]
   
    return words_lemmed

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [9]:
def model_pipeline(moc):
    """
    Build initial machine learning pipeline
    
    Arguments moc: A Classifier
        
    Output:
        pipeline built for multi output classification
    """
    
    pipeline1 = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
                ('tfidf_transformer', TfidfTransformer())
            ]))
            
        ])),

        ('classifier', moc)
    ])
    
    return pipeline1

In [10]:
def model_pipeline_revised(moc):
    """
    Build initial machine learning pipeline
    
    Arguments moc: A Classifier
        
    Output:
        pipeline built for multi output classification
    """
    
    pipeline1 = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('count_vectorizer', CountVectorizer(ngram_range = (3,3),tokenizer=tokenize_stopwords)),
                ('tfidf_transformer', TfidfTransformer())
            ]))
            
        ])),

        ('classifier', moc)
    ])
    
    return pipeline1

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [11]:
pipeline1= model_pipeline(MultiOutputClassifier(RandomForestClassifier()))

X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline_fitted = pipeline1.fit(X_train, y_train)


### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [12]:

y_predicted = pipeline_fitted.predict(X_test)

print(type(y_predicted))

<class 'numpy.ndarray'>


In [41]:
# Print classification report on test data
# Print classification report on test data
print(classification_report(y_test.values, y_predicted, target_names=y.columns.values, zero_division=1))

Y_pred0 = pd.DataFrame(y_predicted, columns = y_test.columns)

for column in y_test.columns:
    print('Model Performance with Category: {}'.format(column))
    print(accuracy_score(y_test[column],Y_pred0[column]))
  




                        precision    recall  f1-score   support

               related       0.82      0.97      0.89      5055
               request       0.87      0.45      0.59      1136
                 offer       1.00      0.00      0.00        37
           aid_related       0.79      0.60      0.69      2711
          medical_help       0.73      0.04      0.08       525
      medical_products       0.64      0.05      0.09       318
     search_and_rescue       1.00      0.03      0.06       202
              security       1.00      0.00      0.00       110
              military       0.56      0.02      0.04       235
                 water       0.89      0.22      0.36       419
                  food       0.90      0.36      0.52       724
               shelter       0.92      0.28      0.43       592
              clothing       0.86      0.06      0.11        99
                 money       1.00      0.04      0.08       152
        missing_people       1.00      

### 6. Improve your model
Use grid search to find better parameters. 

In [14]:
pipeline1.get_params()

{'memory': None,
 'steps': [('features',
   FeatureUnion(transformer_list=[('text_pipeline',
                                   Pipeline(steps=[('count_vectorizer',
                                                    CountVectorizer(tokenizer=<function tokenize at 0x7f4f63aff160>)),
                                                   ('tfidf_transformer',
                                                    TfidfTransformer())]))])),
  ('classifier', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'features': FeatureUnion(transformer_list=[('text_pipeline',
                                 Pipeline(steps=[('count_vectorizer',
                                                  CountVectorizer(tokenizer=<function tokenize at 0x7f4f63aff160>)),
                                                 ('tfidf_transformer',
                                                  TfidfTransformer())]))]),
 'classifier': MultiOutputClassifier(estimator=RandomForestClassifier()

In [15]:
# Improve with GridSearch


parameters = {
     'features__text_pipeline__count_vectorizer__ngram_range': [(1,1), (1,2), (1,3)]
}

cv = GridSearchCV(pipeline1, parameters, n_jobs=-1)

cv.fit(X_train, y_train)
cv.best_estimator_

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('count_vectorizer',
                                                                  CountVectorizer(ngram_range=(1,
                                                                                               3),
                                                                                  tokenizer=<function tokenize at 0x7f4f63aff160>)),
                                                                 ('tfidf_transformer',
                                                                  TfidfTransformer())]))])),
                ('classifier',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

In [16]:
# Get the prediction values from the grid search cross validator
y_predicted_cv = cv.predict(X_test)


### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [17]:
# Print classification report on test data
print(classification_report(y_test.values, y_predicted_cv, target_names=y.columns.values, zero_division=1))

                        precision    recall  f1-score   support

               related       0.82      0.97      0.89      5055
               request       0.86      0.46      0.60      1136
                 offer       1.00      0.00      0.00        37
           aid_related       0.81      0.53      0.64      2711
          medical_help       0.68      0.04      0.07       525
      medical_products       0.69      0.06      0.10       318
     search_and_rescue       0.79      0.05      0.10       202
              security       0.00      0.00      0.00       110
              military       0.00      0.00      0.00       235
                 water       0.90      0.39      0.54       419
                  food       0.88      0.45      0.59       724
               shelter       0.87      0.31      0.46       592
              clothing       1.00      0.06      0.11        99
                 money       1.00      0.03      0.05       152
        missing_people       1.00      

In [21]:
from sklearn.metrics import accuracy_score

Y_pred2 = pd.DataFrame(y_predicted_cv, columns = y_test.columns)

for column in y_test.columns:
    print('Model Performance with Category: {}'.format(column))
    print(accuracy_score(y_test[column],Y_pred2[column]))
  

Model Performance with Category: related
0.8101922490082393
Model Performance with Category: request
0.8939578883124809
Model Performance with Category: offer
0.9943545926151969
Model Performance with Category: aid_related
0.7529752822703693
Model Performance with Category: medical_help
0.9214220323466585
Model Performance with Category: medical_products
0.9530057979859627
Model Performance with Category: search_and_rescue
0.9703997558742753
Model Performance with Category: security
0.9830637778455905
Model Performance with Category: military
0.9638388770216662
Model Performance with Category: water
0.9580408910588953
Model Performance with Category: food
0.932255111382362
Model Performance with Category: shelter
0.9336283185840708
Model Performance with Category: clothing
0.9858101922490082
Model Performance with Category: money
0.9774183704607873
Model Performance with Category: missing_people
0.9887091852303936
Model Performance with Category: refugees
0.9682636557827281
Model Perfo

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [11]:


# Testing with Stopwords and ngram range of 1,3 we found above
pipeline_revised = model_pipeline(MultiOutputClassifier(AdaBoostClassifier()))



In [12]:

pipeline_revised.fit(X_train, y_train)
y_pred_revised = pipeline_revised.predict(X_test)

NameError: name 'X_train' is not defined

In [39]:

# Print classification report on test data
print(classification_report(y_test.values, y_pred_revised, target_names=y.columns.values, zero_division=1))

Y_pred3 = pd.DataFrame(y_pred_revised, columns = y_test.columns)

for column in y_test.columns:
    print('Model Performance with Category: {}'.format(column))
    print(accuracy_score(y_test[column],Y_pred3[column]))
  


                        precision    recall  f1-score   support

               related       0.83      0.94      0.88      5055
               request       0.76      0.54      0.63      1136
                 offer       0.00      0.00      0.00        37
           aid_related       0.77      0.58      0.66      2711
          medical_help       0.58      0.24      0.34       525
      medical_products       0.63      0.31      0.41       318
     search_and_rescue       0.71      0.17      0.28       202
              security       0.27      0.05      0.09       110
              military       0.60      0.29      0.39       235
                 water       0.73      0.60      0.66       419
                  food       0.80      0.69      0.74       724
               shelter       0.76      0.53      0.63       592
              clothing       0.70      0.43      0.54        99
                 money       0.55      0.25      0.34       152
        missing_people       0.55      

In [42]:
report0 = average_classification_report (y_test, Y_pred0)
report1 = average_classification_report (y_test, Y_pred2)
report2 = average_classification_report (y_test, Y_pred3)

display ((report1-report0).mean ())

#report0 Original model is better than grid adjusted report1. Now lets see if report2 is better


precision   -0.009767
recall      -0.001002
f1-score    -0.000652
dtype: float64

In [43]:
display ((report2-report0).mean ())

precision   -0.023197
recall       0.051302
f1-score     0.061557
dtype: float64

In [None]:
pipeline_revised.get_params()

### From above f1-score of AdaBoost are better so lets use that

### 9. Export your model as a pickle file

In [44]:

pickle.dump(pipeline_revised, open('message_classifier.pkl', 'wb'))



### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.