# ML Pipeline Preparation

## 1. Import libraries and load data from database.

    Import Python libraries
    Load dataset from database with read_sql_table
    Define feature and target variables X and Y

In [1]:
# import libraries
import sqlite3
from sqlalchemy import create_engine
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pickle

import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])

[nltk_data] Downloading package punkt to /home/hugo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hugo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hugo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/hugo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# load data from database
engine = create_engine('sqlite:///messages_db.db')
df = pd.read_sql_table('messages', engine)

# defining X and Y variables
X = df['message']
Y = df[df.columns[4:]]

In [7]:
df.columns[4:]

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [3]:
df.head(2)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## 2. Write a tokenization function to process your text data

In [4]:
def tokenize(text):
    """input: text - the text messages of the database
    word_tokenize - process the text and separate each word as a list's item
    stopwords - remove english stopwords like: a, an, and, to, etc...
    lemmatizer - normalize the words
    lower() - lowercase all words
    strip() - remove whitespaces
    output: a list with each transformed and procesed word
    """
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stopwords.words("english")]
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

## 3. Build a machine learning pipeline

This machine pipeline should take in the message column as input and output classification results on the other 36 categories in the dataset. You may find the MultiOutputClassifier helpful for predicting multiple target variables

In [5]:
def build_model():
    """
    this function build the ML model to process the text
    pipeline - simplifies the text's processing
    CountVectorizer - vectorize the words of each message (text)
    tfidf - reflects how important is a word to the message
    clf - it's the ML classifier
    MultiOutputClassifier - It's used because we have 36 categories
    RandomForestClassifier - the classifier
    """
    
    pipeline = Pipeline([

        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
        
    ])
    
    """
    GridSearchCV iterate through the parameters, in this case, the 
    number of estimators of the RandomForest classifier to assess
    which model's parameter is the most accurate
    """
    parameters = {
    
        'clf__estimator__n_estimators': [100],
    
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)
    
    return cv

## 4. Train pipeline

* Split data into train and test sets
* Train pipeline

In [6]:
# her we train the model with the training data and then we can
# make the predictions on the test data

X_train, X_test, y_train, y_test = train_test_split(X, Y)

model = build_model()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## 7. Test your model

Accuracy, precision, recall and f1-score of the tuned model.

In [None]:
def evaluate_model(test, pred, col_names):
    """    
    inputs - 
    test - y_test data.
    pred - predicted data.
    col_names - categories' names.
       
    output - 
    df_scores: accuracy, precision, recall and f1 score for each
    category.
    """
    scores = []
    
    # accuracy, precision, recall and f1_score for each category
    for i in range(len(col_names)):
        accuracy = accuracy_score(test[:, i], pred[:, i])
        precision = precision_score(test[:, i], pred[:, i])
        recall = recall_score(test[:, i], pred[:, i])
        f1_sco = f1_score(test[:, i], pred[:, i])
        
        scores.append([accuracy, precision, recall, f1_sco])
    
    scores = np.array(scores)
    df_scores = pd.DataFrame(
        data = scores, index = col_names, columns = [
            'Accuracy', 'Precision', 'Recall', 'F1_score'])
      
    return df_scores   

Here we call the previous function to evaluate the model

In [None]:
print(evaluate_model(np.array(y_test), y_pred, list(Y.columns)))

### 9. Exporting the model as a pickle file

In [None]:
# saving the model as a pickle file
pickle.dump(model, open('model.plk', 'wb'))