# ML Pipeline Preparation

In this notebook, we prototype a ML pipeline to preprocess and classify the text message data.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database
- Prepare feature and target variables X and Y

In [52]:
# import libraries
import numpy as np
import pandas as pd
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import nltk
from nltk import WordNetLemmatizer as Lemmatizer
from nltk import word_tokenize
from nltk import PorterStemmer
nltk.download(['punkt','wordnet'])

from sqlalchemy import create_engine

[nltk_data] Downloading package punkt to /Users/hchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hchen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data from database
database_path = 'Disaster_Response.db'
table_name = 'labelled_messages'
engine = create_engine('sqlite:///'+database_path)
df = pd.read_sql_table(table_name, engine)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
sum(df.original.isna())

16046

In [4]:
df.original.nunique()

9630

In [32]:
df.shape

(26216, 40)

In [35]:
sum(df.genre.isna())

0

In [5]:
df.genre.unique()

array(['direct', 'social', 'news'], dtype=object)

In [6]:
df.genre.value_counts()

news      13054
direct    10766
social     2396
Name: genre, dtype: int64

In [7]:
# columns of df
columns = df.columns.values
columns[1:4], columns[4:]

(array(['message', 'original', 'genre'], dtype=object),
 array(['related', 'request', 'offer', 'aid_related', 'medical_help',
        'medical_products', 'search_and_rescue', 'security', 'military',
        'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
        'missing_people', 'refugees', 'death', 'other_aid',
        'infrastructure_related', 'transport', 'buildings', 'electricity',
        'tools', 'hospitals', 'shops', 'aid_centers',
        'other_infrastructure', 'weather_related', 'floods', 'storm',
        'fire', 'earthquake', 'cold', 'other_weather', 'direct_report'],
       dtype=object))

In [8]:
# read off predictors X and targets y
X = df[columns[1:4:2]].values
y = df[columns[4:]].values

In [9]:
X[:,0]

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name', ...,
       "Proshika, operating in Cox's Bazar municipality and 5 other unions, Ramu and Chokoria, assessment, 5 kg rice, 1,5 kg lentils to 700 families.",
       'Some 2,000 women protesting against the conduct of the elections were teargassed as they tried to converge on the local electoral commission offices in the southern oil city of Port Harcourt.',
       'A radical shift in thinking came about as a result of this meeting, recognizing that HIV/AIDS is at the core of the humanitarian crisis and identifying the crisis itself as a function of the HIV/AIDS pandemic.'],
      dtype=object)

In [18]:
X

array([['Weather update - a cold front from Cuba that could pass over Haiti',
        'direct'],
       ['Is the Hurricane over or is it not over', 'direct'],
       ['Looking for someone but no name', 'direct'],
       ...,
       ["Proshika, operating in Cox's Bazar municipality and 5 other unions, Ramu and Chokoria, assessment, 5 kg rice, 1,5 kg lentils to 700 families.",
        'news'],
       ['Some 2,000 women protesting against the conduct of the elections were teargassed as they tried to converge on the local electoral commission offices in the southern oil city of Port Harcourt.',
        'news'],
       ['A radical shift in thinking came about as a result of this meeting, recognizing that HIV/AIDS is at the core of the humanitarian crisis and identifying the crisis itself as a function of the HIV/AIDS pandemic.',
        'news']], dtype=object)

In [19]:
y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

### 2. A tokenization function to process the text data

In [10]:
def tokenize(text, stemmize=False):
    """
    perform the following transformation of text
    lower
    tokenize
    lemmatize
    stemmize
    
    Input:
    text - string
    stemmize - boolean, if True, then stemmize the tokens. Default = False.
    
    Output:
    list consists of stemmed tokens from the input text
    """
    # lower, tokenize
    tokens = word_tokenize(text.lower())
    
    lemmatizer = Lemmatizer()
    stem_tokens = [lemmatizer.lemmatize(x) for x in tokens]
    if stemmize:
        stemmer = PorterStemmer()
        stem_tokens = [stemmer.stem(x) for x in stem_tokens]
    
    
    
    return stem_tokens

In [12]:
# Example
tokenize(df.message[0])

['weather',
 'update',
 '-',
 'a',
 'cold',
 'front',
 'from',
 'cuba',
 'that',
 'could',
 'pas',
 'over',
 'haiti']

In [13]:
tokenize(df.message[0], stemmize=True)

['weather',
 'updat',
 '-',
 'a',
 'cold',
 'front',
 'from',
 'cuba',
 'that',
 'could',
 'pa',
 'over',
 'haiti']

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

Pipeline structure:

- features

    - tfidf vectorization from text message
    
        - extract the 'message' column
        - tokenization
        - tfidfvectorization
        
    - one-hot encodings from genre
        
        - extract the 'genre' column
        - encoding
        
- classifiers

- GridSearchCV
    

In [50]:
# vectorize the function tokenize
vect_tokenize = np.vectorize(tokenize)
# supped up to a transformer
class Tokenizer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array(list(map(tokenize, X)))

# customized transformer to get dummy variables
class GetDummies(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.get_dummies(X)


pipeline = Pipeline([
    
    ('features', FeatureUnion([
        
        ('vect', Pipeline([
            
            ('message', FunctionTransformer(lambda x: x[:, 0])),
            ('tfidfvect',TfidfVectorizer(tokenizer=tokenize, stop_words='english'))
        ])),
        
        ('encoding', Pipeline([
            
            ('genre', FunctionTransformer(lambda x: x[:,1])), 
            ('dummies',GetDummies())]))])),

    ('clf',RandomForestClassifier())])

### 4. Train pipeline (Baseline model)
- Split data into train and test sets
- Train pipeline

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.35, random_state=57)

In [51]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('vect',
                                                 Pipeline(steps=[('message',
                                                                  FunctionTransformer(func=<function <lambda> at 0x7fe82976e8b0>)),
                                                                 ('tfidfvect',
                                                                  TfidfVectorizer(stop_words='english',
                                                                                  tokenizer=<function tokenize at 0x7fe84a36e670>))])),
                                                ('encoding',
                                                 Pipeline(steps=[('genre',
                                                                  FunctionTransformer(func=<function <lambda> at 0x7fe82976e940>)),
                                                                 ('dummies',
                                         

### 5. Hyperparameter tuning.
Use Bayesian hyperparameter optimization to find better parameters. 

### 6. Packaging into python script