# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import re
import nltk
import pickle
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/nbusr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nbusr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nbusr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# load data from database
engine = create_engine('sqlite:///InsertDatabaseName.db')
df = pd.read_sql_table('InsertTableName', engine)

Y_labels = ['related', 'request', 'offer', 'aid_related', 
        'medical_help', 'medical_products', 'search_and_rescue', 
        'security', 'military', 'child_alone', 'water', 'food', 
        'shelter', 'clothing', 'money', 'missing_people', 'refugees', 
        'death', 'other_aid', 'infrastructure_related', 'transport', 
        'buildings', 'electricity', 'tools', 'hospitals', 'shops', 
        'aid_centers', 'other_infrastructure', 'weather_related', 
        'floods', 'storm', 'fire', 'earthquake', 'cold', 
        'other_weather', 'direct_report']
X = df['message'].values
Y = df[Y_labels].values
category_names = Y_labels


### 2. Write a tokenization function to process your text data

In [3]:
X[3]


'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.'

In [4]:
def tokenize(text):
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize andremove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return(tokens)

tokenize(X[3])


['un',
 'report',
 'leogane',
 '80',
 '90',
 'destroyed',
 'hospital',
 'st',
 'croix',
 'functioning',
 'need',
 'supply',
 'desperately']

### 3. Build a machine learning pipeline
- You'll find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [5]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(DecisionTreeClassifier(random_state = 42), n_jobs = -1))
])


### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)
pipeline.fit(X_train, y_train)


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...tion_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
           n_jobs=-1))])

### 5. Test your model
Report the f1 score, precision and recall on both the training set and the test set. You can use sklearn's `classification_report` function here. 

In [7]:
## get the f1 score of X_test
y_pred = pipeline.predict(X_test)
print(classification_report(y_test[:,1], y_pred[:,1], target_names = category_names))


                        precision    recall  f1-score   support

               related       0.86      0.88      0.87      6555
               request       0.30      0.27      0.28      1308

           avg / total       0.76      0.77      0.77      7863



  .format(len(labels), len(target_names))


### 6. Improve your model
Use grid search to find better parameters. 

In [8]:
pipeline.get_params()


{'clf': MultiOutputClassifier(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=42,
             splitter='best'),
            n_jobs=-1),
 'clf__estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=42,
             splitter='best'),
 'clf__estimator__class_weight': None,
 'clf__estimator__criterion': 'gini',
 'clf__estimator__max_depth': None,
 'clf__estimator__max_features': None,
 'clf__estimator__max_leaf_nodes': None,
 'clf__estimator

In [9]:
parameters = {
    'vect__min_df': [1],
    'vect__lowercase': [False],
    'tfidf__smooth_idf': [False],
}

cv = GridSearchCV(pipeline, param_grid = parameters, cv = 2)


### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.

In [10]:
cv.fit(X_train, y_train)


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...tion_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
           n_jobs=-1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__min_df': [1], 'vect__lowercase': [False], 'tfidf__smooth_idf': [False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
cv.best_score_


0.03696031399912778

In [12]:
## get the f1 score of X_test
y_pred = cv.predict(X_test)
print(classification_report(y_test[:,1], y_pred[:,1], target_names=category_names))


                        precision    recall  f1-score   support

               related       0.86      0.87      0.87      6555
               request       0.31      0.28      0.29      1308

           avg / total       0.77      0.78      0.77      7863



  .format(len(labels), len(target_names))


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [13]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state = 42), n_jobs = -1))
])

parameters = {
    'vect__min_df': [1],
    'vect__lowercase': [False],
    'tfidf__smooth_idf': [False],
}

## cv = GridSearchCV(pipeline, param_grid = parameters, cv = 2)
cv = GridSearchCV(pipeline, parameters, cv = 2, n_jobs = -1)
cv.fit(X_train, y_train)


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...
            oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_jobs=-1))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__min_df': [1], 'vect__lowercase': [False], 'tfidf__smooth_idf': [False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
## get the f1 score of X_test
y_pred = cv.predict(X_test)
print(classification_report(y_test[:,1], y_pred[:,1], target_names=category_names))


                        precision    recall  f1-score   support

               related       0.84      0.97      0.90      6555
               request       0.39      0.10      0.16      1308

           avg / total       0.77      0.82      0.78      7863



  .format(len(labels), len(target_names))


### 9. Export your model as a pickle file

In [15]:
with open('clf.pickle', 'wb') as f:
    pickle.dump(cv, f)


### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.