# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [9]:
# import libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import nltk
nltk.download(['punkt', 'wordnet','averaged_perceptron_tagger','stopwords'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pickle
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

[nltk_data] Downloading package punkt to /home/iris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/iris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/iris/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/iris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def load_data(database_url='sqlite:///disaster_clean.db', table_name='disaster_clean'):
    '''
    Load the cleaned messages table and split it into features and targets.

    Parameters
    ----------
    database_url : str
        SQLAlchemy connection URL of the database to read. Defaults to the
        SQLite file used by this notebook.
    table_name : str
        Name of the table to read. Defaults to 'disaster_clean'.

    Returns
    -------
    X : numpy.ndarray
        Values of the `message` column.
    y : pandas.DataFrame
        Every column from position 4 onward, used as the target labels.
    category_names : list
        Column names of `y`, in order.
    '''
    engine = create_engine(database_url)
    try:
        df = pd.read_sql_table(table_name, con=engine)
    finally:
        # Release pooled connections even if the read fails.
        engine.dispose()

    # Feature: raw message text.
    X = df['message'].values
    # Targets: everything after the first four columns.
    # NOTE(review): assumes the first four columns are non-label metadata -- confirm
    # against the table schema.
    y = df[df.columns[4:]]
    category_names = y.columns.tolist()

    return X, y, category_names

### 2. A tokenization function that processes text data

In [11]:
def tokenize(text):
    '''
    Turn a raw message into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. replace every URL with the literal token "urlplaceholder";
      2. lower-case the text and replace every non-alphanumeric character
         with a space;
      3. split into word tokens with `word_tokenize`;
      4. drop English stop words;
      5. lemmatize each remaining token (lemmatizer defaults), then
         lower-case and strip it.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    '''
    # Raw string: the pattern contains "\(" and "\)", which are invalid escape
    # sequences in a normal string literal.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Substitute in a single regex pass. Replacing each found URL with
    # str.replace would corrupt a longer URL that starts with a shorter one
    # found earlier, leaving its tail behind as stray tokens.
    text = re.sub(url_regex, "urlplaceholder", text)

    # Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Extract the word tokens from the normalized text
    tokens = word_tokenize(text)

    # Remove stop words; a set gives constant-time membership checks
    stop = set(stopwords.words("english"))
    words = [t for t in tokens if t not in stop]

    # Lemmatizer to reduce inflected forms of a word to a common base form
    lemmatizer = WordNetLemmatizer()

    # Build the list of clean tokens
    clean_tokens = [lemmatizer.lemmatize(tok).lower().strip() for tok in words]

    return clean_tokens

### 3. Build a machine learning pipeline
This machine learning pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset.

In [12]:
def make_pipeline(model):
    '''
    Wrap `model` in a text-classification pipeline.

    The pipeline has three named steps:
      'vect'  - CountVectorizer using this notebook's `tokenize` function,
      'tfidf' - TfidfTransformer,
      'clf'   - the estimator passed in as `model`.

    Returns the assembled (unfitted) Pipeline.
    '''
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ]
    return Pipeline(steps)

In [13]:
def scores(y_test, y_pred):
    '''
    Print per-label classification reports and the overall accuracy.

    y_test : true labels; iterated by column name and indexed with
             `y_test[col]`, and exposes `.values`.
    y_pred : predicted labels, indexed as `y_pred[:, i]` so that column i
             lines up with the i-th column of `y_test`.

    Nothing is returned; all results are printed.
    '''
    # One report per label column, numbered from 1 in the printed header.
    for number, col in enumerate(y_test, start=1):
        print('Feature {}: {}'.format(number, col))
        predicted_col = y_pred[:, number - 1]
        print(classification_report(y_test[col], predicted_col))

    # Overall accuracy: share of individual label cells predicted correctly.
    matches = y_pred == y_test.values
    accuracy = matches.mean()
    print('The model accuracy is {:.3f}'.format(accuracy))

We would like to minimize false negatives, since a false negative means a message is classified as irrelevant when it is in fact relevant. We therefore want to maximize $recall=\frac{tp}{tp+fn}$, which makes the recall score the most relevant metric.

### 5. Test model
Report the f1 score, precision and recall for each output category of the dataset. 

In [14]:
def grid_search(pipeline, X_train, y_train):
    '''
    Tune the pipeline with a cross-validated grid search and return it fitted.

    The grid covers `clf__estimator__n_estimators` over [120, 140], scored
    with 'recall_micro' using 4-fold cross-validation. The best parameters
    found are printed after fitting.

    Returns the fitted GridSearchCV object.
    '''
    param_grid = {'clf__estimator__n_estimators': [120, 140]}

    # Build the search object around the supplied pipeline.
    searcher = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        scoring='recall_micro',
        cv=4,
    )

    print('Training model...')
    searcher.fit(X_train, y_train)
    print(searcher.best_params_)

    return searcher

In [15]:
# Load data
X, y, category_names = load_data()
# Make pipeline: one gradient-boosting classifier per output category
pipeline = make_pipeline(MultiOutputClassifier(GradientBoostingClassifier(max_depth=6)))
# Split data into train and test sets (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =7, train_size=0.77)
# Perform parameter tuning
model = grid_search(pipeline, X_train, y_train)
print('Predict on test data..')
y_pred = model.predict(X_test)
scores(y_test, y_pred)

# Save model; the context manager guarantees the file is flushed and closed
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

Training model...
{'clf__estimator__n_estimators': 140}
Predict on test data..
Feature 1: request
              precision    recall  f1-score   support

           0       0.90      0.97      0.94      4960
           1       0.79      0.51      0.62      1070

    accuracy                           0.89      6030
   macro avg       0.85      0.74      0.78      6030
weighted avg       0.88      0.89      0.88      6030

Feature 2: offer
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5971
           1       0.12      0.14      0.13        59

    accuracy                           0.98      6030
   macro avg       0.56      0.56      0.56      6030
weighted avg       0.98      0.98      0.98      6030

Feature 3: aid_related
              precision    recall  f1-score   support

           0       0.75      0.87      0.81      3480
           1       0.78      0.60      0.68      2550

    accuracy                           0.76 