### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re
import pickle
import string 
import sys 

import nltk
nltk.download(['punkt', 'wordnet','stopwords'])

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\loisn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\loisn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loisn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the messages table from the local SQLite database.
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('DS_messages', engine)

# The model input is the message text; every column other than the four
# dropped below is kept as a target label.
X = df['message']
y = df.drop(columns=['message', 'genre', 'id', 'original'])
categories = list(y.columns)

In [3]:
def tokenize(text):
    """Normalize a message and split it into stemmed, lemmatized tokens.

    Args:
    text: str. Raw message text.

    Returns:
    lemmed: list of strings. Lower-cased tokens with non-alphanumeric
    characters and English stop words removed, passed through the Porter
    stemmer and then the WordNet lemmatizer.
    """
    # normalize text and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # tokenize text
    tokens = word_tokenize(text)

    # Use a set for the stop words: membership tests are O(1), whereas the
    # list returned by NLTK would be scanned once per token.
    stop_words = set(stopwords.words("english"))
    words = [w for w in tokens if w not in stop_words]

    # Reduce words to their stems
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in words]

    # Reduce words to their root form
    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(w) for w in stemmed]

    return lemmed

### Using the Bernoulli Naive Bayes algorithm with custom parameters

In [4]:
# Text-classification pipeline: token counts -> TF-IDF weighting ->
# a Bernoulli naive Bayes classifier wrapped for multi-output targets.
# NOTE(review): BernoulliNB binarizes its input (default threshold 0.0), so the
# TF-IDF weighting and the `tfidf__use_idf` grid dimension may have no effect
# on this classifier -- confirm before paying for the extra fits.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=BernoulliNB())),
])

# Hyper-parameter values explored by the grid search.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__ngram_range=((1, 1), (1, 2)),
    vect__max_features=(None, 5000, 10000),
    tfidf__use_idf=(True, False),
)

cv = GridSearchCV(estimator=pipeline, param_grid=parameters)


In [18]:
def report_metrics(actual, predicted, categories):
    """Calculate per-category evaluation metrics for a multi-output model.

    Args:
    actual: array-like of shape (n_samples, n_categories). Actual labels.
    predicted: array-like of shape (n_samples, n_categories). Predicted labels.
    categories: list of strings. Name of each label column, in the same
    order as the columns of `actual` and `predicted`.

    Returns:
    metrics_df: dataframe. One row per category containing the accuracy,
    precision, recall and f1 score, each rounded to two decimals.
    """
    # Use the `categories` argument as given rather than reading the notebook's
    # global `y`, so the function works on any label set passed by the caller.
    # Coerce inputs so DataFrames and nested lists support `[:, i]` indexing.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    metrics = []

    # Calculate evaluation metrics for each set of labels
    for i in range(len(categories)):
        y_true = actual[:, i]
        y_hat = predicted[:, i]
        accuracy = round(accuracy_score(y_true, y_hat), 2)
        # zero_division=0 scores a category with no positive predictions or
        # labels as 0 instead of emitting a warning.
        precision = round(precision_score(y_true, y_hat, zero_division=0), 2)
        recall = round(recall_score(y_true, y_hat, zero_division=0), 2)
        f1 = round(f1_score(y_true, y_hat, zero_division=0), 2)
        metrics.append([accuracy, precision, recall, f1])

    # Create dataframe containing metrics; the reshape keeps the table
    # four columns wide even when `categories` is empty.
    metrics = np.array(metrics, dtype=float).reshape(-1, 4)
    metrics_df = pd.DataFrame(
        data=metrics,
        index=categories,
        columns=['accuracy', 'precision', 'recall', 'f1'],
    )

    return metrics_df

In [7]:
# Hold out 33% of the messages for evaluation (fixed seed so the split is
# repeatable), then run the grid search on the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
cv.fit(X_train, y_train)


GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x00000230A024FDC0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=BernoulliNB()))]),
             param_grid={'tfidf__use_idf': (True, False),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__max_features': (None, 5000, 10000),
                         'vect__ngram_range': ((1, 1), (1, 2))})

In [19]:
# Predict labels for the held-out messages with the fitted grid search.
y_pred = cv.predict(X_test)

# Print accuracy, precision, recall and f1 for each category on the test split.
print(report_metrics(y_test.to_numpy(), y_pred, categories))

                        accuracy  precision  recall    f1
related                     0.79       0.91    0.80  0.86
request                     0.88       0.62    0.73  0.67
offer                       0.99       0.00    0.00  0.00
aid_related                 0.76       0.75    0.64  0.69
medical_help                0.89       0.33    0.34  0.33
medical_products            0.93       0.36    0.36  0.36
search_and_rescue           0.96       0.16    0.12  0.14
security                    0.97       0.06    0.02  0.03
military                    0.95       0.31    0.33  0.32
water                       0.94       0.54    0.53  0.53
food                        0.92       0.65    0.65  0.65
shelter                     0.91       0.50    0.46  0.48
clothing                    0.98       0.27    0.14  0.19
money                       0.97       0.20    0.11  0.14
missing_people              0.98       0.09    0.04  0.06
refugees                    0.95       0.19    0.18  0.18
death         

The Bernoulli Naive Bayes classifier produced better scores than RandomForestClassifier(), so I chose it as the final model for this project.

### Finding the best estimator for Bernoulli Naive Bayes

In [21]:
# Display the pipeline configuration that the grid search selected as best.
cv.best_estimator_

Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.5, max_features=10000,
                                 ngram_range=(1, 2),
                                 tokenizer=<function tokenize at 0x00000230A024FDC0>)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator=BernoulliNB()))])

### 9. Export your model as a pickle file

In [23]:
import joblib 

# Persist the fitted GridSearchCV object (not only its best estimator) to disk.
# NOTE(review): the pipeline holds a reference to the notebook's `tokenize`
# function, and pickling stores functions by name -- presumably the code that
# loads this file must define or import `tokenize` first; confirm.
joblib.dump(cv, 'DS_model.pkl')

['DS_model.pkl']