# ML Pipeline Preparation

### 1. Import libraries and load data from database

In [1]:
# import libraries
import re
from sqlalchemy import create_engine

import pandas as pd
import numpy as np

In [2]:
import nltk
nltk.download(['punkt', 'wordnet','stopwords'])

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Heschmat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Heschmat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Heschmat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support as scores

In [4]:
# load data from database
engine = create_engine('sqlite:///DisasterResponseDB.db')
df = pd.read_sql_table('DisasterResponse', engine)

In [5]:
df.iloc[:3, :10]

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0


In [6]:
X = df['message']
y = df.iloc[:, 4:]

In [7]:
# Show some of the labels for the first 5 messages
## each message has 36 labels.
y.iloc[:5, :14]

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
labels = y.columns
labels

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [9]:
# `relaed` category has 3 values
## Here, "2" means "indirectly related"
y['related'].value_counts()

1    19906
0     6122
2      188
Name: related, dtype: int64

In [10]:
# For the requirement of this project, I'll convert the 2s as 1s.
y.loc[y['related'] == 2, 'related'] = 1
# Make sure it worked as expected
y['related'].value_counts()

1    20094
0     6122
Name: related, dtype: int64

In [11]:
# Convert the target values to a numpy array
y = y.values

In [12]:
# Check out first 5 messages
X.head().values

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name',
       'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
       'says: west side of Haiti, rest of the country today and tonight'],
      dtype=object)

### 2. Write a tokenization function to process your text data

In [13]:
def tokenize(msg):
    # Convert to lowercase, and remove non-alphanumeric characters
    msg = re.sub(r"[^a-zA-Z0-9]", " ", msg.lower())
    # Split the message to list of words
    words = word_tokenize(msg)
    
    # Remove stop words, as thye are non-informative in our case
    stop_words = stopwords.words('english')
    tokens = [word for word in words if word not in stop_words]
    
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Lemmatize - reduce words to their dictionary form
    ## and remove the white spaces - leading & trailing
    tokens_clean = [lemmatizer.lemmatize(token.strip()) for token in tokens]
    # Lemmatize the verbs as well
    tokens_clean = [lemmatizer.lemmatize(token, pos = 'v') for token in tokens_clean]
    
    return tokens_clean

### 3. Build a machine learning pipeline
This machine pipeline takes in the `message` column as input and outputs classification results on the other 36 categories in the dataset.

In [14]:
# Build the pipeline
pipe = Pipeline(steps = [
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(MultinomialNB()))
    ])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [15]:
# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 0)

In [16]:
# Train classifier
pipe.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x0000021FEBCA49D0>)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator=MultinomialNB()))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset.

In [17]:
# Get the predictions for test set
y_test_preds = pipe.predict(X_test)
y_test_preds.shape

(6554, 36)

In [18]:
print('true lables: ', y_test[0])
print('pred labels: ', y_test_preds[0])

true lables:  [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0]
pred labels:  [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [19]:
print('true lables: ', y_test[1])
print('pred labels: ', y_test_preds[1])

true lables:  [1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
pred labels:  [1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [20]:
for i, label in enumerate(labels):
    s = f'{label} {"=" * 55}'
    print(s[:55])
    y_true = y_test[:, i]
    y_pred = y_test_preds[:, i]
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.09      0.16      1509
           1       0.79      0.99      0.88      5045

    accuracy                           0.79      6554
   macro avg       0.79      0.54      0.52      6554
weighted avg       0.79      0.79      0.71      6554

              precision    recall  f1-score   support

           0       0.86      0.99      0.92      5419
           1       0.86      0.20      0.32      1135

    accuracy                           0.86      6554
   macro avg       0.86      0.60      0.62      6554
weighted avg       0.86      0.86      0.82      6554

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6531
           1       0.00      0.00      0.00        23

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      6554

              preci

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99      6364
           1       0.00      0.00      0.00       190

    accuracy                           0.97      6554
   macro avg       0.49      0.50      0.49      6554
weighted avg       0.94      0.97      0.96      6554

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6445
           1       0.00      0.00      0.00       109

    accuracy                           0.98      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.97      0.98      0.98      6554

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6355
           1       0.00      0.00      0.00       199

    accuracy                           0.97      6554
   macro avg       0.48      0.50      0.49      6554
weighted avg       0.94      0.97      0.95      6554

              preci

              precision    recall  f1-score   support

           0       0.83      0.99      0.90      5263
           1       0.78      0.14      0.24      1291

    accuracy                           0.82      6554
   macro avg       0.80      0.57      0.57      6554
weighted avg       0.82      0.82      0.77      6554



In [21]:
scores_all = np.zeros(5)
for ix, label in enumerate(labels):
    y_true = y_test[:, ix]
    y_pred = y_test_preds[:, ix]
    # true positive
    TP, FP, TN, FN = 0, 0, 0, 0
    for i in range(len(y_true)): 
        if y_true[i] == y_pred [i] == 1:
            TP += 1
        if y_true[i] == 1 and y_true[i] != y_pred[i]:
            FP += 1
        if y_true[i] == y_pred[i] == 0:
            TN += 1
        if y_true[i] == 0 and y_true[i] != y_pred[i]:
            FN += 1
    prec = TP / (TP + FP + .00001)
    recal = TP / (TP + FN + .00001)
    acc = (TP + TN) / (TP + TN + FP + FN)
    accuracy = (y_true == y_pred).mean().round(3)
    f1score = (2 * prec * recal) / (prec + recal + .00001)
    scores = np.array([prec, recal, f1score, acc, accuracy])
    scores_all = np.vstack((scores_all, scores))

In [22]:
dfscores = pd.DataFrame(scores_all).round(3).iloc[1:]
dfscores.columns = ['precision', 'recall', 'f1score', 'accuracy_v1', 'accuracy_v2']
dfscores.index = labels
dfscores

Unnamed: 0,precision,recall,f1score,accuracy_v1,accuracy_v2
related,0.993,0.785,0.877,0.785,0.785
request,0.197,0.862,0.321,0.856,0.856
offer,0.0,0.0,0.0,0.996,0.996
aid_related,0.614,0.743,0.673,0.75,0.75
medical_help,0.0,0.0,0.0,0.92,0.92
medical_products,0.003,0.5,0.006,0.95,0.95
search_and_rescue,0.0,0.0,0.0,0.971,0.971
security,0.0,0.0,0.0,0.983,0.983
military,0.0,0.0,0.0,0.97,0.97
child_alone,0.0,0.0,0.0,1.0,1.0


### 6. Improve your model
Use grid search to find better parameters. 

In [23]:
parameters = 

cv = 

SyntaxError: invalid syntax (<ipython-input-23-1e622dfcf28d>, line 1)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.   

### 8. Improve the model further

### 9. Export the final model as a pickle file