# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import pandas as pd
import numpy as np
import sqlite3
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report , accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import pickle
pd.set_option('display.max_columns', 500)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eppmi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eppmi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data from database

conn = sqlite3.connect(r'E:\Dropbox\Pessoal\Python\Udacity\Disaster-Response-Pipelines\databases\DisasterResponse.db')


df = pd.read_sql('SELECT * FROM disaster', con = conn)


df.describe()

Unnamed: 0,id,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
count,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0,26180.0
mean,15227.563942,1.0,0.170512,0.004507,0.414095,0.079488,0.050076,0.027655,0.017991,0.032811,0.0,0.063751,0.111421,0.088159,0.015432,0.023033,0.011383,0.033384,0.045531,0.131436,0.065126,0.045798,0.05084,0.020321,0.006073,0.01081,0.004584,0.011803,0.043965,0.278304,0.082086,0.093201,0.010772,0.093659,0.020168,0.052559,0.19343
std,8827.269301,0.0,0.376089,0.066986,0.492574,0.270504,0.218107,0.163985,0.13292,0.178146,0.0,0.244313,0.314659,0.283531,0.123264,0.150011,0.106083,0.179641,0.20847,0.337883,0.246753,0.209051,0.219676,0.141098,0.077696,0.103409,0.067549,0.108,0.205021,0.448172,0.2745,0.290719,0.103228,0.29136,0.140578,0.223156,0.394995
min,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7449.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15665.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,22927.25,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,30265.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
#Remove child alone as it has all zeros only
df = df.drop(['child_alone'],axis=1)

In [4]:
# Thr related column has more than a 2 categories
df.related.value_counts()

1    26180
Name: related, dtype: int64

In [5]:
#Dropping the related rows with value  = 2
df = df[df['related'] != 2]

In [6]:

X = df['message']
Y = df.loc[:,'related':'direct_report']

### 2. Write a tokenization function to process your text data

In [7]:
def tokenize(text):
    tokens = word_tokenize(re.sub(r"[^a-zA-Z0-9]", " ", text.lower()).replace("  ",""))
    words = [w for w in tokens if w not in stopwords.words("english")]
    

    # Reduce words to their stems
    #stemmed = [PorterStemmer().stem(w) for w in words]
    
    # Reduce words to their stems
    
    lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
    
    return lemmed 

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [8]:
pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer())
            ]))
            
        ])),

        ('clf', MultiOutputClassifier(KNeighborsClassifier(n_jobs=-1)))
    ])




### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)


In [10]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer()),
                                                                 ('tfidf',
                                                                  TfidfTransformer())]))])),
                ('clf',
                 MultiOutputClassifier(estimator=KNeighborsClassifier(n_jobs=-1)))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [11]:
y_pred = pipeline.predict(X_test)

In [12]:
for n, col in enumerate(y_test.columns):
    print(col)
    print(classification_report(y_test[col], y_pred[:,n]))

related
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      6545

    accuracy                           1.00      6545
   macro avg       1.00      1.00      1.00      6545
weighted avg       1.00      1.00      1.00      6545

request
              precision    recall  f1-score   support

           0       0.84      1.00      0.91      5440
           1       0.84      0.07      0.13      1105

    accuracy                           0.84      6545
   macro avg       0.84      0.53      0.52      6545
weighted avg       0.84      0.84      0.78      6545

offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6512
           1       0.00      0.00      0.00        33

    accuracy                           0.99      6545
   macro avg       0.50      0.50      0.50      6545
weighted avg       0.99      0.99      0.99      6545

aid_related
              precision    recall  f1-s

  _warn_prf(average, modifier, msg_start, len(result))



clothing
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6454
           1       0.00      0.00      0.00        91

    accuracy                           0.99      6545
   macro avg       0.49      0.50      0.50      6545
weighted avg       0.97      0.99      0.98      6545

money
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      6373
           1       1.00      0.01      0.02       172

    accuracy                           0.97      6545
   macro avg       0.99      0.51      0.50      6545
weighted avg       0.97      0.97      0.96      6545

missing_people
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6468
           1       0.00      0.00      0.00        77

    accuracy                           0.99      6545
   macro avg       0.49      0.50      0.50      6545
weighted avg       0.98      0.99      0.98

In [13]:
# inspired by https://github.com/dabln/dsnd-DisasterResponsePipeline/blob/master/notebooks_code_development%20/ML%20Pipeline.ipynb
def report(y_test,y_pred, average = 'weighted', score=f1_score):
    report = {}
    for n, col in enumerate(y_test.columns):
        output = classification_report(y_test[col], y_pred[:,n], output_dict=True)
        report[col] = {}
        for i in output:   
            if i == 'accuracy':
                break
            report[col]['f1_' + i] = output[i]['f1-score']
            report[col]['precision_' + i] = output[i]['precision']
            report[col]['recall_' + i] = output[i]['recall']

    report_df = pd.DataFrame(report).transpose()
    report_df = report_df[report_df.columns.sort_values()]
    report_df_mean = report_df.mean()
    
    print("Table for each column:")
    print (report_df)
    print('\n')
    print('mean of results:')
    print(report_df_mean)
    print('\n')
    print('{} ({}): {}'.format(score, average, score(y_test, y_pred, average=average)))
    

In [14]:
report(y_test,y_pred)

Table for each column:
                            f1_0      f1_1  precision_0  precision_1  \
related                      NaN  1.000000          NaN     1.000000   
request                 0.912239  0.125628     0.840458     0.842697   
offer                   0.997473  0.000000     0.994958     0.000000   
aid_related             0.741275  0.062853     0.590648     0.824074   
medical_help            0.957879  0.003766     0.919163     1.000000   
medical_products        0.975082  0.030488     0.951522     0.833333   
search_and_rescue       0.985271  0.000000     0.970970     0.000000   
security                0.991448  0.000000     0.983040     0.000000   
military                0.982511  0.000000     0.965623     0.000000   
water                   0.967727  0.089286     0.938190     0.800000   
food                    0.943957  0.113111     0.894964     0.846154   
shelter                 0.953488  0.064516     0.912370     0.689655   
clothing                0.992999  0.00000

## 6 - 9. Improve your model , test your model, try improving your model further, Export your model as a pickle file. 

### I joined the parts 6 through 9. For this i made a class called Pipeline_scorer that can do the training and predict of the data and then give the requested results.
### For time purpouses i will keep the models simple.  

In [11]:
from Pipeline_scorer import Pipeline_Scorer

In [11]:
metrics       = ['minkowski','euclidean','manhattan'] 
weights       = ['uniform','distance'] #10.0**np.arange(-5,4)
numNeighbors  = np.arange(5,10)
parameters_grid    = dict(clf__estimator__metric=metrics,clf__estimator__weights=weights,clf__estimator__n_neighbors=numNeighbors)
pipe1 = Pipeline_Scorer(parameters_grid=parameters_grid,classifier=KNeighborsClassifier())

In [12]:
pipe1.pipeline(X_train, y_train, X_test,y_test,jobs=-1)

{'clf__estimator__metric': 'minkowski', 'clf__estimator__n_neighbors': 9, 'clf__estimator__weights': 'distance'}


In [13]:
pipe1.report()

  _warn_prf(average, modifier, msg_start, len(result))


Table for each column:
                            f1_0      f1_1  precision_0  precision_1  \
related                 0.138075  0.850372     0.340206     0.770864   
request                 0.896528  0.189891     0.841881     0.386111   
offer                   0.997829  0.000000     0.995668     0.000000   
aid_related             0.677293  0.284292     0.585598     0.435545   
medical_help            0.957497  0.000000     0.918601     0.000000   
medical_products        0.975347  0.000000     0.952175     0.000000   
search_and_rescue       0.987625  0.000000     0.975553     0.000000   
security                0.990787  0.000000     0.981894     0.000000   
military                0.981964  0.000000     0.964568     0.000000   
water                   0.968725  0.000000     0.939929     0.000000   
food                    0.939797  0.000000     0.887117     0.000000   
shelter                 0.949533  0.000000     0.904334     0.000000   
clothing                0.992046  0.00000

In [15]:

pickle.dump(pipe1, open('cv_KNeighborsClassifier', "wb"))

In [18]:
learning_rate = [0.01,0.1] 
n_estimators = [50] #10.0**np.arange(-5,4)
criterion = ['friedman_mse', 'mse']
parameters_grid    = dict(clf__estimator__learning_rate=learning_rate,
                          clf__estimator__n_estimators=n_estimators,clf__estimator__criterion=criterion)
pipe2 = Pipeline_Scorer(parameters_grid=parameters_grid,classifier=GradientBoostingClassifier())

In [19]:
pipe2.pipeline(X_train, y_train, X_test,y_test,jobs=4)

{'clf__estimator__criterion': 'friedman_mse', 'clf__estimator__learning_rate': 0.1, 'clf__estimator__n_estimators': 50}


In [20]:
pipe2.report()

  _warn_prf(average, modifier, msg_start, len(result))


Table for each column:
                            f1_0      f1_1  precision_0  precision_1  \
related                 0.005358  0.870113     0.400000     0.770804   
request                 0.903380  0.064463     0.830248     0.438202   
offer                   0.995101  0.030769     0.994560     0.034483   
aid_related             0.735367  0.040787     0.587169     0.478992   
medical_help            0.957833  0.000000     0.919078     0.000000   
medical_products        0.971597  0.000000     0.944762     0.000000   
search_and_rescue       0.979392  0.000000     0.968608     0.000000   
security                0.987381  0.035928     0.983604     0.050847   
military                0.982362  0.008850     0.968934     0.040000   
water                   0.966659  0.004773     0.935614     0.500000   
food                    0.938132  0.002649     0.883746     0.333333   
shelter                 0.951067  0.000000     0.906700     0.000000   
clothing                0.986748  0.02312

In [21]:
pickle.dump(pipe2, open('cv_GradientBoostingClassifier.pkl', "wb"))

In [22]:
max_features =["sqrt", "log2" ]
n_estimators = [50] 
criterion = ['gine', 'entropy']
parameters_grid    = dict(clf__estimator__max_features=max_features,
                          clf__estimator__n_estimators=n_estimators,clf__estimator__criterion=criterion)
pipe3 = Pipeline_Scorer(parameters_grid=parameters_grid,classifier=RandomForestClassifier())

In [23]:
pipe3.pipeline(X_train, y_train, X_test,y_test,jobs=4)

{'clf__estimator__criterion': 'entropy', 'clf__estimator__max_features': 'sqrt', 'clf__estimator__n_estimators': 50}


In [24]:
pipe3.report()

  _warn_prf(average, modifier, msg_start, len(result))


Table for each column:
                            f1_0      f1_1  precision_0  precision_1  \
related                 0.075077  0.867003     0.429577     0.775036   
request                 0.905837  0.072208     0.831378     0.614286   
offer                   0.997207  0.000000     0.994430     0.000000   
aid_related             0.704567  0.240817     0.594056     0.461376   
medical_help            0.957581  0.000000     0.919040     0.000000   
medical_products        0.971597  0.000000     0.944762     0.000000   
search_and_rescue       0.984124  0.000000     0.968895     0.000000   
security                0.991574  0.000000     0.983289     0.000000   
military                0.984122  0.009804     0.969040     0.333333   
water                   0.966664  0.000000     0.935479     0.000000   
food                    0.938188  0.010526     0.884121     0.500000   
shelter                 0.951059  0.003306     0.906826     0.500000   
clothing                0.991023  0.00000

In [25]:
pickle.dump(pipe3, open('cv_RandomForestClassifier.pkl', "wb"))

In [12]:
learning_rate =[0.5,1 ]
n_estimators = [50,100] 
algorithm = ['SAMME', 'SAMME.R']
parameters_grid    = dict(clf__estimator__learning_rate= learning_rate,
                          clf__estimator__n_estimators=n_estimators,clf__estimator__algorithm=algorithm)
pipe4 = Pipeline_Scorer(parameters_grid=parameters_grid,classifier=AdaBoostClassifier())

In [13]:
pipe4.pipeline(X_train, y_train, X_test,y_test,jobs=-1)

{'clf__estimator__algorithm': 'SAMME.R', 'clf__estimator__learning_rate': 1, 'clf__estimator__n_estimators': 50}


In [14]:
pipe4.pipeline(X_train, y_train, X_test,y_test,jobs=4)

{'clf__estimator__algorithm': 'SAMME.R', 'clf__estimator__learning_rate': 1, 'clf__estimator__n_estimators': 50}


In [15]:
pipe4.report()

Table for each column:
                            f1_0      f1_1  precision_0  precision_1  \
related                 0.081194  0.863212     0.400000     0.771651   
request                 0.900862  0.207376     0.845450     0.435673   
offer                   0.997674  0.000000     0.995512     0.000000   
aid_related             0.716086  0.205173     0.596780     0.465955   
medical_help            0.958330  0.003854     0.921563     0.083333   
medical_products        0.974615  0.000000     0.951223     0.000000   
search_and_rescue       0.986750  0.011696     0.974299     0.250000   
security                0.991102  0.000000     0.982513     0.000000   
military                0.981643  0.000000     0.966341     0.000000   
water                   0.967242  0.000000     0.937432     0.000000   
food                    0.940202  0.008163     0.890875     0.100000   
shelter                 0.952751  0.006814     0.912166     0.105263   
clothing                0.991727  0.07017

In [16]:
pickle.dump(pipe4, open('cv_Adabosst.pkl', "wb"))

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [8]:
 pipeline = Pipeline([
            ('features', FeatureUnion([

                ('text_pipeline', Pipeline([
                    ('vect', CountVectorizer()),
                    ('transformer', TfidfTransformer())
                ]))
            ])),
            ('clf', MultiOutputClassifier(AdaBoostClassifier(algorithm = 'SAMME.R', learning_rate = 1, n_estimators = 50)))
        ])

In [18]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer()),
                                                                 ('tfidf',
                                                                  TfidfTransformer())]))])),
                ('clf',
                 MultiOutputClassifier(estimator=KNeighborsClassifier(n_jobs=-1)))])

In [20]:
pickle.dump(pipeline, open('cv_Adabosst.pkl', "wb"))