# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [19]:
# import libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import nltk
nltk.download(['punkt', 'wordnet','averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re


from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV


from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifierCV
#from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.tree import ExtraTreeClassifier  

[nltk_data] Downloading package punkt to /home/iris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/iris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/iris/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
# load data from database
engine = create_engine('sqlite:///disaster_clean.db')
df = pd.read_sql_table('disaster_clean',con=engine)
######REMEMBER TO REMOVE NEXT LINE
df = df.sample(frac=0.05).reset_index(drop=True)

#  define feature and target variables X and Y

X = df['message'].values 
y = df[df.columns[4:]]
category_names = y.columns.tolist()

### 2. A tokenization function that processes text data

In [21]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [22]:
def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. Y

In [23]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [24]:
def make_pipeline(model):
    
    pipeline = Pipeline([
                     ('features', FeatureUnion([
                                                ('text_pipeline', Pipeline([
                                                                    ('vect', CountVectorizer(tokenizer=tokenize)),
                                                                    ('tfidf', TfidfTransformer())
                                                ])),

                    ('starting_verb', StartingVerbExtractor())
                    ])),
    
                    ('clf', model)
                    ])
    return pipeline   

We would like to minize the false negatives as this would mean that the messages are identified as irrelevant in cases they are in reality relevant. Thus we would like to maximize the $recall=\frac{tp}{tp+fn}$, thus the recall score is more relevant. 

In [25]:
# One category in this project has only one label that 1 , i guess that is child_alone , hence the error.
#SVC does not support that.

models = [     
    #      MultiOutputClassifier(RidgeClassifierCV()), #recall_micro=0.288 f1_micro=0.207 accuracy=0.311
   #       MultiOutputClassifier(MLPClassifier()),     #recall_micro=0.280f1_micro=0.391 accuracy=0.257
          MultiOutputClassifier(RandomForestClassifier()), # recall_micro=0.143 f1_micro=0.297 accuracy=0.344
          MultiOutputClassifier(KNeighborsClassifier()), # recall_micro=0.288 f1_micro=0.407 accuracy=0.268
          MultiOutputClassifier(DecisionTreeClassifier())]#, # recall_micro = 0.426 f1_micro=0.447 accuracy=0.154
   #       MultiOutputClassifier(ExtraTreeClassifier()), #recall_micro=0.270 f1_micro=0.303, accuracy=0.107
   #       MultiOutputClassifier(ExtraTreesClassifier())] #recall_micor=0.207 f1_micro=0.331, 
      


#sklearn.tree.ExtraTreeClassifier
#sklearn.ensemble.ExtraTreesClassifier



# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =7, train_size=0.8)

def test_algorithms(pipeline):
    kfold = KFold(n_splits=2)
    for score in ['accuracy','f1_micro', 'recall_micro']:
        print(score)
        predicted = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring=score)
        print(predicted.mean())

### 5. Test model
Report the f1 score, precision and recall for each output category of the dataset. 

In [26]:
#Hyperparameter tuning

def grid_search(pipeline):
    
    parameters = {
  #      'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
  #      'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
  #      'features__text_pipeline__vect__max_features': (None, 5000, 10000),
  #      'features__text_pipeline__tfidf__use_idf': (True, False),
        'features__transformer_weights': (
            {'text_pipeline': 1, 'starting_verb': 0.5},
            {'text_pipeline': 0.5, 'starting_verb': 1},
            {'text_pipeline': 0.8, 'starting_verb': 1},
        )
    }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters, scoring='recall_micro')
    
    return cv

In [27]:
for model in models:
    print(model)
    pipeline = make_pipeline(model)
    test_algorithms(pipeline)
    cv = grid_search(pipeline)
    print('Training model...')
    cv.fit(X_train, y_train)
    print('Predict on test data..')
    y_pred = cv.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=category_names))

MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                        precision    recall  f1-score   support

               request       0.88      0.28      0.42        50
                 offer       0.00      0.00      0.00         4
           aid_related       0.78      0.57      0.66       113
          medical_help       0.00      0.00      0.00        27
      medical_products       0.00      0.00      0.00        15
     search_and_rescue       0.00      0.00      0.00         7
              security       0.00      0.00      0.00         5
              military       0.00      0.00      0.00        11
           child_alone       0.00      0.00      0.00         0
                 water       1.00      0.06      0.11        17
                  food       0.89      0.30      0.44        27
               shelter       1.00      0.03      0.07        29
              clothing       0.00      0.00      0.00         5
                 money       0.00      0.00      0.00        11
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                        precision    recall  f1-score   support

               request       0.67      0.60      0.63        50
                 offer       0.00      0.00      0.00         4
           aid_related       0.57      0.81      0.67       113
          medical_help       0.20      0.44      0.27        27
      medical_products       0.09      0.33      0.15        15
     search_and_rescue       0.00      0.00      0.00         7
              security       0.00      0.00      0.00         5
              military       0.00      0.00      0.00        11
           child_alone       0.00      0.00      0.00         0
                 water       0.17      0.53      0.26        17
                  food       0.19      0.56      0.29        27
               shelter       0.16      0.45      0.24        29
              clothing       0.00      0.00      0.00         5
                 money       0.00      0.00      0.00        11
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
