# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import pandas as pd
import numpy as np
import re
import pickle
import warnings

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer

from sqlalchemy import create_engine
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from imblearn.under_sampling import RandomUnderSampler

nltk.download(['punkt', 'stopwords'])
nltk.download('averaged_perceptron_tagger')
warnings.simplefilter('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hnbez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hnbez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# load data from database
engine = create_engine('sqlite:///db_tweet_disasters.db')
df = pd.read_sql('SELECT * FROM message_categories', engine)

### 2. Write a tokenization function to process your text data

In [3]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
stop_words = stopwords.words('english')
def tokenize2(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    # remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [4]:
def tokenize(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    tokens = word_tokenize(text)
    
    stemmer = PorterStemmer()
    stop_words = stopwords.words("english")
    
    stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    return stemmed_tokens

### Defining evaluation functions

In [5]:
def acc_pred_rec(y_test,y_pred):
    accuracy = (y_test == y_pred).mean()
    precision = precision_score(y_test, y_pred, average='weighted')
    recalls = recall_score(y_test, y_pred, average='weighted')
    f1_scores = f1_score(y_test, y_pred, average='weighted')
    print('the accuracy is ' + str(accuracy))
    print('the precision is ' + str(precision))
    print('the recall is ' + str(recalls))
    print('the f1_score is ' + str(f1_scores))

In [6]:
def metricss(test_labels, predicted_labels, col_names):
    metrics = []
    for i in range(len(col_names)):
        sum1s = sum(test_labels.iloc[:, i])
        sum0s = len(test_labels) - sum1s
        accuracy = accuracy_score(test_labels.iloc[:, i], predicted_labels.iloc[:, i])
        precision = precision_score(test_labels.iloc[:, i], predicted_labels.iloc[:, i])
        recall = recall_score(test_labels.iloc[:, i], predicted_labels.iloc[:, i])
        f1 = f1_score(test_labels.iloc[:, i], predicted_labels.iloc[:, i])
        
        metrics.append([sum1s, sum0s, accuracy, precision, recall, f1])
    
    col_names.append('mean')
    
    metrics = np.array(metrics)
    metrics_df = pd.DataFrame(data = metrics, columns = ['# 1s', '# 0s', 'Accuracy', 'Precision', 'Recall', 'F1'])
    metrics_df = metrics_df.append({'# 1s': sum(metrics[:,0]/len(metrics[:,0])), 
                                    '# 0s': sum(metrics[:,1]/len(metrics[:,1])), 
                                    'Accuracy': sum(metrics[:,2]/len(metrics[:,2])),
                                    'Precision': sum(metrics[:,3]/len(metrics[:,3])),
                                    'Recall': sum(metrics[:,4]/len(metrics[:,4])),
                                    'F1': sum(metrics[:,5]/len(metrics[:,5]))}, ignore_index=True)
    
    metrics_df.index = col_names
    
    return metrics_df

### Adding custom estimators

In [7]:
class StartVerbExtractor(BaseEstimator, TransformerMixin):


    def start_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if len(pos_tags) != 0:
                first_word, first_tag = pos_tags[0]
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    return 1
        return 0


    def fit(self, X, y=None):
        return self
    

    def transform(self, X):
        X_tag = pd.Series(X).apply(self.start_verb)
        return pd.DataFrame(X_tag)

In [8]:
def get_text_len(data):
    return np.array([len(text) for text in data]).reshape(-1, 1)

### Spliting into training and test datasets

In [9]:
X = df['message']
y = df.drop(['message','related_alone','offer','request'],axis=1)
y_cols = y.columns.tolist()

# Here we separate into stratified training and test datasets
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

for train_index, test_index in msss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]

In [10]:
y_train = pd.DataFrame(y_train,columns=y_cols)
y_test = pd.DataFrame(y_test,columns=y_cols)

In [11]:
y_train

Unnamed: 0,related,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19516,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19517,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19518,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
19519,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Training the model with adaboost with StartVerbExtractor

It is a little worst than adaboost with StartVebExtractor and text_len, so I will use that for gridsearch

In [12]:
def Adaboost_pipeline():
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('start_verb', StartVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
    ])

    return pipeline

In [13]:
#model_ada = Adaboost_pipeline()

In [14]:
#np.random.seed(42)
#model_ada.fit(X_train, y_train)

In [15]:
#y_pred_ada = model_ada.predict(X_test)
#y_pred_ada = pd.DataFrame(y_pred_ada, columns=y_test.columns.tolist())

In [16]:
#metricss(y_test, y_pred_ada, y_test.columns.tolist())

In [17]:
#test_msg = ['There is a fire in building nearby and there are people in it.']
#test = model_ada.predict(test_msg)
#print(y_train.columns.values[(test.flatten()==1)])

## Training the model with adaboost, StartVerbExtractor and text_len

In [18]:
def Adaboost_len_pipeline():
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('start_verb', StartVerbExtractor()),
            ('length', Pipeline([('text_length', FunctionTransformer(get_text_len, validate=False))]))
        ])),

        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
    ])

    return pipeline

In [19]:
model_ada_len = Adaboost_len_pipeline()

In [20]:
np.random.seed(42)
model_ada_len.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('text_pipeline',
                                                 Pipeline(memory=None,
                                                          steps=[('vect',
                                                                  CountVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.int64'>,
                                                                                  encoding='utf-8',
                                                                                  input='content',
                                                                                  low

In [21]:
y_pred_ada_len = model_ada_len.predict(X_test)
y_pred_ada_len = pd.DataFrame(y_pred_ada_len, columns=y_test.columns.tolist())

In [22]:
metricss(y_test, y_pred_ada_len, y_test.columns.tolist())

Unnamed: 0,# 1s,# 0s,Accuracy,Precision,Recall,F1
related,4976.0,1531.0,0.806516,0.846929,0.911777,0.878157
aid_related,2715.0,3792.0,0.754572,0.749109,0.619153,0.677959
medical_help,521.0,5986.0,0.92608,0.576923,0.287908,0.384123
medical_products,328.0,6179.0,0.95574,0.619048,0.317073,0.419355
search_and_rescue,181.0,6326.0,0.974489,0.659574,0.171271,0.27193
security,118.0,6389.0,0.980329,0.25,0.042373,0.072464
military,215.0,6292.0,0.970801,0.606838,0.330233,0.427711
water,418.0,6089.0,0.960197,0.710875,0.641148,0.674214
food,731.0,5776.0,0.94406,0.796446,0.674419,0.73037
shelter,578.0,5929.0,0.945136,0.776942,0.536332,0.634596


In [23]:
model_ada_len.get_params()

{'memory': None, 'steps': [('features', FeatureUnion(n_jobs=None,
                transformer_list=[('text_pipeline',
                                   Pipeline(memory=None,
                                            steps=[('vect',
                                                    CountVectorizer(analyzer='word',
                                                                    binary=False,
                                                                    decode_error='strict',
                                                                    dtype=<class 'numpy.int64'>,
                                                                    encoding='utf-8',
                                                                    input='content',
                                                                    lowercase=True,
                                                                    max_df=1.0,
                                                                    max_fea

## Training model with LogisticRegression

Training showed that LogisticRegression is worst than Adaboost and RandomForest

In [24]:
def Logistic_pipeline():
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('start_verb', StartVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(LogisticRegression()))
    ])

    return pipeline

In [25]:
#model_log = Logistic_pipeline()

In [26]:
#np.random.seed(42)
#model_log.fit(X_train, y_train)

In [27]:
#y_pred_log = model_log.predict(X_test)
#y_pred_log = pd.DataFrame(y_pred_log, columns=y_test.columns.tolist())

In [28]:
#metricss(y_test, y_pred_log, y_test.columns.tolist())

In [29]:
#test_msg = ['There is a fire in building nearby and there are people in it.']
#test = model_log.predict(test_msg)
#print(y_train.columns.values[(test.flatten()==1)])

## Training the model with randomforest and start verb Extractor

The recall is worst than adaboost, and since it is a disaster response, it is better to have a higher recall score

In [30]:
def simple_pipeline():
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('start_verb', StartVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
    return pipeline

In [31]:
#model_stem = simple_pipeline()

In [32]:
#np.random.seed(42)
#model_stem.fit(X_train,y_train)

In [33]:
#y_pred_stem = model_stem.predict(X_test)
#y_pred_stem = pd.DataFrame(y_pred_stem, columns=y_test.columns.tolist())

In [34]:
#metricss(y_test, y_pred_stem, y_test.columns.tolist())

## Training the model with randomforest and start verb Extractor AND text_len

Training showed that LogisticRegression is worst than just with the startverbExtractor

In [35]:
def simple_pipeline2():
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('start_verb', StartVerbExtractor()),
            ('length', Pipeline([('text_length', FunctionTransformer(get_text_len, validate=False))]))
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
    return pipeline

In [36]:
#model_stem_len = simple_pipeline2()

In [37]:
#np.random.seed(42)
#model_stem_len.fit(X_train,y_train)

In [38]:
#y_pred_stem_len = model_stem_len.predict(X_test)
#y_pred_stem_len = pd.DataFrame(y_pred_stem_len, columns=y_test.columns.tolist())

In [39]:
#metricss(y_test, y_pred_stem_len, y_test.columns.tolist())

## Training the model with randomforest and text_len

Training showed that LogisticRegression is worst than just with the startverbExtractor

In [40]:
def simple_pipeline3():
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('length', Pipeline([('text_length', FunctionTransformer(get_text_len, validate=False))]))
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
    return pipeline

In [41]:
#model_len = simple_pipeline3()

In [42]:
#np.random.seed(42)
#model_len.fit(X_train,y_train)

In [43]:
#y_pred_len = model_len.predict(X_test)
#y_pred_len = pd.DataFrame(y_pred_len, columns=y_test.columns.tolist())

In [44]:
#metricss(y_test, y_pred_len, y_test.columns.tolist())

## Improving the model

Since in the case of a disaster is worst to have a false negative than a false positive, I will make a scorer that will try to improve more Recall than Precision

In [45]:
def recall_metric(y_true, y_pred):
    """Calculate mean Recall 
    
    Args:
    y_true: array. Array containing actual labels.
    y_pred: array. Array containing predicted labels.
        
    Returns:
    score: float. Median F1 score for all of the output classifiers
    """
    score_list = []
    for i in range(np.shape(y_pred)[1]):
        score_i = (recall_score(np.array(y_true)[:, i], y_pred[:, i]) * 3 + 
                   precision_score(np.array(y_true)[:, i], y_pred[:, i])) / 4
        score_list.append(score_i)
        
    score = np.mean(score_list)
    return score

In [46]:
parameters = {'features__text_pipeline__vect__ngram_range':[(1,2),(2,2)],
            'clf__estimator__n_estimators':[50, 100, 300]
             }

In [47]:
pipeline_adaboost_len = Adaboost_len_pipeline()

In [48]:
scorer = make_scorer(recall_metric)
np.random.seed(42)
cv = GridSearchCV(pipeline_adaboost_len, param_grid=parameters, scoring=scorer, n_jobs=5, verbose=10)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:  5.3min
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  8.9min
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed: 18.4min
[Parallel(n_jobs=5)]: Done  25 out of  30 | elapsed: 49.5min remaining:  9.9min
[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed: 63.8min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('text_pipeline',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('vect',
                                                                                         CountVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                                                                         decode_error='strict',
                                                                                                         dtype=<class 'numpy.int64'>,
               

In [49]:
y_cv_pred = cv.predict(X_test)
y_cv_pred = pd.DataFrame(y_cv_pred, columns=y_test.columns.tolist())

In [50]:
metricss(y_test, y_cv_pred, y_test.columns.tolist())

Unnamed: 0,# 1s,# 0s,Accuracy,Precision,Recall,F1
related,4976.0,1531.0,0.818196,0.87296,0.892082,0.882417
aid_related,2715.0,3792.0,0.759336,0.725383,0.681031,0.702508
medical_help,521.0,5986.0,0.918396,0.48731,0.368522,0.419672
medical_products,328.0,6179.0,0.94698,0.470588,0.414634,0.440843
search_and_rescue,181.0,6326.0,0.965268,0.335766,0.254144,0.289308
security,118.0,6389.0,0.970186,0.12,0.101695,0.110092
military,215.0,6292.0,0.963578,0.447115,0.432558,0.439716
water,418.0,6089.0,0.955894,0.646532,0.691388,0.668208
food,731.0,5776.0,0.93945,0.731139,0.729138,0.730137
shelter,578.0,5929.0,0.933917,0.635531,0.600346,0.617438


In [53]:
cv.cv_results_

{'mean_fit_time': array([ 286.11497765,  214.34299169,  500.95988688,  355.87668743,
        1362.29578128,  907.69777088]),
 'std_fit_time': array([ 18.27465811,  21.81364108,  35.58426586,  37.43845512,
        108.89083467,  95.4036249 ]),
 'mean_score_time': array([22.25593967, 20.91288867, 25.14417477, 23.73936653, 40.48160877,
        28.26198945]),
 'std_score_time': array([3.8203817 , 1.73477563, 3.53129051, 0.8291392 , 4.97029861,
        5.44547765]),
 'param_clf__estimator__n_estimators': masked_array(data=[50, 50, 100, 100, 300, 300],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_features__text_pipeline__vect__ngram_range': masked_array(data=[(1, 2), (2, 2), (1, 2), (2, 2), (1, 2), (2, 2)],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__estimator__n_estimators': 50,
   'features__text_pipeline__vect__ngram_range'

In [57]:
cv.best_params_

{'clf__estimator__n_estimators': 300,
 'features__text_pipeline__vect__ngram_range': (1, 2)}

The test message below, is an arbitrary test, just to check if it gets the mean point here: some place is on Fire, therefore the firefighters should be called. 

In [58]:
test_msg = ['There is a fire in building nearby and there are people in it.']
test = cv.predict(test_msg)
print(y_train.columns.values[(test.flatten()==1)])

['related' 'fire']


It is clear that the 'fire' feature was predicted, so with a simple script the firefighters could be called. It worked!

The best resul is for:

- AdaboosClassifier

- Using StartVerbExtractor

- Using text_len

- with vect_ngram_range (1,2)

- with n_estimators 300.

### 9. Export your model as a pickle file

In [51]:
# Pickle best model
pickle.dump(cv, open('disaster_model_hnbez.sav', 'wb'))