# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download(['punkt','stopwords', 'wordnet'])
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
import pickle
import warnings

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data from database
def load_data():
    engine = create_engine('sqlite:///category.db')
    df = pd.read_sql_table('category', engine)
    df['related'] = df['related'].apply(lambda x: 0 if x == 0 else 1)
    X = df['message'].values
    y = df.drop(df.columns[:4], axis =1).values
    categories = df.drop(df.columns[:4], axis =1).columns
    return X, y, categories

In [3]:
X, y, categories = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y)

### 2. Write a tokenization function to process the text data

In [4]:
def tokenize(text):
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize andremove stop words
    tokens = [lemmatizer.lemmatize(word).strip() for word in tokens if word not in stop_words]

    return tokens

### 3. Build a machine learning pipeline

In [5]:
pipeline = Pipeline([
    ('text_pipeline', Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer())
    ])),

    ('clf', MultiOutputClassifier(estimator = MultinomialNB()))
])

In [6]:
pipeline.get_params()

{'memory': None, 'steps': [('text_pipeline', Pipeline(memory=None,
        steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip...y=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))])),
  ('clf',
   MultiOutputClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
              n_jobs=1))], 'text_pipeline': Pipeline(memory=None,
      steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip...y=None)), ('tfidf', Tf

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [7]:
warnings.filterwarnings("ignore")

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('text_pipeline', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1...ssifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
           n_jobs=1))])

### 5. Test the model

In [8]:
def display_results(cv, y_test, y_pred):

    for i in range(len(categories)):
         print("Classification Report for {}:".format(categories[i]), '\n', classification_report(y_test[:, i], y_pred[:, i]))       

In [9]:
warnings.filterwarnings("ignore")

y_pred = pipeline.predict(X_test)

display_results(pipeline, y_test, y_pred)

Classification Report for related: 
              precision    recall  f1-score   support

          0       0.79      0.09      0.17      1543
          1       0.78      0.99      0.87      5011

avg / total       0.78      0.78      0.71      6554

Classification Report for request: 
              precision    recall  f1-score   support

          0       0.87      0.99      0.92      5456
          1       0.88      0.23      0.37      1098

avg / total       0.87      0.87      0.83      6554

Classification Report for offer: 
              precision    recall  f1-score   support

          0       1.00      1.00      1.00      6526
          1       0.00      0.00      0.00        28

avg / total       0.99      1.00      0.99      6554

Classification Report for aid_related: 
              precision    recall  f1-score   support

          0       0.76      0.84      0.80      3844
          1       0.74      0.63      0.68      2710

avg / total       0.75      0.75      0.75  

### 6. Improve the model

In [10]:
def build_model():
    pipeline = Pipeline([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),

        ('clf', MultiOutputClassifier(estimator = MultinomialNB()))
    ])

    parameters = {'text_pipeline__vect__ngram_range': [(1, 1), (1, 2)],
                  'text_pipeline__tfidf__use_idf': (True, False),
                  'clf__estimator__alpha': [1, 1e-1, 1e-2]}

    cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1_micro')

    return cv

### 7. Test the model

In [11]:
def main():
    X, y, categories = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = build_model()

    model.fit(X_train, y_train)
    print(model.best_estimator_)
    print(model.best_params_)
    y_pred = model.predict(X_test)

    display_results(model, y_test, y_pred)


In [12]:
warnings.filterwarnings("ignore")

X, y, categories = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y)

model = build_model()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

display_results(model, y_test, y_pred)

Classification Report for related: 
              precision    recall  f1-score   support

          0       0.68      0.45      0.55      1515
          1       0.85      0.94      0.89      5039

avg / total       0.81      0.82      0.81      6554

Classification Report for request: 
              precision    recall  f1-score   support

          0       0.93      0.94      0.93      5416
          1       0.68      0.65      0.67      1138

avg / total       0.89      0.89      0.89      6554

Classification Report for offer: 
              precision    recall  f1-score   support

          0       0.99      1.00      1.00      6514
          1       0.00      0.00      0.00        40

avg / total       0.99      0.99      0.99      6554

Classification Report for aid_related: 
              precision    recall  f1-score   support

          0       0.79      0.75      0.77      3807
          1       0.67      0.72      0.70      2747

avg / total       0.74      0.74      0.74  

In [13]:
print("Best Model:" ,model.best_estimator_)
print("Best Params:", model.best_params_)

Best Model: Pipeline(memory=None,
     steps=[('text_pipeline', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2...sifier(estimator=MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True),
           n_jobs=1))])
Best Params: {'clf__estimator__alpha': 0.01, 'text_pipeline__tfidf__use_idf': False, 'text_pipeline__vect__ngram_range': (1, 2)}


### 8. Export the model as a pickle file

In [14]:
filename = 'classifier.sav'
pickle.dump(model, open(filename, 'wb'))

# loaded_model = pickle.load(open(filename, 'rb'))