## ML Pipeline Preparation

### Main blocks of code
* Importing libraries and loading data from database
* Writing a tokenization function to process text data
* Building a machine learning Pipeline
* Improving model with grid search and testing new model
* Exporting the model as a pickle file



______________________________

### Importing libraries and loading data from database

In [None]:
# importing libraries
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sqlalchemy import create_engine

# fetch the NLTK resources (tokenizer models, WordNet, stop-word lists)
nltk.download(['punkt', 'wordnet', 'stopwords'])

In [None]:
# connect to database
#engine = create_engine('sqlite:///disaster_responses.db', echo=True)
# table named disaster_responses will be returned as a dataframe
#df = pd.read_sql_table('disaster_responses', con=engine)
#print(df.head())

In [None]:
# load the dataset from a local CSV file into a dataframe
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\BernadettKepenyes\Documents\GitHub\disaster-response-pipelines\data\disaster_responses.csv"
)

In [None]:
# X: the message text column (model input)
# y: every column from position 4 onwards (classification labels)
# NOTE(review): assumes the first four columns are 'id', 'message', 'original'
# and 'genre', i.e. the same result as
# df.drop(['id', 'message', 'original', 'genre'], axis = 1) - confirm.
X, y = df['message'], df.iloc[:, 4:]

_____________________

### Writing a tokenization function to process text data

In [None]:
# tokenization function to process text data
def tokenize(text):
    '''
    function: returning the root form of the words of messages
    input: message text(str)
    output: cleaned list of words of messages (lower-cased, stop words
            removed, each remaining token lemmatized)
    '''

    # normalizing text: lower-case it and turn every non-alphanumeric
    # character into a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    words = word_tokenize(text)  # tokenizing text

    # build the stop-word lookup once per call; evaluating
    # stopwords.words("english") inside the filter would repeat it for
    # every single token
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()  # initiating lemmatizer

    # removing stop words and lemmatizing the remaining tokens
    return [lemmatizer.lemmatize(w) for w in words if w not in stop_words]

# quick check: show the first five messages next to their tokens
for message in X[:5]:
    words = tokenize(message)
    print(f"{message}\n{words} \n")

________________

### Building a machine learning pipeline

In [None]:
### defining pipeline
# bag-of-words counts (using the custom tokenizer) -> tf-idf weighting ->
# one random forest per output column
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier())),
])

# splitting data into train and test (default split sizes)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# fit the pipeline on the training split, then predict the test split
# (fit() returns the fitted pipeline, so the calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# testing function
def model_report(y_test, y_pred):
    '''
    function: printing a classification report for every category,
              followed by the accuracy of the predictions
    input: y_test - true labels; iterated for its column names and
                    indexed by column name
           y_pred - predicted labels as a 2-D array whose columns are in
                    the same order as the columns of y_test
    output: None (results are printed)
    '''
    # enumerate keeps the positional index of y_pred in step with the
    # column name of y_test
    for i, col in enumerate(y_test):
        print('Category {}: {}'.format(i+1, col))
        print(classification_report(y_test[col], y_pred[:, i]))

    # element-wise comparison, averaged along the default axis
    accuracy = (y_pred == y_test).mean()
    print('Accuracy: ', accuracy)


model_report(y_test, y_pred)

_________________

### Improving model with grid search and testing new model

In [None]:
# display the pipeline's parameters; the grid search below refers to one of
# them by name ('clf__estimator__n_estimators')
pipeline.get_params()

In [None]:
# grid search
# candidate values for the number of trees of the wrapped random forest
parameters = {'clf__estimator__n_estimators': [60]}

# wrap the pipeline in a grid search over the candidates above
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv

In [None]:
# evaluating model
# run the grid search on the training split; fit() returns the fitted
# GridSearchCV object, which is kept as `model` (it was never assigned before)
model = cv.fit(X_train, y_train)
Y_pred = model.predict(X_test)

# classification report for every category column of the test labels
for i, col in enumerate(y_test):
    print('Category {}: {}'.format(i+1, col))
    print(classification_report(y_test[col], Y_pred[:, i]))

# element-wise comparison with the true labels, averaged along the default axis
accuracy = (Y_pred == y_test).mean()
print('Accuracy: ', accuracy)
# mean of the accuracy values computed above
sample_accuracy = accuracy.mean()
print('Average accuracy: ', sample_accuracy)

In [None]:
# saving model
# `model` is expected to be the fitted estimator from the evaluation step.
# NOTE(review): the target path has no file name or extension and looks like
# a directory ("models") - confirm the intended pickle file path.
# The context manager closes the file handle even if pickling fails.
with open(r"C:\Users\BernadettKepenyes\Documents\GitHub\disaster-response-project\models", 'wb') as model_file:
    pickle.dump(model, model_file)