# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [None]:
# import libraries
import pandas as pd 
from sqlalchemy import create_engine
import re
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV



In [45]:
# Connect to the SQLite file and read the whole disaster_messages table
engine = create_engine('sqlite:///dbtest.db')
df = pd.read_sql('disaster_messages', engine)

# Features are the message column; targets are every column that remains
# once the message, original, genre and id columns are dropped.
X = df['message']
Y = df.drop(['message', 'original', 'genre', 'id'], axis=1)

### 2. Write a tokenization function to process your text data

In [62]:
stopwords_eng = stopwords.words("english")
# Frozen set copy of the stop-word list so each membership test in
# tokenize() is a hash lookup instead of a linear scan of the list.
_STOPWORD_SET = frozenset(stopwords_eng)
lemmatizer = WordNetLemmatizer()

# Matches http/https URLs; compiled once instead of on every tokenize() call.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Matches any single character that is neither a word character nor whitespace.
_NON_WORD_PATTERN = re.compile(r"[^\w\s]")


def tokenize(text):
    """Split a message into lower-cased, lemmatized tokens without stop words.

    Processing steps, in order:
      1. Every URL is replaced by the literal token ``urlplaceholder``.
      2. Every character that is not a word character or whitespace is
         replaced by a space.
      3. The text is split with ``nltk.word_tokenize``.
      4. Each token is lower-cased and stripped, then dropped if it is an
         English stop word.
      5. The surviving tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The message to tokenize (a string).

    Returns:
        A list of cleaned token strings.
    """
    # One regex substitution replaces each matched URL exactly where it was
    # found. Replacing the matches one by one with str.replace would leave a
    # fragment behind whenever one URL is a prefix of another.
    text = _URL_PATTERN.sub('urlplaceholder', text)

    # Replace everything that is not a word character or whitespace with a space
    text = _NON_WORD_PATTERN.sub(" ", text)

    # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
    # so capitalised words such as "The" would otherwise slip through.
    lowered = (tok.lower().strip() for tok in nltk.word_tokenize(text))

    return [lemmatizer.lemmatize(tok) for tok in lowered if tok not in _STOPWORD_SET]


### 3. Build a machine learning pipeline
This machine learning pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [None]:
# Keep only the target columns whose values do not sum to zero
# (presumably labels with no positive example in the data -- verify).
Y = Y.loc[:, (Y.sum() != 0).to_numpy()]

# A fixed seed makes the train/test partition, and therefore every score
# computed from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)

pipeline = Pipeline(steps=[
        # token_pattern=None: the custom tokenizer replaces the default
        # regex, and leaving the default in place makes scikit-learn warn
        # about an unused parameter on every fit.
        ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
        ('tfidf', TfidfTransformer()),
        # One GradientBoostingClassifier is fitted per target column
        ('moc', MultiOutputClassifier(GradientBoostingClassifier()))
    ], verbose=True)

param_grid = {
    # Parameters for CountVectorizer
    'vect__max_df': [0.75, 0.9],  # Ignore words that appear in more than 75% or 90% of documents
    'vect__ngram_range': [(1, 1), (1, 2)],  # Unigrams only, or unigrams plus bigrams

    # Parameters for GradientBoostingClassifier inside MultiOutputClassifier
    'moc__estimator__n_estimators': [50, 100],  # Number of boosting stages
    'moc__estimator__learning_rate': [0.1, 0.01],  # Learning rate for boosting
    'moc__estimator__max_depth': [3, 5],  # Maximum depth of trees
    }

# NOTE(review): grid_search is only constructed here, it is not fitted in
# this cell. Call grid_search.fit(X_train, y_train) to actually run the
# search over param_grid; the fit below trains the pipeline with its
# default hyper-parameters.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, verbose=3, n_jobs=1)

# train classifier
pipeline.fit(X_train, y_train)

# predict on test data
y_pred = pipeline.predict(X_test)

