# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [1]:
# import libraries
import nltk

nltk.download(["punkt", "wordnet", "averaged_perceptron_tagger", "stopwords"])

import datetime as dt
import numpy as np
from sqlalchemy import create_engine
import sqlalchemy
from pathlib import Path
from typing import Callable, List, Union
import pandas as pd
import re
import pickle
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
)
from scipy.stats import randint

[nltk_data] Downloading package punkt to /home/juzun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/juzun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/juzun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/juzun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load data from database
current_path = Path.cwd()
parent_path = current_path.parent
database_filepath = "data/DisasterResponse"
engine = create_engine(f"sqlite:///{parent_path / database_filepath}.db")
df = pd.read_sql(f"select * from {sqlalchemy.inspect(engine).get_table_names()[0]}", engine)

target_columns = []
for col in df.columns:
    if col not in ["id", "message", "genre", "original"]:
        target_columns.append(col)

X = df["message"]
y = df[target_columns]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### 2. Write a tokenization function to process your text data

In [3]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")


def tokenize(text: str) -> List[str]:
    url_regex = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    detected_urls = re.findall(url_regex, text)

    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)

    cleaned_tokens = [
        lemmatizer.lemmatize(token).lower().strip() for token in tokens if token not in stop_words
    ]

    return cleaned_tokens

In [4]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def __init__(self, tokenizer: Callable[[str], List[str]], messages_col_name: str = None):
        self.tokenizer = tokenizer
        self.messages_col_name = messages_col_name

    def starting_verb(self, text: str) -> bool:
        sentence_list = sent_tokenize(text)

        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(self.tokenizer(sentence))
            if not pos_tags:
                continue
            first_word, first_tag = pos_tags[0]
            if first_tag in ["VB", "VBP"] or first_word == "RT":
                return True

        return False

    def fit(self, x, y=None) -> "StartingVerbExtractor":
        return self

    def transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
        try:
            if not isinstance(X, pd.Series):
                X_tagged = X[self.messages_col_name].apply(self.starting_verb)
            else:
                X_tagged = X.apply(self.starting_verb)
        except KeyError as error_message:
            raise KeyError(
                f"Wrong column name for messages text was used: {error_message}. Available columns: {list(X.columns)}"
            )
        return pd.DataFrame(X_tagged)

In [5]:
class GenreTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, genre_col_name: str):
        self.genre_col_name = genre_col_name

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.DataFrame):
        genres_encoded = pd.get_dummies(X[self.genre_col_name], drop_first=False)
        return genres_encoded

### 3. Build a machine learning pipeline

In [6]:
pipeline = Pipeline(
    [
        (
            "features",
            FeatureUnion(
                [
                    (
                        "text_pipeline",
                        Pipeline(
                            [("vect", CountVectorizer(tokenizer=tokenize)), ("tfidf", TfidfTransformer())]
                        ),
                    ),
                    ("starting_verb", StartingVerbExtractor(tokenizer=tokenize)),
                ]
            ),
        ),
        ("clf", MultiOutputClassifier(estimator=RandomForestClassifier())),
    ]
)

### 4. Train pipeline

In [7]:
pipeline.fit(X=X_train, y=y_train)



### 5. Test your model

In [8]:
pipeline.predict(pd.Series("something"))

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [9]:
y_pred = pipeline.predict(X_test)

In [10]:
pipeline_report = pd.DataFrame(
    classification_report(
        y_pred=y_pred,
        y_true=y_test,
        target_names=y.columns,
        output_dict=True,
        zero_division=np.nan,
    )
).transpose()
pipeline_report

Unnamed: 0,precision,recall,f1-score,support
related,0.819447,0.965978,0.8867,4938.0
request,0.835404,0.487319,0.615561,1104.0
offer,,0.0,0.0,26.0
aid_related,0.771835,0.680545,0.723321,2714.0
medical_help,0.7,0.064576,0.118243,542.0
medical_products,0.833333,0.086455,0.156658,347.0
search_and_rescue,0.833333,0.052083,0.098039,192.0
security,0.333333,0.007519,0.014706,133.0
military,0.6875,0.048889,0.091286,225.0
water,0.892857,0.348837,0.501672,430.0


### 6. Improve model

In [14]:
pipeline

In [15]:
param_grid = {
    "clf__estimator__n_estimators": [100, 200, 300],
    "clf__estimator__max_depth": [None, 5, 10],
}

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1)

In [16]:
grid_search.fit(X=X_train, y=y_train)



: 

In [None]:
param_dist = {
    "features__text_pipeline__tfidf__use_idf": [True, False],  # Use IDF scaling
    "clf__estimator__n_estimators": randint(70, 200),  # Number of trees in Random Forest
    "clf__estimator__max_depth": randint(10, 30),  # Max depth of trees
    "clf__estimator__min_samples_split": randint(2, 6),  # Min samples required to split
}

cv = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring="f1_micro",
    random_state=42,
    n_jobs=-1,
)

In [31]:
cv.fit(X=X_train, y=y_train)



In [32]:
cv.best_params_

{'clf__estimator__max_depth': 29,
 'clf__estimator__min_samples_split': 5,
 'clf__estimator__n_estimators': 120,
 'features__text_pipeline__tfidf__use_idf': True}

### 7. Test your model again

In [33]:
y_pred_cv = cv.predict(X_test)

In [34]:
cv_report = pd.DataFrame(
    classification_report(
        y_pred=y_pred_cv,
        y_true=y_test,
        target_names=y.columns,
        output_dict=True,
        zero_division=np.nan,
    )
).transpose()
cv_report

Unnamed: 0,precision,recall,f1-score,support
related,0.761493,0.999595,0.864448,4938.0
request,0.926316,0.07971,0.146789,1104.0
offer,,0.0,0.0,26.0
aid_related,0.865369,0.393147,0.540664,2714.0
medical_help,0.5,0.001845,0.003676,542.0
medical_products,,0.0,0.0,347.0
search_and_rescue,,0.0,0.0,192.0
security,,0.0,0.0,133.0
military,,0.0,0.0,225.0
water,1.0,0.002326,0.00464,430.0


### 8. Export model as a pickle file

In [25]:
with open(f'pipeline {dt.datetime.now().strftime("%m-%d %H-%M-%S")}.pkl', "wb") as file:
    pickle.dump(pipeline, file)

In [None]:
with open(f"pipeline 11-16 20-24-21.pkl", "rb") as file:
    pipeline_loaded: Pipeline = pickle.load(file)

### Test pipelines parts if working

In [15]:
vect = ColumnTransformer([("vect", CountVectorizer(tokenizer=tokenize), "message")])
text_feature_counts = vect.fit_transform(X_train)
print(text_feature_counts.shape)

tfidf = TfidfTransformer()
text_feature = tfidf.fit_transform(text_feature_counts)
print(text_feature.shape)

starting_verb = StartingVerbExtractor(tokenizer=tokenize, messages_col_name="message")
starting_verb_feature = starting_verb.fit_transform(X_train)
print(starting_verb_feature.shape)

(19521, 29716)
(19521, 29716)
(19521, 1)


In [None]:
clf_text = MultiOutputClassifier(estimator=RandomForestClassifier())
clf_text.fit(text_feature, y_train)

clf_starting_verb = MultiOutputClassifier(estimator=RandomForestClassifier())
clf_starting_verb.fit(starting_verb_feature, y_train)

### Try different pipelines

#### Just messages

In [16]:
pipeline_poor = Pipeline(
    [
        (
            "text_pipeline",
            Pipeline(
                [
                    (
                        "col_transformer",
                        ColumnTransformer(
                            [
                                (
                                    "vect",
                                    CountVectorizer(tokenizer=tokenize),
                                    "message",
                                ),
                            ]
                        ),
                    ),
                    ("tfidf", TfidfTransformer()),
                ]
            ),
        ),
        ("clf", MultiOutputClassifier(estimator=RandomForestClassifier())),
    ]
)

In [17]:
pipeline_poor.fit(X=X_train, y=y_train)

In [19]:
y_pred_poor = pipeline_poor.predict(X_test)

In [20]:
pd.DataFrame(
    classification_report(
        y_pred=y_pred_poor,
        y_true=y_test,
        target_names=y.columns,
        output_dict=True,
        zero_division=np.nan,
    )
).transpose()

Unnamed: 0,precision,recall,f1-score,support
related,0.819983,0.963953,0.886158,4938.0
request,0.829268,0.492754,0.618182,1104.0
offer,,0.0,0.0,26.0
aid_related,0.768198,0.696021,0.730331,2714.0
medical_help,0.692308,0.049815,0.092943,542.0
medical_products,0.861111,0.089337,0.16188,347.0
search_and_rescue,0.785714,0.057292,0.106796,192.0
security,0.5,0.007519,0.014815,133.0
military,0.764706,0.057778,0.107438,225.0
water,0.883721,0.353488,0.504983,430.0


#### Text and starting verb and genre

In [None]:
pipeline_not_so_poor = Pipeline(
    [
        (
            "features",
            FeatureUnion(
                [
                    (
                        "text_pipeline",
                        Pipeline(
                            [
                                (
                                    "col_transformer",
                                    ColumnTransformer(
                                        [
                                            (
                                                "vect",
                                                CountVectorizer(tokenizer=tokenize),
                                                "message",
                                            ),
                                        ]
                                    ),
                                ),
                                ("tfidf", TfidfTransformer()),
                            ]
                        ),
                    ),
                    (
                        "starting_verb",
                        StartingVerbExtractor(tokenizer=tokenize, messages_col_name="message"),
                    ),
                    ("genre", GenreTransformer(genre_col_name="genre")),
                ]
            ),
        ),
        ("clf", MultiOutputClassifier(estimator=RandomForestClassifier())),
    ]
)

In [None]:
X_with_genre = df[["message", "genre"]]
y_with_genre = df[target_columns]
X_train_with_genre, X_test_with_genre, y_train_with_genre, y_test_with_genre = train_test_split(
    X_with_genre, y_with_genre, random_state=42
)

In [None]:
pipeline_not_so_poor.fit(X=X_train_with_genre, y=y_train_with_genre)



In [None]:
y_pred_not_so_poor = pipeline_not_so_poor.predict(X_test_with_genre)

In [None]:
pd.DataFrame(
    classification_report(
        y_pred=y_pred_not_so_poor,
        y_true=y_test,
        target_names=y.columns,
        output_dict=True,
        zero_division=np.nan,
    )
).transpose()

Unnamed: 0,precision,recall,f1-score,support
related,0.817621,0.965978,0.885629,4938.0
request,0.840121,0.504529,0.630447,1104.0
offer,,0.0,0.0,26.0
aid_related,0.772126,0.687915,0.727592,2714.0
medical_help,0.673913,0.057196,0.105442,542.0
medical_products,0.769231,0.057637,0.107239,347.0
search_and_rescue,0.6,0.03125,0.059406,192.0
security,0.5,0.007519,0.014815,133.0
military,0.666667,0.035556,0.067511,225.0
water,0.887701,0.386047,0.538088,430.0


#### Add original messages

In [None]:
pipeline_with_orig = Pipeline(
    [
        (
            "features",
            FeatureUnion(
                [
                    (
                        "text_pipeline",
                        Pipeline(
                            [
                                (
                                    "col_transformer",
                                    ColumnTransformer(
                                        [
                                            (
                                                "vect",
                                                CountVectorizer(tokenizer=tokenize),
                                                "message",
                                            ),
                                        ]
                                    ),
                                ),
                                ("tfidf", TfidfTransformer()),
                            ]
                        ),
                    ),
                    (
                        "text_original_pipeline",
                        Pipeline(
                            [
                                (
                                    "col_transformer",
                                    ColumnTransformer(
                                        [
                                            (
                                                "vect",
                                                CountVectorizer(tokenizer=tokenize),
                                                "original",
                                            ),
                                        ]
                                    ),
                                ),
                                ("tfidf", TfidfTransformer()),
                            ]
                        ),
                    ),
                    (
                        "starting_verb",
                        StartingVerbExtractor(tokenizer=tokenize, messages_col_name="message"),
                    ),
                    ("genre", GenreTransformer(genre_col_name="genre")),
                ]
            ),
        ),
        ("clf", MultiOutputClassifier(estimator=RandomForestClassifier())),
    ]
)

In [None]:
X_with_orig = df[["message", "genre", "original"]]
X_with_orig["original"] = X_with_orig["original"].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_with_orig["original"] = X_with_orig["original"].fillna("")


In [None]:
X_train_with_orig, X_test_with_orig, y_train_with_orig, y_test_with_orig = train_test_split(
    X_with_orig, y, random_state=42
)

In [None]:
pipeline_with_orig.fit(X=X_train_with_orig, y=y_train_with_orig)



In [None]:
y_pred_with_orig = pipeline_with_orig.predict(X_test_with_orig)

In [None]:
pipeline_with_orig_report = pd.DataFrame(
    classification_report(
        y_pred=y_pred_with_orig,
        y_true=y_test_with_orig,
        target_names=y.columns,
        output_dict=True,
        zero_division=np.nan,
    )
).transpose()
pipeline_with_orig_report

Unnamed: 0,precision,recall,f1-score,support
related,0.811611,0.971041,0.884197,4938.0
request,0.837341,0.536232,0.653782,1104.0
offer,,0.0,0.0,26.0
aid_related,0.782113,0.657332,0.714314,2714.0
medical_help,0.609756,0.046125,0.085763,542.0
medical_products,0.894737,0.097983,0.176623,347.0
search_and_rescue,0.6,0.046875,0.086957,192.0
security,0.333333,0.007519,0.014706,133.0
military,0.75,0.04,0.075949,225.0
water,0.890805,0.360465,0.513245,430.0


In [66]:
with open(f'pipeline with original {dt.datetime.now().strftime("%m-%d %H-%M-%S")}.pkl', "wb") as file:
    pickle.dump(pipeline_with_orig, file)

### Compare best models

In [83]:
pipeline_with_orig_report.mean()

precision       0.712839
recall          0.225100
f1-score        0.277135
support      2662.948718
dtype: float64

In [82]:
pipeline_report.mean()

precision       0.717679
recall          0.228080
f1-score        0.278612
support      2662.948718
dtype: float64

In [None]:
cv_report.mean()

precision       0.872464
recall          0.058234
f1-score        0.062295
support      2662.948718
dtype: float64