In [34]:
# import libraries
import pandas as pd
import re, nltk
from sqlalchemy import create_engine

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier

# NLTK resources this notebook downloads before any text processing runs.
NLTK_PACKAGES = ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger']
nltk.download(NLTK_PACKAGES)

# Module-level WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [35]:
# Load the ETL pipeline output from the SQLite database.
# NOTE(review): the table name includes the ".db" suffix — confirm it matches
# the name used when the table was written.
engine = create_engine('sqlite:///disaster_response_table.db')
df = pd.read_sql_table('disaster_response_table.db', engine)

# The message text is the model input; every column that is not listed below
# is kept as a target column.
non_label_columns = ['id', 'message', 'original', 'genre']
X = df['message']
y = df.drop(columns=non_label_columns)

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [36]:
# Preview the first two training messages.
X_train.head(2)

17572    Also, a near 800mt-wide breach appeared in the...
19406    Under Federation support, a PMI national respo...
Name: message, dtype: object

In [37]:
# Preview the label rows of the first two training samples.
y_train.head(2)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
17572,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
19406,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [38]:
# Raw string so the regex escapes (\( and \)) reach the `re` module untouched
# instead of being parsed as (invalid) Python string escapes. The pattern text
# itself is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def tokenize(text):
    """Turn a raw message into a list of normalized tokens.

    URLs matching ``url_regex`` are replaced by the literal
    ``"urlplaceholder"``, the text is split with ``word_tokenize``, and each
    token is lemmatized, lower-cased and stripped of surrounding whitespace.

    Args:
        text (str): The message to tokenize.

    Returns:
        list[str]: Cleaned tokens, in their original order.
    """
    # Substitute every match in a single pass. Replacing match-by-match with
    # str.replace could corrupt a longer URL when a shorter URL that is its
    # prefix had already been replaced (leaving e.g. "urlplaceholder/b").
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]


class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer producing one boolean feature per message.

    The feature is True when at least one sentence of the message starts with
    a token tagged 'VB' or 'VBP', or with the token "rt".
    """

    def starting_verb(self, text):
        """Report whether any sentence in ``text`` opens with a verb or "rt".

        Args:
            text (str): A single message.

        Returns:
            bool: True if the first token of any sentence is tagged 'VB' or
            'VBP', or is "rt" (case-insensitive); False otherwise.
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            # A sentence can yield no tokens at all; indexing pos_tags[0]
            # would then raise IndexError, so skip such sentences.
            if not pos_tags:
                continue
            first_word, first_tag = pos_tags[0]
            # tokenize() lower-cases every token, so a comparison against the
            # upper-case 'RT' could never succeed; compare case-insensitively.
            if first_tag in ('VB', 'VBP') or first_word.lower() == 'rt':
                return True
        return False

    def fit(self, x, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to each message.

        Args:
            X: Iterable of message strings (anything ``pd.Series`` accepts).

        Returns:
            pandas.DataFrame: Single-column frame of booleans, one row per
            message.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [39]:
# Text branch: token counts (using the custom tokenizer) followed by TF-IDF.
text_features = Pipeline([
    ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
    ('tfidf_transformer', TfidfTransformer()),
])

# Concatenate the TF-IDF features with the starting-verb indicator.
combined_features = FeatureUnion([
    ('text_pipeline', text_features),
    ('starting_verb_transformer', StartingVerbExtractor()),
])

# Full model: feature extraction, then a multi-output AdaBoost classifier.
pipeline = Pipeline([
    ('features', combined_features),
    ('classifier', MultiOutputClassifier(AdaBoostClassifier())),
])

In [40]:
# Hyper-parameter grid for the estimator wrapped by the 'classifier' step
# (keys use the step__param naming that the pipeline exposes).
parameters = dict(
    classifier__estimator__learning_rate=[0.01, 0.02, 0.05, 0.08, 0.10],
    classifier__estimator__n_estimators=[10, 20, 30],
)

In [41]:
# Grid search over `parameters`, scored by micro-averaged F1, using all
# available cores (n_jobs=-1).
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="f1_micro",
    n_jobs=-1,
)

In [42]:
# Fit the grid search on the training split. Both inputs are passed as plain
# NumPy arrays; `X_train. v` referenced a non-existent attribute and would
# raise AttributeError.
model.fit(X_train.values, y_train.values)

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text_pipeline',
                                                                        Pipeline(steps=[('count_vectorizer',
                                                                                         CountVectorizer(tokenizer=<function tokenize at 0x7fa5e2a02c10>)),
                                                                                        ('tfidf_transformer',
                                                                                         TfidfTransformer())])),
                                                                       ('starting_verb_transformer',
                                                                        StartingVerbExtractor())])),
                                       ('classifier',
                                        MultiOutputClassifier(estimator=AdaBoostClassifier()))]),
             n_jo

In [43]:
# Predict every label column for the held-out messages with the fitted search.
y_prediction_test = model.predict(X_test)
# Training-set predictions are currently disabled:
#y_prediction_train = model.predict(X_train)

In [28]:
# Per-label accuracy: for each target column, the fraction of test rows whose
# predicted value equals the true value.
accuracy = y_test.eq(y_prediction_test).mean()

In [44]:
# Display the per-label accuracy values.
accuracy

related                   0.764399
request                   0.852638
offer                     0.995296
aid_related               0.648570
medical_help              0.923077
medical_products          0.951049
search_and_rescue         0.976732
security                  0.980801
military                  0.968722
water                     0.951176
food                      0.947870
shelter                   0.930833
clothing                  0.987540
money                     0.978385
missing_people            0.990591
refugees                  0.965162
death                     0.957788
other_aid                 0.868531
infrastructure_related    0.937190
transport                 0.960076
buildings                 0.948760
electricity               0.978640
tools                     0.993388
hospitals                 0.990591
shops                     0.995931
aid_centers               0.987540
other_infrastructure      0.957533
weather_related           0.776351
floods              

In [45]:
import pickle
from joblib import dump, load

# NOTE(review): `filename` is not used by the dump call below, and its '.pkl'
# name differs from the 'classifier.joblib' path that is actually written —
# confirm which one is intended.
filename = 'classifier.pkl'
# Persist the fitted grid-search object with joblib.
dump(model, 'classifier.joblib')

['classifier.joblib']

In [47]:
# Reload the persisted model from disk.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
ka = load('classifier.joblib')

In [51]:
# Predict on the test split with the reloaded model.
y_pred = ka.predict(X_test)

In [52]:
# Per-label accuracy of the reloaded model, for comparison with the accuracy
# computed from the in-memory model.
(y_pred==y_test).mean()

related                   0.764399
request                   0.852638
offer                     0.995296
aid_related               0.648570
medical_help              0.923077
medical_products          0.951049
search_and_rescue         0.976732
security                  0.980801
military                  0.968722
water                     0.951176
food                      0.947870
shelter                   0.930833
clothing                  0.987540
money                     0.978385
missing_people            0.990591
refugees                  0.965162
death                     0.957788
other_aid                 0.868531
infrastructure_related    0.937190
transport                 0.960076
buildings                 0.948760
electricity               0.978640
tools                     0.993388
hospitals                 0.990591
shops                     0.995931
aid_centers               0.987540
other_infrastructure      0.957533
weather_related           0.776351
floods              

In [53]:
import plotly.express as px