In [77]:
# import libraries
import pandas as pd
import re, nltk
from sqlalchemy import create_engine

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

nltk.download(['punkt','stopwords','wordnet','averaged_perceptron_tagger'])

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [54]:
# Load the ETL pipeline 
engine = create_engine('sqlite:///disaster_response_table.db')
df = pd.read_sql_table('disaster_response_table.db', engine) 
X = df['message'].values
y = df.drop(['id','message','original','genre'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [48]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def tokenize(text):
   
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [49]:
pipeline = Pipeline([
    ('features', FeatureUnion([

        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),

        ('starting_verb', StartingVerbExtractor())
    ])),

    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

In [50]:
pipeline.get_params()

{'memory': None,
 'steps': [('features',
   FeatureUnion(transformer_list=[('text_pipeline',
                                   Pipeline(steps=[('vect',
                                                    CountVectorizer(tokenizer=<function tokenize at 0x7fe898016160>)),
                                                   ('tfidf',
                                                    TfidfTransformer())])),
                                  ('starting_verb', StartingVerbExtractor())])),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'features': FeatureUnion(transformer_list=[('text_pipeline',
                                 Pipeline(steps=[('vect',
                                                  CountVectorizer(tokenizer=<function tokenize at 0x7fe898016160>)),
                                                 ('tfidf',
                                                  TfidfTransformer())])),
                                ('starting_verb', 

In [71]:
#pipeline.fit(X, Y)
#print(X.shape)
#tokenized = X['message'].apply(tokenize)
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
clf = RandomForestClassifier()

#X_train = X_train.message.values
vector_count = vect.fit_transform(X_train)
vfidf = tfidf.fit_transform(vector_count)
ml = MultiOutputClassifier(RandomForestClassifier()).fit(vfidf, y_train)

In [78]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
pipeline_3 = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
                ('tfidf_transformer', TfidfTransformer())
            ])),

            ('starting_verb_transformer', StartingVerbExtractor())
        ])),

        ('classifier', MultiOutputClassifier(AdaBoostClassifier()))
    ])

In [79]:
pipeline_3.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('count_vectorizer',
                                                                  CountVectorizer(tokenizer=<function tokenize at 0x7fe898016160>)),
                                                                 ('tfidf_transformer',
                                                                  TfidfTransformer())])),
                                                ('starting_verb_transformer',
                                                 StartingVerbExtractor())])),
                ('classifier',
                 MultiOutputClassifier(estimator=AdaBoostClassifier()))])

In [92]:
from sklearn.metrics import classification_report

y_prediction_train = pipeline_3.predict(X_train)
y_prediction_test = pipeline_3.predict(X_test)

# Print classification report on test data
print(classification_report(y_test, y_prediction_test, target_names=labels))

                        precision    recall  f1-score   support

               related       0.83      0.94      0.88      6012
               request       0.78      0.54      0.64      1313
                 offer       0.08      0.03      0.04        36
           aid_related       0.76      0.58      0.65      3255
          medical_help       0.61      0.29      0.39       629
      medical_products       0.60      0.32      0.41       406
     search_and_rescue       0.58      0.19      0.29       202
              security       0.17      0.03      0.05       151
              military       0.51      0.23      0.31       244
                 water       0.75      0.62      0.68       493
                  food       0.82      0.69      0.75       861
               shelter       0.76      0.53      0.63       715
              clothing       0.61      0.35      0.44       114
                 money       0.53      0.27      0.36       174
        missing_people       0.68      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
labels = df.drop(['id','message','original','genre'], axis=1).columns.values