In [1]:
# Libraries for data load
import pandas as pd
import re
from sqlalchemy import create_engine

# Library for data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Libraries for data cleaning and pre-processing
import nltk
nltk.download(['punkt', 'wordnet'])
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator,TransformerMixin

# Libraries for pipeline and model building
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

# Libraries for model evaluation
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, make_scorer
import pickle

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hianj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hianj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hianj\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hianj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_data(db, database_name, feature, pos):
    
    # load data from database
    engine = create_engine(db)
    
    sql = 'SELECT * FROM ' + database_name
    df = pd.read_sql(sql, engine)
    X = df[feature]
    y = df.iloc[:,pos:]
    
    return X, y

In [5]:
def tokenize(text):
    '''
    To clean and pre-process the raw data. Here are the steps done by the function
    1) Clean the data to remove all HTML tags
    2) Normalize by converting the text to lowercase and removing punctuations
    3) Split text into tokens
    4) Remove English stop words
    
    '''
    
    # Replace url tags with the string 'urlplaceholder'
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
        
    # Convert to lowercase
    text = text.lower() 
    
    # Remove punctuation characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text) 
    
    # Tokenize text
    words = word_tokenize(text)
    
    # Remove stop words
    words = [w for w in words if w not in stopwords.words("english")]
    
    # Lemmatize each word to create clean tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, pos='n').strip() for word in words]
    clean_tokens = [lemmatizer.lemmatize(token, pos='v').strip() for token in lemmatized_tokens]
    
    return clean_tokens

In [59]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        # tokenize by sentences
        sentence_list = nltk.sent_tokenize(text)
        
        '''
        # Debugging
        for sentence in sentence_list:
            len_sent = len(sentence)
            if len_sent < 10:
                print(len_sent, ' : ' ,sentence)
        '''        
        
        for sentence in sentence_list:
            print(sentence)
            
            # tokenize each sentence into words and tag part of speech
            pos_tags = nltk.pos_tag(tokenize(sentence))
            print(pos_tags)            
            
            # index pos_tags to get the first word and part of speech tag
            first_word, first_tag = pos_tags[0]
            
            # return true if the first word is an appropriate verb or RT for retweet
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True

            return False
            

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        # apply starting_verb function to all values in X
        X_tagged = pd.Series(X).apply(self.starting_verb)

        return pd.DataFrame(X_tagged)

In [54]:
def model_pipeline():
    text_pipeline = Pipeline([
                            ('vect', CountVectorizer(tokenizer=tokenize)),
                            ('tfidf', TfidfTransformer())
                            ])
    pipeline = Pipeline([
                        ('feature_union', FeatureUnion([('text_pipeline', text_pipeline), ('verb_extractor', StartingVerbExtractor())]))
                        #,('clf', RandomForestClassifier(n_estimators=10))
                        ]) 
    return pipeline

In [31]:
def train(X_train, X_test, y_train, y_test, model):
    '''
    This function will train the model and predict on the test data.
    It returns the predicted y values
    '''   
    
    model.fit(X_train, y_train) # Train the model on train data
    
    y_pred = model.predict(X_test) #Predict on test data
    
    target_col_names = y_test.columns
    y_pred = pd.DataFrame(y_pred, columns = target_col_names) #Convert y_pred to a dataframe
    
    return y_pred

In [58]:
def main():
    X, y = load_data('sqlite:///DisasterMessages.db', 'DisasterMessages', 'message', 4)
    
    y.drop(['child_alone'], axis=1, inplace=True) # Drop this column from the target as it has only 0s
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 40)

    verb_extractor = StartingVerbExtractor()
    verb_extractor.fit_transform(X_train)
    
    #model = model_pipeline()    
    #y_pred = train(X_train, X_test, y_train, y_test, model)

In [60]:
main()

26/05/2015 - Moneragala, Sri Lanka: The Sri Lanka Red Cross Society's Moneragala Branch has taken adequate steps to provide water & sanitation support to over 500 families within the district who are in the drought stricken area of the district.
[('26', 'CD'), ('05', 'CD'), ('2015', 'CD'), ('moneragala', 'NN'), ('sri', 'NN'), ('lanka', 'NN'), ('sri', 'NN'), ('lanka', 'NN'), ('red', 'JJ'), ('cross', 'NN'), ('society', 'NN'), ('moneragala', 'NN'), ('branch', 'NN'), ('take', 'VB'), ('adequate', 'JJ'), ('step', 'NN'), ('provide', 'VBP'), ('water', 'NN'), ('sanitation', 'NN'), ('support', 'NN'), ('500', 'CD'), ('family', 'NN'), ('within', 'IN'), ('district', 'NN'), ('drought', 'NN'), ('stricken', 'JJ'), ('area', 'NN'), ('district', 'NN')]
my kids died, my house is destroyed.
[('kid', 'NN'), ('die', 'NN'), ('house', 'NN'), ('destroy', 'NN')]
And of course by now there's no firewood left at all to boil water -- on my trip I saw families trying to burn the remains of their bamboo houses to fue

[('officially', 'RB'), ('make', 'VBP'), ('first', 'JJ'), ('storm', 'NN'), ('newsroom', 'NN')]
More information on the 4636 number in order for me to participate.
[('information', 'NN'), ('4636', 'CD'), ('number', 'NN'), ('order', 'NN'), ('participate', 'VB')]
If you can help a person who got aids.
[('help', 'NN'), ('person', 'NN'), ('get', 'VB'), ('aid', 'NN')]
Getting slowly back to normal in Santiago\n\nIn the South we still have curfew
[('get', 'VB'), ('slowly', 'RB'), ('back', 'RB'), ('normal', 'JJ'), ('santiago', 'NN'), ('n', 'NN'), ('nin', 'NN'), ('south', 'NN'), ('still', 'RB'), ('curfew', 'VB')]
The SMS:  Rezilta Bak la sot bay 12 Janvye a nan lekl la vi a.
[('sm', 'NN'), ('rezilta', 'NN'), ('bak', 'NN'), ('la', 'NN'), ('sot', 'VBD'), ('bay', 'JJ'), ('12', 'CD'), ('janvye', 'NN'), ('nan', 'NN'), ('lekl', 'NN'), ('la', 'NN'), ('vi', 'NN')]
13 November 2012 - The Federal Ministry of Health (FMOH) in Sudan has notified WHO of a yellow fever outbreak affecting 23 localities in Grea

[('spate', 'NN'), ('arm', 'NN'), ('incursion', 'NN'), ('target', 'NN'), ('detention', 'NN'), ('facility', 'NN'), ('witness', 'IN'), ('major', 'JJ'), ('city', 'NN'), ('maiduguri', 'NN'), ('bauchi', 'VBD'), ('lakoja', 'JJ'), ('abuja', 'NN'), ('recently', 'RB'), ('tunga', 'VBD'), ('also', 'RB'), ('lead', 'JJ'), ('release', 'NN'), ('incarcerate', 'JJ'), ('insurgent', 'NN'), ('skilled', 'VBD'), ('strategist', 'JJ'), ('bomb', 'NN'), ('maker', 'NN'), ('specialist', 'NN'), ('kidnap', 'VBD'), ('finance', 'NN'), ('activity', 'NN')]
clothing ( wide range of styles and for various ages ) ; non-perishable food ; hygiene products
[('clothe', 'NN'), ('wide', 'JJ'), ('range', 'NN'), ('style', 'NN'), ('various', 'JJ'), ('age', 'NN'), ('non', 'NN'), ('perishable', 'JJ'), ('food', 'NN'), ('hygiene', 'NN'), ('product', 'NN')]
UNICEF, with Government support, is setting up 50 Maternity Health camps, two Nutritional Rehabilitation Centres to treat severely malnourished children, and more than 265 Alternativ

[('russian', 'JJ'), ('aggression', 'NN'), ('cause', 'NN'), ('another', 'DT'), ('challenge', 'NN'), ('ukraine', 'JJ'), ('protection', 'NN'), ('donbas', 'NN'), ('environment', 'NN')]
I can help distribute food to Hurricane Sandy victims in Petrides Center .
[('help', 'NN'), ('distribute', 'VB'), ('food', 'NN'), ('hurricane', 'NN'), ('sandy', 'NN'), ('victim', 'NN'), ('petrides', 'NNS'), ('center', 'NN')]
God Loves Orphans or departments all children of the world Address Rt repatriate Haitian village new area manager toutier Miss st cyr, IOM Bondy benGLANDE EXOCRINE  these are gland excretory canals are pouvries dump their product
[('god', 'NNS'), ('love', 'VBP'), ('orphan', 'JJ'), ('department', 'NN'), ('child', 'NN'), ('world', 'NN'), ('address', 'NN'), ('rt', 'NN'), ('repatriate', 'NN'), ('haitian', 'JJ'), ('village', 'NN'), ('new', 'JJ'), ('area', 'NN'), ('manager', 'NN'), ('toutier', 'JJR'), ('miss', 'JJ'), ('st', 'NN'), ('cyr', 'NN'), ('iom', 'NN'), ('bondy', 'NN'), ('benglande', 'N

[('38', 'CD'), ('500', 'CD'), ('refugee', 'NN'), ('live', 'JJ'), ('squalid', 'JJ'), ('tent', 'NN'), ('camp', 'NN'), ('around', 'IN'), ('aceh', 'JJ'), ('move', 'NN'), ('feb', 'RB'), ('1', 'CD'), ('new', 'JJ'), ('relocation', 'NN'), ('centre', 'NN'), ('first', 'RB'), ('stage', 'JJ'), ('resettle', 'JJ'), ('400', 'CD'), ('000', 'CD'), ('displace', 'NN'), ('people', 'NNS'), ('official', 'JJ'), ('camp', 'NNS'), ('say', 'VBP'), ('official', 'JJ'), ('tsunami', 'NN'), ('crisis', 'NN'), ('centre', 'NN'), ('banda', 'NN'), ('aceh', 'NNS'), ('say', 'VBP')]
The Department has also asked for help in mobilizing 100 volunteer divers each Sunday for the next couple of months.
[('department', 'NN'), ('also', 'RB'), ('ask', 'VB'), ('help', 'NN'), ('mobilize', 'VB'), ('100', 'CD'), ('volunteer', 'NN'), ('diver', 'NN'), ('sunday', 'JJ'), ('next', 'JJ'), ('couple', 'NN'), ('month', 'NN')]
India meanwhile flew in 13 military transport planes loaded with tonnes of food, blankets and other aid.
[('india', 'NN')

[('recent', 'JJ'), ('abundant', 'JJ'), ('rain', 'NN'), ('expect', 'VBP'), ('rain', 'NN'), ('increase', 'NN'), ('prospect', 'NN'), ('belg', 'JJ'), ('crop', 'NN'), ('beyond', 'IN'), ('permanent', 'JJ'), ('wilt', 'JJ'), ('point', 'NN')]
This is where registered Afghan returnees arrive in government-chartered buses, receive final briefings on the danger posed by landmines, get any necessary inoculations and surrender their refugee identity cards to officials of UNHCR's partner agency in Iran, the Bureau for Aliens and Foreign Immigrants Affairs (BAFIA).
[('register', 'NN'), ('afghan', 'NN'), ('returnees', 'NNS'), ('arrive', 'JJ'), ('government', 'NN'), ('charter', 'NN'), ('bus', 'NN'), ('receive', 'VBP'), ('final', 'JJ'), ('brief', 'NN'), ('danger', 'NN'), ('pose', 'VB'), ('landmines', 'NNS'), ('get', 'VB'), ('necessary', 'JJ'), ('inoculation', 'NN'), ('surrender', 'NN'), ('refugee', 'NN'), ('identity', 'NN'), ('card', 'VBP'), ('official', 'JJ'), ('unhcr', 'JJ'), ('partner', 'NN'), ('agenc

[('wake', 'NN'), ('freak', 'NN'), ('tornado', 'NN'), ('lesotho', 'NN'), ('red', 'JJ'), ('cross', 'NN'), ('stock', 'NN'), ('tent', 'NN'), ('case', 'NN'), ('repeat', 'NN'), ('strike', 'NN'), ('twister', 'NN'), ('strike', 'NN'), ('country', 'NN'), ('early', 'JJ'), ('january', 'JJ'), ('one', 'CD'), ('example', 'NN'), ('extreme', 'NN'), ('weather', 'NN'), ('southern', 'JJ'), ('africa', 'NN'), ('experience', 'NN'), ('past', 'IN'), ('week', 'NN'), ('many', 'JJ'), ('case', 'NN'), ('worst', 'JJS'), ('record', 'NN')]
In addition, the nuclear contamination has created an immensely complex series of issues.
[('addition', 'NN'), ('nuclear', 'JJ'), ('contamination', 'NN'), ('create', 'NN'), ('immensely', 'RB'), ('complex', 'JJ'), ('series', 'NN'), ('issue', 'NN')]
Paddy fields have been affected by the water weevil infestation, lack of sufficient fertilizer and unfavorable weather conditions.
[('paddy', 'JJ'), ('field', 'NN'), ('affect', 'JJ'), ('water', 'NN'), ('weevil', 'JJ'), ('infestation', 'NN'

IndexError: list index out of range