In [1]:
# Libraries for data load
import pandas as pd
import re
from sqlalchemy import create_engine

# Library for data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Libraries for data cleaning and pre-processing
import nltk
nltk.download(['punkt', 'wordnet'])
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator,TransformerMixin

# Libraries for pipeline and model building
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

# Libraries for model evaluation
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, make_scorer
import pickle

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hianj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hianj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hianj\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hianj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_data(db, database_name, feature, pos):
    
    # load data from database
    engine = create_engine(db)
    
    sql = 'SELECT * FROM ' + database_name
    df = pd.read_sql(sql, engine)
    X = df[feature]
    y = df.iloc[:,pos:]
    
    return X, y

In [5]:
def tokenize(text):
    '''
    To clean and pre-process the raw data. Here are the steps done by the function
    1) Clean the data to remove all HTML tags
    2) Normalize by converting the text to lowercase and removing punctuations
    3) Split text into tokens
    4) Remove English stop words
    
    '''
    
    # Replace url tags with the string 'urlplaceholder'
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
        
    # Convert to lowercase
    text = text.lower() 
    
    # Remove punctuation characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text) 
    
    # Tokenize text
    words = word_tokenize(text)
    
    # Remove stop words
    words = [w for w in words if w not in stopwords.words("english")]
    
    # Lemmatize each word to create clean tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, pos='n').strip() for word in words]
    clean_tokens = [lemmatizer.lemmatize(token, pos='v').strip() for token in lemmatized_tokens]
    
    return clean_tokens

In [99]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    '''
    The purpose of this class is to create a feature which indicates whether or not a sentence starts with a verb.
    It returns a True if the sentence starts with a verb or it is a re-tweet and False otherwise.
    '''

    def starting_verb(self, text):
        # tokenize by sentences
        sentence_list = nltk.sent_tokenize(text)      
        
        for sentence in sentence_list:
            
            # tokenize each sentence into words and tag part of speech
            pos_tags = nltk.pos_tag(tokenize(sentence))
            
            if len(pos_tags) > 1:
                # index pos_tags to get the first word and part of speech tag
                first_word, first_tag = pos_tags[0]
            
                # return true if the first word is an appropriate verb or RT for retweet
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    return True
            
            return False
            

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        # apply starting_verb function to all values in X
        X_tagged = pd.Series(X).apply(self.starting_verb)

        return pd.DataFrame(X_tagged)

In [123]:
def model_pipeline():
    '''
    This function creates various features using CountVectorizer, TfidfTransformer and StartingVerbExtractor.
    For model building, RandomForestClassifier is used. 
    The function returns a pipeline model using all of the above.
    '''

    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=2)))
    ])

    return pipeline

In [118]:
def train(X_train, X_test, y_train, y_test, model):
    '''
    This function will train the model and predict on the test data.
    It returns the predicted y values
    '''   
    
    print('Model training started')
    model.fit(X_train, y_train) # Train the model on train data
    print('Model training completed')
    
    print('Prediction started')
    y_pred = model.predict(X_test) #Predict on test data
    print('Prediction complete')
    
    target_col_names = y_test.columns
    y_pred = pd.DataFrame(y_pred, columns = target_col_names) #Convert y_pred to a dataframe
    
    return y_pred

In [119]:
def model_evaluation(y_true, y_pred):
    '''
    This function will loop through each target column and calculate the precision, recall and F1-score.
    The resulting dataframe will be returned.
    '''    
    
    eval_measures = {} # Dictionary to store the performance measures
    target_col_names = y_test.columns # Get all the column names present in the target
    
    for col in target_col_names:
        eval_measures[col] = {}
        precision, recall, f1_score, support = precision_recall_fscore_support(y_true.loc[:,col], y_pred.loc[:,col], average='macro')
        accuracy = accuracy_score(y_true.loc[:,col], y_pred.loc[:,col])
        
        eval_measures[col]['f1_score'] = f1_score
        eval_measures[col]['precision'] = precision
        eval_measures[col]['recall'] = recall        
        eval_measures[col]['accuracy'] = accuracy 
    
    df_eval_measures = pd.DataFrame(eval_measures)
    df_eval_measures = df_eval_measures.transpose()
    df_eval_measures = df_eval_measures.sort_values(by=['f1_score', 'precision', 'recall', 'accuracy'], ascending=False)
    
    print(df_eval_measures)

In [103]:
def main():
    X, y = load_data('sqlite:///DisasterMessages.db', 'DisasterMessages', 'message', 4)
    
    y.drop(['child_alone'], axis=1, inplace=True) # Drop this column from the target as it has only 0s
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 40)

    #verb_extractor = StartingVerbExtractor()
    #verb_extractor.fit_transform(X_train)
    
    model = model_pipeline()    
    y_pred = train(X_train, X_test, y_train, y_test, model)
    
    # Model evaluation
    #model_evaluation(y_test, y_pred)    

In [124]:
main()

Model training started


TypeError: no supported conversion for types: (dtype('float64'), dtype('O'))