In [1]:
import sys
import re
import numpy as np
import pandas as pd
import nltk
import pickle
import sqlite3
nltk.download(['punkt', 'wordnet'])
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
#from sqlalchemy import create_engine
import sqlite3

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package punkt to C:\Users\jringros\AppData\Loc
[nltk_data]     al\Continuum\anaconda3\envs\tensorflow\lib\nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\jringros\AppData\L
[nltk_data]     ocal\Continuum\anaconda3\envs\tensorflow\lib\nltk_data
[nltk_data]     ...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\jringros\AppData
[nltk_data]     \Local\Continuum\anaconda3\envs\tensorflow\lib\nltk_da
[nltk_data]     ta...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:



def load_data():#database_filepath):
    
    ''' Load a database and provides attribute variable, target variables and headers corresponding to target variables
    
    INPUT:
    database_filepath - path of the .db file (database) created and stored by the process_data.py script
   
    
    OUTPUT:
    X - dataframe corresponding to the attribute variable (1 column, that corresponds to the text contained in the disaster_message.csv file which is input of the process_data.py script)
    Y - dataframe corresponding to the target variables (36 columns, correspond to each value of the "categories" column contained in the categories_message.csv file which is input of the                     process_data.py script) 
    category_names - headers of the Y dataframe
    '''
    conn = sqlite3.connect('messages.db')

    df=pd.read_sql('SELECT * FROM messages', conn)
    
    
    #engine = create_engine('sqlite:///{}'.format(database_filepath))
   
    #df = pd.read_sql_table('DisasterResponseTable', engine) 
    
    df = df.replace(to_replace='None', value=np.nan)
    
    df=df[df["message"]!='#NAME?']
    
    #X = df["message"]
    X = pd.Series(df['message'])
    Y = df.drop(['id','message','original','genre'], axis=1)
    
    category_names = Y.columns
    
    return X, Y, category_names
        


In [3]:
X, Y, category_names =load_data()

In [4]:
X.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [5]:
def tokenize(text):
    
    ''' Tokenizes and normalizes the input text, removes stop words and symbols apart from letters and numbers
    
    INPUT:
    df- a text (string format)
      
    
    OUTPUT:
    clear tokens- a list of strings, obtained as a result of the following operations on the input text:
            - Everything but letters (uppercase und lowercase) and numbers will be removed
            - Text will be divided into separate elements, or "tokens"
            - Stop words corresponding to the English language will be removed
            - Tokens will be lemmatized, i.e. tokens will be converted into "root words",
                based on WordNetLemmatizer
            - Tokens will be lemmatized, i.e. tokens will be converted into "root words"
            - Tokens will be normalized
   
    '''
    
    # normalize text
    text = re.sub(r"[^a-zA-Z0-9]"," ",text)
   
    # tokenize text
    tokens = word_tokenize(text)
    
    # remove tokens corresponding to stop words
    tokens = [word for word in tokens if not word in stopwords.words("english")]
    
    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # iterate through each token
    clean_tokens = []
    for tok in tokens:
        
        # lemmatize, normalize case, and remove leading/trailing white space
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens 
    
    #pass


In [6]:
def build_model():
    
    '''  Builds a pipeline model
    
    OUTPUT:
    model_pipeline - A pipeline-based model with the following characteristics:
            - makes use of "CountVectorizer" as vectorizer
            - makes use of "TfidfTransformer" as transformer
            - makes use of "MultiOutputClassifier", subtype "RandomForestClassifier", as classifier
            - makes use of GridSearchCV in order to find the optimal combination of differemt hyperparameters
    
    '''
    
    pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(KNeighborsClassifier()))    ])
    
    parameters = {
    
       'vect__ngram_range': [(1, 1)],
    
       'vect__max_df': [0.5],
       'vect__max_features': [None],

       'clf__estimator__n_neighbors': [2]
                } 
    
    
    cv = GridSearchCV(pipeline, parameters,verbose=3)
    #print(cv)
    return cv

In [7]:
def train(X_train, y_train, model):
    
    ''' Fits the model with the train components of X and Y
    
    INPUT:
    X_train - train component of the dataframe corresponding to the attribute variable
    y_train - train component of the dataframe corresponding to the target variable
    model - mathematical model that will be fitted with the train components of attribute variable X and target attribute y
   
    OUTPUT:
    model - mathematical model that was inserted as input of the function, already fitted with X_test and y_test   
   
    '''

    # fit model
    model.fit(X_train, y_train)
    
    return model

In [8]:
def evaluate_model(cv, X_test, y_test, category_names):    
    
    ''' Evaluates the model, providing the test_score by using ClassificationReport (useful for multi-target models)
    
    INPUT:
    model - mathematical model that was already fitted with X_test and y_test in the train function 
    X_test - dataframe that corresponds to the train component of the attribute variable X
    y_test - contains the train component of the target variable yy - dataframe corresponding to the target variable; will be divided into train ant test sets
 
   
    '''    
    
    y_test_pred = model.predict(X_test)

    # y_test_pred and y_train_pred are obtaines as numpy arrays
    # for further operations, we need to convert them into dataframe
    # therefore, y_test_pred_df and y_train_pred_df are introduced:

    y_headers = y_test.columns

    y_test_pred_df = pd.DataFrame(y_test_pred, columns = y_headers)

    
    for col in y_test:
        test_score = classification_report(y_test[col],y_test_pred_df[col],)
        #print("\nBest Parameters:", cv.best_params_)
        print(test_score)

In [9]:
def save_model(model):
    
    ''' Saves the model as a pickle file
    
    INPUT:
    model - mathematical model that was already fitted with X_test and y_test in the train function 
    model_filepath - path where model will be saved
   
    '''       
    
    #pkl_filename = '{}'.format(model_filepath)
    #with open(pkl_model, 'wb') as file:
        #pickle.dump(model, file)
    filename = 'classifier.pkl'
    pickle.dump(model, open(filename, 'wb'))


In [10]:







def main():
    
    ''' Performs a series of operations to build a pipeline-based model fitted and tested with information contained in the database file contained in database_filepath,
        evaluates the model and stores the model in the path defined by model_filepath
    
    INPUT:
    database_filepath - path of the .db file (database) created and stored by the process_data.py script 
    model_filepath - path where model will be saved
    '''
    
    #if len(sys.argv) == 3:
#database_filepath, model_filepath = sys.argv[1:]
#print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data()#database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print('Building model...')
model = build_model()

print('Training model...')
model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

#print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model)

print('Trained model saved!')




if __name__ == '__main__':
    main()

Building model...
Training model...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1);, score=0.231 total time= 9.3min
[CV 2/5] END clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1);, score=0.086 total time= 9.3min
[CV 3/5] END clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1);, score=0.265 total time= 9.1min
[CV 4/5] END clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1);, score=0.236 total time= 9.3min
[CV 5/5] END clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1);, score=0.240 total time= 9.2min
Evaluating model...
              precision    recall  f1-score   support

           0       0.62      0.25      0.35      1266
           1       0.79      0.95      0.86      39

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5206
           1       0.00      0.00      0.00        30

    accuracy                           0.99      5236
   macro avg       0.50      0.50      0.50      5236
weighted avg       0.99      0.99      0.99      5236

              precision    recall  f1-score   support

           0       0.59      0.99      0.74      3054
           1       0.78      0.05      0.09      2182

    accuracy                           0.60      5236
   macro avg       0.69      0.52      0.42      5236
weighted avg       0.67      0.60      0.47      5236

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      4819
           1       0.50      0.00      0.00       417

    accuracy                           0.92      5236
   macro avg       0.71      0.50      0.48      5236
weighted avg       0.89      0.92      0.88      5236

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5149
           1       0.00      0.00      0.00        87

    accuracy                           0.98      5236
   macro avg       0.49      0.50      0.50      5236
weighted avg       0.97      0.98      0.98      5236



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98      5080
           1       0.00      0.00      0.00       156

    accuracy                           0.97      5236
   macro avg       0.49      0.50      0.49      5236
weighted avg       0.94      0.97      0.96      5236

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5236

    accuracy                           1.00      5236
   macro avg       1.00      1.00      1.00      5236
weighted avg       1.00      1.00      1.00      5236

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      4908
           1       0.50      0.02      0.05       328

    accuracy                           0.94      5236
   macro avg       0.72      0.51      0.51      5236
weighted avg       0.91      0.94      0.91      5236

              precision    recall  f1-score   support

           0      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5171
           1       0.00      0.00      0.00        65

    accuracy                           0.99      5236
   macro avg       0.49      0.50      0.50      5236
weighted avg       0.98      0.99      0.98      5236

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      5053
           1       0.00      0.00      0.00       183

    accuracy                           0.96      5236
   macro avg       0.48      0.50      0.49      5236
weighted avg       0.93      0.96      0.95      5236

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5000
           1       0.89      0.03      0.07       236

    accuracy                           0.96      5236
   macro avg       0.92      0.52      0.52      5236
weighted avg       0.95      0.96      0.94      5236

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.95      1.00      0.98      4981
           1       0.00      0.00      0.00       255

    accuracy                           0.95      5236
   macro avg       0.48      0.50      0.49      5236
weighted avg       0.90      0.95      0.93      5236

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      4987
           1       0.67      0.02      0.03       249

    accuracy                           0.95      5236
   macro avg       0.81      0.51      0.50      5236
weighted avg       0.94      0.95      0.93      5236

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5129
           1       0.00      0.00      0.00       107

    accuracy                           0.98      5236
   macro avg       0.49      0.50      0.49      5236
weighted avg       0.96      0.98      0.97      5236



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5203
           1       0.00      0.00      0.00        33

    accuracy                           0.99      5236
   macro avg       0.50      0.50      0.50      5236
weighted avg       0.99      0.99      0.99      5236



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5185
           1       0.00      0.00      0.00        51

    accuracy                           0.99      5236
   macro avg       0.50      0.50      0.50      5236
weighted avg       0.98      0.99      0.99      5236



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5218
           1       0.00      0.00      0.00        18

    accuracy                           1.00      5236
   macro avg       0.50      0.50      0.50      5236
weighted avg       0.99      1.00      0.99      5236



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5167
           1       0.00      0.00      0.00        69

    accuracy                           0.99      5236
   macro avg       0.49      0.50      0.50      5236
weighted avg       0.97      0.99      0.98      5236



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5013
           1       0.00      0.00      0.00       223

    accuracy                           0.96      5236
   macro avg       0.48      0.50      0.49      5236
weighted avg       0.92      0.96      0.94      5236

              precision    recall  f1-score   support

           0       0.74      1.00      0.85      3800
           1       0.86      0.06      0.11      1436

    accuracy                           0.74      5236
   macro avg       0.80      0.53      0.48      5236
weighted avg       0.77      0.74      0.64      5236

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      4804
           1       1.00      0.00      0.00       432

    accuracy                           0.92      5236
   macro avg       0.96      0.50      0.48      5236
weighted avg       0.92      0.92      0.88      5236

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5181
           1       0.00      0.00      0.00        55

    accuracy                           0.99      5236
   macro avg       0.49      0.50      0.50      5236
weighted avg       0.98      0.99      0.98      5236

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      4774
           1       0.84      0.15      0.25       462

    accuracy                           0.92      5236
   macro avg       0.88      0.57      0.60      5236
weighted avg       0.92      0.92      0.90      5236

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5141
           1       1.00      0.02      0.04        95

    accuracy                           0.98      5236
   macro avg       0.99      0.51      0.52      5236
weighted avg       0.98      0.98      0.97      5236

              preci