In [1]:
import sys
import re
import numpy as np
import pandas as pd
import nltk
import pickle
import sqlite3
nltk.download(['punkt', 'wordnet'])
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
#from sqlalchemy import create_engine
import sqlite3

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\John\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\John\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\John\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:



def load_data():#database_filepath):
    
    ''' Load a database and provides attribute variable, target variables and headers corresponding to target variables
    
    INPUT:
    database_filepath - path of the .db file (database) created and stored by the process_data.py script
   
    
    OUTPUT:
    X - dataframe corresponding to the attribute variable (1 column, that corresponds to the text contained in the disaster_message.csv file which is input of the process_data.py script)
    Y - dataframe corresponding to the target variables (36 columns, correspond to each value of the "categories" column contained in the categories_message.csv file which is input of the                     process_data.py script) 
    category_names - headers of the Y dataframe
    '''
    #to run on a locally
    conn = sqlite3.connect('DisasterResponse.db')

    df=pd.read_sql('SELECT * FROM DisasterResponse', conn)
    
    #run at prompt
    #engine = create_engine('sqlite:///{}'.format(database_filepath))
   
    #df = pd.read_sql_table('DisasterResponseTable', engine) 
    
    df = df.replace(to_replace='None', value=np.nan)
    
    df=df[df["message"]!='#NAME?']
    
    #X = df["message"]
    X = pd.Series(df['message'])
    Y = df.drop(['id','message','original','genre'], axis=1)
    
    category_names = Y.columns
    
    return X, Y, category_names
        


In [5]:
X, Y, category_names =load_data()

In [6]:
X.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [7]:
def tokenize(text):
    
    ''' Tokenizes and normalizes the input text, removes stop words and symbols apart from letters and numbers
    
    INPUT:
    df- a text (string format)
      
    
    OUTPUT:
    clear tokens- a list of strings, obtained as a result of the following operations on the input text:
            - Everything but letters (uppercase und lowercase) and numbers will be removed
            - Text will be divided into separate elements, or "tokens"
            - Stop words corresponding to the English language will be removed
            - Tokens will be lemmatized, i.e. tokens will be converted into "root words",
                based on WordNetLemmatizer
            - Tokens will be lemmatized, i.e. tokens will be converted into "root words"
            - Tokens will be normalized
   
    '''
    
    # normalize text
    text = re.sub(r"[^a-zA-Z0-9]"," ",text)
   
    # tokenize text
    tokens = word_tokenize(text)
    
    # remove tokens corresponding to stop words
    tokens = [word for word in tokens if not word in stopwords.words("english")]
    
    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # iterate through each token
    clean_tokens = []
    for tok in tokens:
        
        # lemmatize, normalize case, and remove leading/trailing white space
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens 
    
    #pass


In [8]:
def build_model():
    
    '''  Builds a pipeline model
    
    OUTPUT:
    model_pipeline - A pipeline-based model with the following characteristics:
            - makes use of "CountVectorizer" as vectorizer
            - makes use of "TfidfTransformer" as transformer
            - makes use of "MultiOutputClassifier", subtype "RandomForestClassifier", as classifier
            - makes use of GridSearchCV in order to find the optimal combination of differemt hyperparameters
    
    '''
    
    pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(KNeighborsClassifier()))    ])
    
    parameters = {
    
       'vect__ngram_range': [(1, 1)],
    
       'vect__max_df': [0.5],
       'vect__max_features': [None],

       'clf__estimator__n_neighbors': [2]
                } 
    
    
    cv = GridSearchCV(pipeline, parameters,verbose=3)
    #print(cv)
    return cv

In [9]:
def train(X_train, y_train, model):
    
    ''' Fits the model with the train components of X and Y
    
    INPUT:
    X_train - train component of the dataframe corresponding to the attribute variable
    y_train - train component of the dataframe corresponding to the target variable
    model - mathematical model that will be fitted with the train components of attribute variable X and target attribute y
   
    OUTPUT:
    model - mathematical model that was inserted as input of the function, already fitted with X_test and y_test   
   
    '''

    # fit model
    model.fit(X_train, y_train)
    
    return model

In [10]:
def evaluate_model(cv, X_test, y_test, category_names):    
    
    ''' Evaluates the model, providing the test_score by using ClassificationReport (useful for multi-target models)
    
    INPUT:
    model - mathematical model that was already fitted with X_test and y_test in the train function 
    X_test - dataframe that corresponds to the train component of the attribute variable X
    y_test - contains the train component of the target variable yy - dataframe corresponding to the target variable; will be divided into train ant test sets
 
   
    '''    
    
    y_test_pred = model.predict(X_test)

    # y_test_pred and y_train_pred are obtaines as numpy arrays
    # for further operations, we need to convert them into dataframe
    # therefore, y_test_pred_df and y_train_pred_df are introduced:

    y_headers = y_test.columns

    y_test_pred_df = pd.DataFrame(y_test_pred, columns = y_headers)
    #print(y_test_pred_df)
    
    for col in y_test:
        #print(y_test_pred_df[col])#added as a test feature to print out the column as the test.
        #need to extract the col name
        print("Test Score results for Category..........",col)
        #print(y_headers[col])
        test_score = classification_report(y_test[col],y_test_pred_df[col])
        #print("\nBest Parameters:", cv.best_params_)
        print(test_score)

In [11]:
def save_model(model):
    
    ''' Saves the model as a pickle file
    
    INPUT:
    model - mathematical model that was already fitted with X_test and y_test in the train function 
    model_filepath - path where model will be saved
   
    '''       
    
    #pkl_filename = '{}'.format(model_filepath)
    #with open(pkl_model, 'wb') as file:
        #pickle.dump(model, file)
    filename = 'classifier.pkl'
    pickle.dump(model, open(filename, 'wb'))


In [13]:







def main():
    
    ''' Performs a series of operations to build a pipeline-based model fitted and tested with information contained in the database file contained in database_filepath,
        evaluates the model and stores the model in the path defined by model_filepath
    
    INPUT:
    database_filepath - path of the .db file (database) created and stored by the process_data.py script 
    model_filepath - path where model will be saved
    '''
    
    #if len(sys.argv) == 3:
#database_filepath, model_filepath = sys.argv[1:]
#print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data()#database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print('Building model...')
model = build_model()

print('Training model...')
model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model)

print('Trained model saved!')




if __name__ == '__main__':
    main()

Building model...
Training model...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1), score=0.258, total= 5.5min
[CV] clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.5min remaining:    0.0s


[CV]  clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1), score=0.259, total= 5.8min
[CV] clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 11.3min remaining:    0.0s


[CV]  clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1), score=0.262, total= 5.7min
[CV] clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1) 
[CV]  clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1), score=0.263, total= 5.3min
[CV] clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1) 
[CV]  clf__estimator__n_neighbors=2, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1), score=0.272, total= 5.0min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 27.3min finished


Evaluating model...
Test Score results for Category.......... related
              precision    recall  f1-score   support

           0       0.42      0.53      0.47      1181
           1       0.85      0.78      0.82      4017

    accuracy                           0.73      5198
   macro avg       0.63      0.66      0.64      5198
weighted avg       0.75      0.73      0.74      5198

Test Score results for Category.......... request
              precision    recall  f1-score   support

           0       0.85      0.99      0.91      4340
           1       0.72      0.10      0.17       858

    accuracy                           0.84      5198
   macro avg       0.78      0.54      0.54      5198
weighted avg       0.83      0.84      0.79      5198

Test Score results for Category.......... offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5175
           1       0.00      0.00      0.00        23

    accuracy  

  _warn_prf(average, modifier, msg_start, len(result))


clothing
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5121
           1       1.00      0.04      0.08        77

    accuracy                           0.99      5198
   macro avg       0.99      0.52      0.53      5198
weighted avg       0.99      0.99      0.98      5198

Test Score results for Category.......... money
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5080
           1       1.00      0.01      0.02       118

    accuracy                           0.98      5198
   macro avg       0.99      0.50      0.50      5198
weighted avg       0.98      0.98      0.97      5198

Test Score results for Category.......... missing_people
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5133
           1       1.00      0.02      0.03        65

    accuracy                           0.99      5198
   macro avg  

NameError: name 'model_filepath' is not defined

print(category_names)
for i in category_names:
    print(category_names[i])

y_test_pred = model.predict(X_test)

    # y_test_pred and y_train_pred are obtaines as numpy arrays
    # for further operations, we need to convert them into dataframe
    # therefore, y_test_pred_df and y_train_pred_df are introduced:

y_headers = y_test.columns


y_test_pred_df = pd.DataFrame(y_test_pred, columns = y_headers)
print(y_test_pred_df)
for col in y_test:
    print(y_test_pred_df[col])

In [None]:
print(len(category_names))

In [None]:
for i in range(len(category_names)):
    print(category_names[i])
    p=i+2
    print(p)

In [None]:
y_headers = Y_test.columns

print("Results for Category",y_headers[1])

In [None]:
 y_test_pred_df = pd.DataFrame(y_test_pred, columns = y_headers)