Candidate: Ivomar Brito Soares

Email: ivomarbsoares@gmail.com

## Summary

<ul>
<li>Importing libraries</li>
<li>Utility methods</li>
<li>Defining variables</li>
<li>Reading data set</li>
<li>Preprocessing</li>
<li>Feature Extraction: Term Frequency - Inverse Document Frequency (TF-IDF)</li>
<li>Preparing categorical target variable</li>
<li>Loading trained deep learning model</li>
<li>Model evaluation and performance report</li>
</ul>

## Importing libraries

In [None]:
import pandas as pd
from keras.models import model_from_json
from sklearn.metrics import classification_report

# Data pre-processing modules
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from textblob import Word
from sklearn import preprocessing

# TFIDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Utility methods

In [None]:
def basic_preprocessing(dataset, feature_name):
    """
    Clean the text column ``feature_name`` of ``dataset`` in place.

    These are the basic pre-processing steps followed in this function, in order:
    - Convert text to lower case (runs of whitespace collapse to a single space).
    - Punctuation removal.
    - Stop words removal.
    - Lemmatization.

    Additional possible pre-processing steps (future work):
    - Common words removal.
    - Rare words removal.
    - Spelling correction.
    - Keeping words of length of at least 3.

    Parameters:
        dataset: pandas DataFrame holding the text column; it is modified in place.
        feature_name: name of the column of strings to clean. Every value must be a
            string, because each one is split into tokens.

    Returns:
        None. The cleaned text overwrites ``dataset[feature_name]``.
    """
    text = dataset[feature_name]

    # The first pre-processing is to convert all text into lower case, this avoids having multiple copies
    # of the same words.
    text = text.apply(lambda doc: " ".join(token.lower() for token in doc.split()))

    # Punctuation removal, often it does not add extra information when dealing with text data. Removing them helps
    # reduce the size of the training data.
    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
    # which would leave all punctuation in place. The raw string avoids invalid "\w" / "\s" escapes.
    text = text.str.replace(r'[^\w\s]', '', regex=True)

    # Stop words (frequently occurring words) should be removed from the dataset.
    # A set gives constant-time membership tests for every token.
    stop = set(stopwords.words('english'))
    text = text.apply(lambda doc: " ".join(token for token in doc.split() if token not in stop))

    # Lemmatization: Converts the word into its root word.
    text = text.apply(lambda doc: " ".join(Word(token).lemmatize() for token in doc.split()))

    dataset[feature_name] = text
    
def prepare_targets(y_train):
    """
    Converts non-numerical categorical labels to numerical categorical labels.

    A new LabelEncoder is fitted on ``y_train`` and then used to transform those same
    labels; the encoded labels are returned.
    """
    encoder = preprocessing.LabelEncoder()
    # fit() returns the fitted encoder itself, so the two steps can be chained.
    return encoder.fit(y_train).transform(y_train)

## Defining variables

Change these variables to the desired values in order to load and score the evaluation dataset.

In [None]:
# Path of the CSV file that is read as the evaluation dataset.
path_to_evaluation_dataset = 'test_data.csv'
# Column holding the class labels; rows where it is missing are dropped before evaluation.
categorical_target_name = 'categorical_target_1'
# Column holding the raw text that is pre-processed and vectorized with TF-IDF.
features_column_name = 'features'
# NOTE(review): nb_classes is not referenced by any other cell visible in this notebook.
nb_classes = 43      # Chosen target variable, categorical_target_1 with 43 unique values or classes.
# File with the model architecture serialized as JSON.
model_json_file = 'model.json'
# File with the model weights that are loaded into the deserialized architecture.
model_h5_file = 'model.h5'
# Batch size passed to the model when predicting on the evaluation data.
batch_size = 64

## Reading data set

In [None]:
# Load the evaluation data from the configured CSV path into a pandas DataFrame.
dataset = pd.read_csv(path_to_evaluation_dataset)

## Preprocessing

In [None]:
# Dropping missing values. Rows without a target cannot be scored, and rows without feature text
# would make basic_preprocessing fail: it calls .split() on every value, which a NaN does not have.
dataset.dropna(subset=[categorical_target_name, features_column_name], inplace=True)

# Cleans the text column in place.
basic_preprocessing(dataset, features_column_name)

## Feature Extraction: Term Frequency - Inverse Document Frequency (TF-IDF)

In [None]:
# NOTE(review): the vectorizer is fitted on the evaluation data here, so the vocabulary and IDF weights
# (and possibly the number of columns) come from this dataset rather than from the data the model was
# trained on. If the training-time vectorizer was saved, load it and call transform() instead - confirm.
tfidf_vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
    strip_accents='unicode',
    norm='l2',
)
# toarray() yields a plain numpy ndarray, whereas todense() returns the legacy numpy.matrix type.
X_test = tfidf_vectorizer.fit_transform(dataset[features_column_name]).toarray()

## Preparing categorical target variable

In [None]:
# Encode the target column as integer class codes.
# NOTE(review): prepare_targets fits a new LabelEncoder on these evaluation labels; verify that the
# resulting codes match the encoding that was used when the model was trained.
y_test_enc = prepare_targets(dataset[categorical_target_name])

## Loading trained deep learning model

In [None]:
# Read the serialized architecture; the context manager closes the file even if reading fails.
with open(model_json_file, 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(model_h5_file)
print("Loaded model from disk")
# The model is compiled after loading, with the loss, optimizer and metric given here.
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## Model evaluation and performance report

In [None]:
# Sequential.predict_classes() was removed from recent Keras / TensorFlow releases. For a multi-class
# model it is equivalent to taking the argmax over the predicted class probabilities of each row.
y_test_predclass = loaded_model.predict(X_test, batch_size=batch_size).argmax(axis=-1)

In [None]:
# Print a header, then the classification report comparing the encoded labels with the predictions.
print("Deep Neural Network - Test Classification Report")
test_report = classification_report(y_test_enc, y_test_predclass)
print(test_report)