In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from datetime import datetime
from os import getcwd
from os.path import join

from sklearn.metrics import accuracy_score, confusion_matrix

from sys import path
path.append( join( join( getcwd() , 'functions/' ) ) )

from functions import preprocessing, modelling, postprocessing


## 0. Parameters

In [25]:
current_time = datetime.now().strftime("%d-%m-%Y_%H_%M_%S")

sent_tokenizer = False # TODO: Adjust for input to CNN

# Text Cleaning Options
use_nltk_cleaning = False
text_cleaning = True

# Word Tokenizer Options
use_tfidf_tokenizer = False # TODO: Adjust for input to CNN
use_keras_tokenizer = True

# If set to FALSE then keras embedding space training is used instead
# Embedding Space possibilites are GloVe or TFIDF
use_pretrained_embeddings = True

# Only if use_pretrained_embeddings == True then select embedding vector space type
use_glove_pretrained_embeddings_weights = True
use_tfidf_as_embedding_weights = False

# Dictionary which will cotain all the model's variables
data = {}

# Initialize Model
data['epochs'] = 30 # NO. of optimizatoin runs
data['batch_size'] = 16 # No. of sentences batch to train
data['num_words'] = 5000 # No. of words to use in the embedding space of GloVe or TFIDF
data['cv'] = 4 # No. of Cross Validations
data['n_iter'] = 5 # No. of Iterations
data['seq_input_len'] = 40 # Length of the vector sentence ( no. of words per sentence)
data['embedding_dim'] = 40 # Length of the word vector ( dimension in the embedding space)
data['nodes_hidden_dense_layer'] = 5 # No. of nodes for hidden Dense layer


data['filepath'] = 'D:/Semillero Data Science/Deep Learning/pre-trained Word Embeddings/GloVe/glove.6B.50d.txt' # File path to GLoVe pretrained embedding words
data['output_file'] = f"results/{current_time}_Result.txt" # Name of output result file

In [26]:
param_grid = dict(num_filters_cv = [(64,16), (64,32), (128,16), (128,32), (256,64), (256,32), (256,64), (512,128), (512, 32)], # No of filter to use in convolution
                  kernel_size_cv = [(2,3), (2,4), (3,4), (3,5)], # No of words to check per Convolution 
                  vocab_size = [3000, 4000, 5000, 6000], # Vocab size if keras embedding space training is wanted
                  embedding_dim = [20, 30, 40, 50], 
                  seq_input_len = [50, 40, 30, 20, 10], 
                  nodes_hidden_dense_layer = [5, 10, 15, 20, 40],
                  use_pretrained_embeddings = [True, False])

In [27]:
# Small Test
param_grid = dict(num_filters_cv = [(64,16)],
                  kernel_size_cv = [(2,3)],
                  vocab_size = [5000], 
                  embedding_dim = [50],
                  seq_input_len = [50], 
                  nodes_hidden_dense_layer = [5],
                  use_pretrained_embeddings = [True])

## 1. Ingest Data

In [28]:
data_dir = 'D:/Data_Science/ClassificationProblems/Sentiment_Analysis/data/SA_4_Categories.csv'
corpus = pd.read_csv(data_dir ,encoding='latin-1', sep = ';')

In [29]:
corpus = preprocessing.data_cleaning(corpus = corpus,
                       sent_tokenizer = False, 
                       text_cleaning = True, 
                       use_nltk_cleaning = False)

In [30]:
corpus.head()

Unnamed: 0,text,label
0,stable way business life many corporate purcha...,0
1,dozens companies already learned supply demand...,0
2,capabilities profitable international business...,0
3,almost every kind manufacturer answer questions,0
4,companies already responded growing pressures,0


## 2. Preprocess Data

In [31]:
model_data = preprocessing.prepare_training_data(corpus)

# Concat two dictionaries
data = {**data, **model_data}

In [32]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label'])

## 3. Vectorization

In [33]:
if use_keras_tokenizer:
    data['X_train'], data['X_test'], data['vocab_size'], data['vocab'] = modelling.keras_tokenizer(num_words = data['num_words'], 
                                                                                         sentences_train = data['sentences_train'] , 
                                                                                         sentences_test = data['sentences_test'],
                                                                                         seq_input_len = data['seq_input_len'])
elif use_tfidf_tokenizer: # Not implemented yet
    data['X_train'], data['X_test'], data['vocab_size'], data['vocab'] = modelling.tfidf_tokenizer(num_words = data['num_words'],
                                                                                         corpus = corpus,
                                                                                         sentences_train = data['sentences_train'],
                                                                                         sentences_test = data['sentences_test'])
    
if use_tfidf_as_embedding_weights:
    
    data['embedding_matrix'], data['embedding_dim']  = modelling.tfidf_as_embedding_weights(num_words = data['num_words'], 
                                                                  corpus = corpus, 
                                                                  sentences_train = data['sentences_train'])
    
elif use_glove_pretrained_embeddings_weights:
    
    data['embedding_matrix'], data['embedding_dim'] = modelling.fit_pretrained_embedding_space_glove(embedding_dim = data['embedding_dim'], 
                                                                           filepath = data['filepath'] , 
                                                                           vocab = data['vocab'])

In [11]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label', 'X_train', 'X_test', 'vocab_size', 'vocab', 'embedding_matrix'])

In [12]:
data['embedding_dim'] 

40

In [13]:
data_pre = modelling.data_vectorization(sentences_train = data['sentences_train'], 
                       sentences_test = data['sentences_test'], 
                       num_words = data['num_words'], 
                       seq_input_len = data['seq_input_len'], 
                       filepath = data['filepath'],
                       corpus = corpus,
                       vocab = data['vocab'],
                       embedding_dim = data['embedding_dim'],
                       use_keras_tokenizer = use_keras_tokenizer, 
                       use_tfidf_tokenizer = use_tfidf_tokenizer, 
                       use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
                       use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights)

# Concat two dictionaries
data = {**data, **data_pre}

In [14]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label', 'X_train', 'X_test', 'vocab_size', 'vocab', 'embedding_matrix'])

In [15]:
param_grid['embedding_matrix'] = ([data['embedding_matrix']])
param_grid['output_label'] = [data['output_label']]

## 4. Hyperparameter Optimization

Reference: https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

In [19]:
model_output = modelling.hyperparameter_optimization( 
                            X_train = data['X_train'], 
                            Y_train = data['Y_train'], 
                            X_test = data['X_test'], 
                            Y_test = data['Y_test'] , 
                            epochs = data['epochs'] , 
                            batch_size = data['batch_size'],
                            param_grid = param_grid,
                            cv = data['cv'], 
                            n_iter = data['n_iter'],
                            verbose = False)

Creating Model...
Selecting Parameters...
Evaluating Model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 40)            118520    
_________________________________________________________________
conv1d (Conv1D)              (None, 49, 64)            5184      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 47, 16)            3088      
_________________________________________________________________
global_max_pooling1d (Global (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 5)                 85        
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 30        
Total params: 126,907
Trainable params: 8,387
Non-trainable 

## 5. Score Analysis

In [20]:
model_output.keys()

dict_keys(['best_train_acc', 'best_train_param', 'test_acc', 'conf_matrix', 'Y_pred', 'grid_result'])

In [21]:
# Generate Confusion Matrix
conf_matrix = confusion_matrix(model_output['Y_pred'], data['Y_test'].argmax(axis=1)) / len(model_output['Y_pred'])

# Calculate Label Accuracy
model_output['label_acc'] = postprocessing.cal_label_accuracy(conf_matrix, verbose  = 1)

Accuracy for label 0 :  88.85  %
Accuracy for label 1 :  55.56  %
Accuracy for label 2 :  41.18  %
Accuracy for label 3 :  44.83  %
Accuracy for label 4 :  44.0  %


## 6. Write Results to text file

In [23]:
postprocessing.write_results_txt(output_file = data['output_file'], 
                  best_train_acc = model_output['best_train_acc'], 
                  best_train_param = model_output['best_train_param'],
                  test_acc = model_output['test_acc'], 
                  label_acc = model_output['label_acc'] , 
                  sent_tokenizer = sent_tokenizer, 
                  use_nltk_cleaning = use_nltk_cleaning, 
                  text_cleaning = text_cleaning , 
                  use_tfidf_tokenizer = use_tfidf_tokenizer, 
                  use_keras_tokenizer = use_keras_tokenizer, 
                  use_pretrained_embeddings = use_pretrained_embeddings,
                  use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
                  use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
                  epochs = data['epochs'],
                  batch_size = data['batch_size'],
                  num_words = data['num_words'], 
                  cv = data['cv'] ,
                  n_iter = data['n_iter']
 )

Writting results...
Running CNN Modeling 
 
            Best Accuracy : 0.7786069628512287
  
            Test Accuracy : 0.8014888167381287

            epochs : 30 

            batch size : 16 

            cross validations : 4 

            No. Iterations : 5 

            sent_tokenizer : False 
   
            use_nltk_cleaning: False
 
            text_cleaning: True
  
            use_tfidf_tokenizer: False
 
            use_keras_tokenizer: True
 
            use_pretrained_embeddings: True
 
            use_glove_pretrained_embeddings_weights: True
 
            use_tfidf_as_embedding_weights: False
 
            best param: {'vocab_size': 5000, 'use_pretrained_embeddings': True, 'seq_input_len': 50, 'output_label': 5, 'num_filters_cv': (64, 16), 'nodes_hidden_dense_layer': 5, 'kernel_size_cv': (2, 3), 'embedding_matrix': array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.14896999, -0.41446   ,  0.57534999, ...,  0.