In [41]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from datetime import datetime
from os import getcwd
from os.path import join

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import model_selection, naive_bayes, svm

from sys import path
path.append( join( join( getcwd() , 'functions/' ) ) )

from functions import preprocessing, modelling, postprocessing
from config import ConfigDict



In [42]:
config = ConfigDict.read('config/config_param.yml')

In [43]:
config

{'params': {'input_data': {'5_labels': 'data/SA_4_Categories.csv'},
  'tokenization_options': {'sent_tokenizer': False,
   'use_nltk_cleaning': False,
   'text_cleaning': True,
   'use_tfidf_tokenizer': False,
   'use_keras_tokenizer': True,
   'use_pretrained_embeddings': True,
   'use_glove_pretrained_embeddings_weights': False,
   'use_tfidf_as_embedding_weights': True},
  'data': {'epochs': 30,
   'batch_size': 16,
   'num_words': 5000,
   'cv': 4,
   'n_iter': 5,
   'seq_input_len': 40,
   'embedding_dim': 40,
   'nodes_hidden_dense_layer': 5,
   'filepath': 'D:/Semillero Data Science/Deep Learning/pre-trained Word Embeddings/GloVe/glove.6B.50d.txt'},
  'hyperparam': {'num_filters_cv': [[64, 16], [64, 32], [128, 16], [128, 32], [256, 64], [256, 32], [256, 64], [512, 128], [512, 32]],
   'kernel_size_cv': [[2, 3], [2, 4], [3, 4], [3, 5]],
   'vocab_size': [3000, 4000, 5000, 6000],
   'embedding_dim': [20, 30, 40, 50],
   'seq_input_len': [50, 40, 30, 20, 10],
   'nodes_hidden_dense

In [44]:
current_time = datetime.now().strftime("%d-%m-%Y_%H_%M_%S")

sent_tokenizer = config['params']['tokenization_options']['sent_tokenizer'] # TODO: Adjust for input to CNN

# Text Cleaning Options
use_nltk_cleaning = config['params']['tokenization_options']['use_nltk_cleaning']
text_cleaning = config['params']['tokenization_options']['text_cleaning']

# Word Tokenizer Options
use_tfidf_tokenizer = config['params']['tokenization_options']['use_tfidf_tokenizer'] # TODO: Adjust for input to CNN
use_keras_tokenizer = config['params']['tokenization_options']['use_keras_tokenizer']

# If set to FALSE then keras embedding space training is used instead
# Embedding Space possibilites are GloVe or TFIDF
use_pretrained_embeddings = config['params']['tokenization_options']['use_pretrained_embeddings']

# Only if use_pretrained_embeddings == True then select embedding vector space type
use_glove_pretrained_embeddings_weights = config['params']['tokenization_options']['use_glove_pretrained_embeddings_weights']
use_tfidf_as_embedding_weights = config['params']['tokenization_options']['use_tfidf_as_embedding_weights']

# Dictionary which will cotain all the model's variables
data = {}

# Initialize Model
data['epochs'] = config['params']['data']['epochs'] # NO. of optimizatoin runs
data['batch_size'] = config['params']['data']['batch_size'] # No. of sentences batch to train
data['num_words'] = config['params']['data']['num_words'] # No. of words to use in the embedding space of GloVe or TFIDF
data['cv'] = config['params']['data']['cv'] # No. of Cross Validations
data['n_iter'] = config['params']['data']['n_iter'] # No. of Iterations
data['seq_input_len'] = config['params']['data']['seq_input_len'] # Length of the vector sentence ( no. of words per sentence)
data['embedding_dim'] = config['params']['data']['embedding_dim'] # Length of the word vector ( dimension in the embedding space)
data['nodes_hidden_dense_layer'] = config['params']['data']['nodes_hidden_dense_layer'] # No. of nodes for hidden Dense layer


data['filepath'] = config['params']['data']['filepath'] # File path to GLoVe pretrained embedding words
data['output_file'] = f"results/{current_time}_Result.txt" # Name of output result file

In [45]:
data

{'epochs': 30,
 'batch_size': 16,
 'num_words': 5000,
 'cv': 4,
 'n_iter': 5,
 'seq_input_len': 40,
 'embedding_dim': 40,
 'nodes_hidden_dense_layer': 5,
 'filepath': 'D:/Semillero Data Science/Deep Learning/pre-trained Word Embeddings/GloVe/glove.6B.50d.txt',
 'output_file': 'results/05-01-2021_21_42_53_Result.txt'}

In [46]:
param_grid = dict(num_filters_cv = config['params']['hyperparam']['num_filters_cv'], # No of filter to use in convolution
                  kernel_size_cv = config['params']['hyperparam']['kernel_size_cv'], # No of words to check per Convolution 
                  vocab_size = config['params']['hyperparam']['vocab_size'], # Vocab size if keras embedding space training is wanted
                  embedding_dim = config['params']['hyperparam']['embedding_dim'], 
                  seq_input_len = config['params']['hyperparam']['seq_input_len'], 
                  nodes_hidden_dense_layer = config['params']['hyperparam']['nodes_hidden_dense_layer'],
                  use_pretrained_embeddings = config['params']['hyperparam']['use_pretrained_embeddings']
                   )

## 0. Parameters

In [47]:
# current_time = datetime.now().strftime("%d-%m-%Y_%H_%M_%S")

# sent_tokenizer = False # TODO: Adjust for input to CNN

# # Text Cleaning Options
# use_nltk_cleaning = False
# text_cleaning = True

# # Word Tokenizer Options
# use_tfidf_tokenizer = False # TODO: Adjust for input to CNN
# use_keras_tokenizer = True

# # If set to FALSE then keras embedding space training is used instead
# # Embedding Space possibilites are GloVe or TFIDF
# use_pretrained_embeddings = True

# # Only if use_pretrained_embeddings == True then select embedding vector space type
# use_glove_pretrained_embeddings_weights = True
# use_tfidf_as_embedding_weights = False

# # Dictionary which will cotain all the model's variables
# data = {}

# # Initialize Model
# data['epochs'] = 30 # NO. of optimizatoin runs
# data['batch_size'] = 16 # No. of sentences batch to train
# data['num_words'] = 5000 # No. of words to use in the embedding space of GloVe or TFIDF
# data['cv'] = 4 # No. of Cross Validations
# data['n_iter'] = 5 # No. of Iterations
# data['seq_input_len'] = 40 # Length of the vector sentence ( no. of words per sentence)
# data['embedding_dim'] = 40 # Length of the word vector ( dimension in the embedding space)
# data['nodes_hidden_dense_layer'] = 5 # No. of nodes for hidden Dense layer


# data['filepath'] = 'D:/Semillero Data Science/Deep Learning/pre-trained Word Embeddings/GloVe/glove.6B.50d.txt' # File path to GLoVe pretrained embedding words
# data['output_file'] = f"results/{current_time}_Result.txt" # Name of output result file

In [48]:
# param_grid = dict(num_filters_cv = [(64,16), (64,32), (128,16), (128,32), (256,64), (256,32), (256,64), (512,128), (512, 32)], # No of filter to use in convolution
#                   kernel_size_cv = [(2,3), (2,4), (3,4), (3,5)], # No of words to check per Convolution 
#                   vocab_size = [3000, 4000, 5000, 6000], # Vocab size if keras embedding space training is wanted
#                   embedding_dim = [20, 30, 40, 50], 
#                   seq_input_len = [50, 40, 30, 20, 10], 
#                   nodes_hidden_dense_layer = [5, 10, 15, 20, 40],
#                   use_pretrained_embeddings = [True, False])

In [49]:
# # Small Test
# param_grid = dict(num_filters_cv = [(64,16)],
#                   kernel_size_cv = [(2,3)],
#                   vocab_size = [5000], 
#                   embedding_dim = [50],
#                   seq_input_len = [50], 
#                   nodes_hidden_dense_layer = [5],
#                   use_pretrained_embeddings = [True])

## 1. Ingest Data

In [50]:
data_dir = 'D:/Data_Science/ClassificationProblems/Sentiment_Analysis/data/SA_4_Categories.csv'
corpus = pd.read_csv(data_dir ,encoding='latin-1', sep = ';')

In [51]:
corpus = preprocessing.data_cleaning(corpus = corpus,
                       sent_tokenizer = False, 
                       text_cleaning = True, 
                       use_nltk_cleaning = False)

In [52]:
corpus.head()

Unnamed: 0,text,label
0,stable way business life many corporate purcha...,0
1,dozens companies already learned supply demand...,0
2,capabilities profitable international business...,0
3,almost every kind manufacturer answer questions,0
4,companies already responded growing pressures,0


## 2. Preprocess Data

In [53]:
model_data = preprocessing.prepare_training_data(corpus)

# Concat two dictionaries
data = {**data, **model_data}

In [54]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label'])

## 3. Vectorization

In [55]:
if use_keras_tokenizer:
    data['X_train'], data['X_test'], data['vocab_size'], data['vocab'] = modelling.keras_tokenizer(num_words = data['num_words'], 
                                                                                         sentences_train = data['sentences_train'] , 
                                                                                         sentences_test = data['sentences_test'],
                                                                                         seq_input_len = data['seq_input_len'])
elif use_tfidf_tokenizer: # Not implemented yet
    data['X_train'], data['X_test'], data['vocab_size'], data['vocab'] = modelling.tfidf_tokenizer(num_words = data['num_words'],
                                                                                         corpus = corpus,
                                                                                         sentences_train = data['sentences_train'],
                                                                                         sentences_test = data['sentences_test'])
    
if use_tfidf_as_embedding_weights:
    
    data['embedding_matrix'], data['embedding_dim']  = modelling.tfidf_as_embedding_weights(num_words = data['num_words'], 
                                                                  corpus = corpus, 
                                                                  sentences_train = data['sentences_train'])
    
elif use_glove_pretrained_embeddings_weights:
    
    data['embedding_matrix'], data['embedding_dim'] = modelling.fit_pretrained_embedding_space_glove(embedding_dim = data['embedding_dim'], 
                                                                           filepath = data['filepath'] , 
                                                                           vocab = data['vocab'])

In [56]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label', 'X_train', 'X_test', 'vocab_size', 'vocab', 'embedding_matrix'])

In [57]:
data['embedding_dim'] 

1206

In [58]:
data_pre = modelling.data_vectorization(sentences_train = data['sentences_train'], 
                       sentences_test = data['sentences_test'], 
                       num_words = data['num_words'], 
                       seq_input_len = data['seq_input_len'], 
                       filepath = data['filepath'],
                       corpus = corpus,
                       vocab = data['vocab'],
                       embedding_dim = data['embedding_dim'],
                       use_keras_tokenizer = use_keras_tokenizer, 
                       use_tfidf_tokenizer = use_tfidf_tokenizer, 
                       use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
                       use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights)

# Concat two dictionaries
data = {**data, **data_pre}

In [59]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label', 'X_train', 'X_test', 'vocab_size', 'vocab', 'embedding_matrix'])

In [60]:
param_grid['embedding_matrix'] = ([data['embedding_matrix']])
param_grid['output_label'] = [data['output_label']]

## 4. Hyperparameter Optimization

Reference: https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

In [61]:
model_output = modelling.hyperparameter_optimization( 
                            X_train = data['X_train'], 
                            Y_train = data['Y_train'], 
                            X_test = data['X_test'], 
                            Y_test = data['Y_test'] , 
                            epochs = data['epochs'] , 
                            batch_size = data['batch_size'],
                            param_grid = param_grid,
                            cv = data['cv'], 
                            n_iter = data['n_iter'],
                            verbose = False)

embedding_8 (Embedding)      (None, 40, 1206)          4063014   
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 39, 256)           617728    
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 36, 64)            65600     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 64)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 10)                650       
_________________________________________________________________
dense_17 (Dense)             (None, 5)                 55        
Total params: 4,747,047
Trainable params: 684,033
Non-trainable params: 4,063,014
_________________________________________________________________
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Sh

## 5. Score Analysis

In [62]:
model_output.keys()

dict_keys(['best_train_acc', 'best_train_param', 'test_acc', 'conf_matrix', 'Y_pred', 'grid_result'])

In [63]:
# Generate Confusion Matrix
conf_matrix = confusion_matrix(model_output['Y_pred'], data['Y_test'].argmax(axis=1)) / len(model_output['Y_pred'])

# Calculate Label Accuracy
model_output['label_acc'] = postprocessing.cal_label_accuracy(conf_matrix, verbose  = 1)

Accuracy for label 0 :  90.27  %
Accuracy for label 1 :  50.0  %
Accuracy for label 2 :  24.14  %
Accuracy for label 3 :  66.67  %
Accuracy for label 4 :  36.0  %


## 6. Write Results to text file

In [64]:
postprocessing.write_results_txt(output_file = data['output_file'], 
                  best_train_acc = model_output['best_train_acc'], 
                  best_train_param = model_output['best_train_param'],
                  test_acc = model_output['test_acc'], 
                  label_acc = model_output['label_acc'] , 
                  sent_tokenizer = sent_tokenizer, 
                  use_nltk_cleaning = use_nltk_cleaning, 
                  text_cleaning = text_cleaning , 
                  use_tfidf_tokenizer = use_tfidf_tokenizer, 
                  use_keras_tokenizer = use_keras_tokenizer, 
                  use_pretrained_embeddings = use_pretrained_embeddings,
                  use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
                  use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
                  epochs = data['epochs'],
                  batch_size = data['batch_size'],
                  num_words = data['num_words'], 
                  cv = data['cv'] ,
                  n_iter = data['n_iter']
 )

Writting results...
Running CNN Modeling 
 
            Best Accuracy : 0.7927031525925025
  
            Test Accuracy : 0.8064516186714172

            epochs : 30 

            batch size : 16 

            cross validations : 4 

            No. Iterations : 5 

            sent_tokenizer : False 
   
            use_nltk_cleaning: False
 
            text_cleaning: True
  
            use_tfidf_tokenizer: False
 
            use_keras_tokenizer: True
 
            use_pretrained_embeddings: True
 
            use_glove_pretrained_embeddings_weights: False
 
            use_tfidf_as_embedding_weights: True
 
            best param: {'vocab_size': 6000, 'use_pretrained_embeddings': True, 'seq_input_len': 10, 'output_label': 5, 'num_filters_cv': [128, 16], 'nodes_hidden_dense_layer': 5, 'kernel_size_cv': [2, 3], 'embedding_matrix': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0.,