In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from datetime import datetime
from os import getcwd
from os.path import join

from sklearn.metrics import accuracy_score, confusion_matrix

from sys import path
path.append( join( getcwd(), 'functions' ) )

from functions import preprocessing, modelling, postprocessing



## 0. Parameters

In [37]:
current_time = datetime.now().strftime("%d-%m-%Y_%H_%M_%S")

sent_tokenizer = False
use_nltk_cleaning = False

text_cleaning = True

use_tfidf_tokenizer = False # TODO: Adjust for input to CNN
use_keras_tokenizer = True

use_glove_pretrained_embeddings_weights = False
use_tfidf_as_embedding_weights = False
use_pretrained_embeddings = True

data = {}

# Initialize Model
data['epochs'] = 30
data['batch_size'] = 16
data['num_words'] = 5000
data['cv'] = 4
data['n_iter'] = 5
data['seq_input_len'] = 40
data['embedding_dim'] = 40
data['nodes_hidden_dense_layer'] = 5


data['filepath'] = 'D:/Semillero Data Science/Deep Learning/pre-trained Word Embeddings/GloVe/glove.6B.50d.txt'
data['output_file'] = f"results/{current_time}_Result.txt"

In [48]:
param_grid = dict(num_filters_cv = [(64,16), (64,32), (128,16), (128,32), (256,64), (256,32), (256,64), (512,128), (512, 32)],
                  kernel_size_cv = [(2,3), (2,5), (2,4), (3,5), (3,7), (5,7)],
                  vocab_size = [3000, 4000, 5000, 600], 
                  embedding_dim = [20, 30, 40, 50], 
                  seq_input_len = [50, 40, 30, 20, 10], 
                  nodes_hidden_dense_layer = [5, 10, 15, 20, 40],
                  use_pretrained_embeddings = [True, False])

In [49]:
# Small Test
# param_grid = dict(num_filters_cv = [(64,16)],
#                   kernel_size_cv = [(2,3)],
#                   vocab_size = [5000], 
#                   embedding_dim = [50],
#                   seq_input_len = [50], 
#                   nodes_hidden_dense_layer = [5],
#                   use_pretrained_embeddings = [True])

## 1. Ingest Data

In [50]:
data_dir = 'D:/Data_Science/ClassificationProblems/Sentiment_Analysis/data/SA_4_Categories.csv'
corpus = pd.read_csv(data_dir ,encoding='latin-1', sep = ';')

In [51]:
corpus = preprocessing.data_cleaning(corpus = corpus,
                       sent_tokenizer = False, 
                       text_cleaning = True, 
                       use_nltk_cleaning = False)

In [52]:
corpus.head()

Unnamed: 0,text,label
0,stable way business life many corporate purcha...,0
1,dozens companies already learned supply demand...,0
2,capabilities profitable international business...,0
3,almost every kind manufacturer answer questions,0
4,companies already responded growing pressures,0


## 2. Preprocess Data

In [53]:
model_data = preprocessing.prepare_training_data(corpus)

# Concat two dictionaries
data = {**data, **model_data}

In [54]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label', 'X_train', 'X_test', 'vocab_size', 'vocab'])

## 3. Vectorization

In [55]:
if use_keras_tokenizer:
    data['X_train'], data['X_test'], data['vocab_size'], data['vocab'] = modelling.keras_tokenizer(num_words = data['num_words'], 
                                                                                         sentences_train = data['sentences_train'] , 
                                                                                         sentences_test = data['sentences_test'],
                                                                                         seq_input_len = data['seq_input_len'])
elif use_tfidf_tokenizer: # Not implemented yet
    data['X_train'], data['X_test'], data['vocab_size'], data['vocab'] = modelling.tfidf_tokenizer(num_words = data['num_words'],
                                                                                         corpus = corpus,
                                                                                         sentences_train = data['sentences_train'],
                                                                                         sentences_test = data['sentences_test'])
    
if use_tfidf_as_embedding_weights:
    
    data['embedding_matrix'], data['embedding_dim']  = modelling.tfidf_as_embedding_weights(num_words = data['num_words'], 
                                                                  corpus = corpus, 
                                                                  sentences_train = data['sentences_train'])
    
elif use_glove_pretrained_embeddings_weights:
    
    data['embedding_matrix'], data['embedding_dim'] = modelling.fit_pretrained_embedding_space_glove(embedding_dim = data['embedding_dim'], 
                                                                           filepath = data['filepath'] , 
                                                                           vocab = data['vocab'])

In [56]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label', 'X_train', 'X_test', 'vocab_size', 'vocab'])

In [57]:
data['embedding_dim'] 

40

In [58]:
data_pre = modelling.data_vectorization(sentences_train = data['sentences_train'], 
                       sentences_test = data['sentences_test'], 
                       num_words = data['num_words'], 
                       seq_input_len = data['seq_input_len'], 
                       filepath = data['filepath'],
                       corpus = corpus,
                       vocab = data['vocab'],
                       embedding_dim = data['embedding_dim'],
                       use_keras_tokenizer = True, 
                       use_tfidf_tokenizer = False, 
                       use_tfidf_as_embedding_weights = False,
                       use_glove_pretrained_embeddings_weights = True)

# Concat two dictionaries
data = {**data, **data_pre}

In [59]:
data.keys()

dict_keys(['epochs', 'batch_size', 'num_words', 'cv', 'n_iter', 'seq_input_len', 'embedding_dim', 'nodes_hidden_dense_layer', 'filepath', 'output_file', 'sentences_train', 'sentences_test', 'Y_train', 'Y_test', 'output_label', 'X_train', 'X_test', 'vocab_size', 'vocab', 'embedding_matrix'])

In [60]:
param_grid['embedding_matrix'] = ([data['embedding_matrix']])
param_grid['output_label'] = [data['output_label']]

## 4. Hyperparameter Optimization

In [61]:
Y_pred = modelling.hyperparameter_optimization( 
                            X_train = data['X_train'], 
                            Y_train = data['Y_train'], 
                            X_test = data['X_test'], 
                            Y_test = data['Y_test'] , 
                            epochs = data['epochs'] , 
                            batch_size = data['batch_size'],
                            param_grid = param_grid,
                            cv = 2, #data['cv'], 
                            n_iter = 1, #data['n_iter'],
                            output_file = data['output_file'],
                            verbose = False)

Creating Model...
Selecting Parameters...
Evaluating Model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 40)            117560    
_________________________________________________________________
conv1d (Conv1D)              (None, 39, 256)           20736     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 36, 64)            65600     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 5)                 325       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 30        
Total params: 204,251
Trainable params: 86,691
Non-trainable



 label accuracy could not be calculated
Writting results...
Running CNN Modeling data set
Best Accuracy : 0.7968
{'vocab_size': 3000, 'use_pretrained_embeddings': True, 'seq_input_len': 40, 'output_label': 5, 'num_filters_cv': (256, 64), 'nodes_hidden_dense_layer': 5, 'kernel_size_cv': (2, 4), 'embedding_matrix': array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.14896999, -0.41446   ,  0.57534999, ...,  0.025145  ,
         0.32242   , -0.053632  ],
       [-0.097028  , -0.44775   , -0.25226   , ...,  0.71882999,
        -0.39734   ,  0.12426   ],
       ...,
       [ 0.76950002,  0.38067999, -0.32255   , ...,  0.20100001,
        -0.38712999,  1.02090001],
       [-0.039673  , -0.021234  , -0.43268001, ..., -0.080242  ,
        -0.92989999, -0.18745001],
       [ 0.77649999, -0.70692003, -0.24706   , ...,  0.77344   ,
        -0.55791003, -0.21297   ]]), 'embedding_dim': 50}
Test Accuracy : 0.8065




## 5. Score Analysis

In [62]:
# Generate Confusion Matrix
conf_matrix = confusion_matrix(Y_pred, data['Y_test'].argmax(axis=1)) / len(Y_pred)

# Calculate Label Accuracy
label_acc = postprocessing.cal_label_accuracy(conf_matrix, verbose  = 1)

Accuracy for label 0 :  88.09  %
Accuracy for label 1 :  66.67  %
Accuracy for label 2 :  44.0  %
Accuracy for label 3 :  50.0  %
Accuracy for label 4 :  59.26  %
