In [1]:
import pandas as pd
import numpy as np

import datetime

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from datetime import datetime
from os import getcwd
from os.path import join

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import model_selection, naive_bayes, svm

from sys import path
path.append( join( join( getcwd() , 'functions/' ) ) )

from functions import preprocessing, modelling, postprocessing
from config import ConfigDict

import openpyxl


# temp
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer


# Run only for the first time#
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')


In [2]:

def preprocess(data_dir, data, param_grid, read_type, sep, remove_class_0, 
                make_all_other_classes_1, running_CNN, running_SVM,
                timestamp, output_path_vectorizer, store_tfidf_tokenizer, 
                store_keras_tokenizer, file_path_glove,  debbug = False ):

    #----------------------------------------------------------------#
    # 1. Ingest Data
    #----------------------------------------------------------------#   

    if read_type == 'excel':
    
        corpus = pd.read_excel(input_data, engine='openpyxl')

    elif read_type == 'csv':

        corpus = pd.read_csv( input_data, sep = sep)

    # Filter all NAs values
    corpus.dropna(inplace= True)

    # Make Sure labels are integers
    corpus['label'] = corpus['label'].astype(int)

    # Perform data cleaning
    corpus = preprocessing.data_cleaning(corpus = corpus,
                        sent_tokenizer = False, 
                        text_cleaning = True, 
                        use_nltk_cleaning = False)


    #----------------------------------------------------------------#
    # 2. Preprocess Data
    #----------------------------------------------------------------#

    # Create Filter on the data to avoid the imbalance classes problem
    if make_all_other_classes_1:

        corpus['label_orignal'] = corpus.loc[:,'label']
        corpus['label'] = np.where( corpus['label'] > 0 , 1, corpus['label'])

    if remove_class_0:

        corpus['label_orignal'] = corpus.loc[:,'label']
        corpus = corpus[~corpus['label'].isin([0])]

        # Reindex classes
        corpus['label'] = corpus['label'].map({1:0,2:1,3:2,4:3})

    print(" The unique labels are ", corpus['label'].unique())

    model_data = preprocessing.prepare_training_data(corpus)

    # Concat two dictionaries
    data = {**data, **model_data}

    if debbug:
        data['corpus'] = corpus
        return data, param_grid

    #----------------------------------------------------------------#
    # 3. Vectorization
    #----------------------------------------------------------------#


    if running_CNN:
    
        data['X_train_CNN'], data['X_test_CNN'], data['vocab_size'], data['vocab'] = modelling.keras_tokenizer(num_words = data['num_words'], 
                                                                                            sentences_train = data['sentences_train_CNN'] , 
                                                                                            sentences_test = data['sentences_test_CNN'],
                                                                                            seq_input_len = data['seq_input_len'],
                                                                                            store_keras_tokenizer= store_keras_tokenizer, 
                                                                                            remove_class_0 = remove_class_0, 
                                                                                            make_all_other_classes_1 =make_all_other_classes_1, 
                                                                                            output_path_vectorizer = output_path_vectorizer, 
                                                                                            timestamp = timestamp)
        if use_tfidf_as_embedding_weights:
        
            data['embedding_matrix'], data['embedding_dim']  = modelling.tfidf_as_embedding_weights(num_words = data['num_words'], 
                                                                        corpus = corpus, 
                                                                        sentences_train = data['sentences_train_CNN'])
        
        elif use_glove_pretrained_embeddings_weights:
            
            data['embedding_matrix'], data['embedding_dim'] = modelling.fit_pretrained_embedding_space_glove(embedding_dim = data['embedding_dim'], 
                                                                                filepath = data['filepath'] , 
                                                                                vocab = data['vocab'])

    if running_SVM: 

        data['X_train_SVM'], data['X_test_SVM'], data['vocab_size'], data['vocab'] = modelling.tfidf_tokenizer(num_words = data['num_words'],
                                                                                            corpus = corpus,
                                                                                            sentences_train = data['sentences_train_SVM'],
                                                                                            sentences_test = data['sentences_test_SVM'], 
                                                                                            timestamp = timestamp, 
                                                                                            output_path_vectorizer = output_path_vectorizer,
                                                                                            store_tfidf_tokenizer = store_tfidf_tokenizer, 
                                                                                            remove_class_0 = remove_class_0, 
                                                                                            make_all_other_classes_1 = make_all_other_classes_1
                                                                                            )

    if use_tfidf_as_embedding_weights: 
        
        data['embedding_matrix'], data['embedding_dim'] = modelling.tfidf_as_embedding_weights(num_words = data['num_words'], 
                                                                                                    corpus = data['corpus'], 
                                                                                                    sentences_train_CNN = data['X_train_CNN'])

    
    if use_glove_pretrained_embeddings_weights:  
        
        data['embedding_matrix'] = modelling.create_embedding_matrix(
                             filepath = file_path_glove,
                             word_index = data['vocab'], 
                             embedding_dim = data['embedding_dim'])

    # data_pre = modelling.data_vectorization(sentences_train_CNN = data['sentences_train_CNN'], 
    #                     sentences_test_CNN = data['sentences_test_CNN'], 
    #                     sentences_train_SVM = data['sentences_train_SVM'], 
    #                     sentences_test_SVM = data['sentences_test_SVM'], 
    #                     num_words = data['num_words'], 
    #                     seq_input_len = data['seq_input_len'], 
    #                     filepath = data['filepath'],
    #                     corpus = corpus,
    #                     vocab = data['vocab'],
    #                     embedding_dim = data['embedding_dim'],
    #                     running_CNN = running_CNN, 
    #                     running_SVM = running_SVM, 
    #                     use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
    #                     use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights, 
    #                     timestamp = timestamp, 
    #                     output_path_vectorizer = output_path_vectorizer, 
    #                     store_tfidf_tokenizer = store_tfidf_tokenizer)

    # Concat two dictionaries
    #data = {**data, **data_pre}

    # Add final parameters
    param_grid['embedding_matrix'] = ([data['embedding_matrix']]) 
    param_grid['output_label'] = [data['output_label']]
    param_grid['corpus'] = corpus

    return data, param_grid


def train_SVM(data,C, kernel, degree, gamma, class_weight,
            sent_tokenizer, 
            use_nltk_cleaning, 
            text_cleaning, 
            use_tfidf_tokenizer, 
            use_keras_tokenizer, 
            use_pretrained_embeddings,
            use_glove_pretrained_embeddings_weights,
            use_tfidf_as_embedding_weights,
            imbalanced_classes,
            make_all_other_classes_1,
            remove_class_0, 
            store_SVM_model, 
            timestamp,
            output_path_models, 
            output_path_parameters,
            seed):

    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier

    if imbalanced_classes: 
        
        if make_all_other_classes_1: 

            SVM = svm.SVC(C = C, 
                kernel = kernel,
                degree = degree, 
                gamma = gamma,
                class_weight = class_weight)


        if remove_class_0:

            SVM = svm.SVC(C = C, 
                kernel = kernel,
                degree = degree, 
                gamma = gamma,
                class_weight = class_weight)

            

        if not(make_all_other_classes_1 and remove_class_0) and class_weight :

            SVM = svm.SVC(C = C, 
                kernel = kernel,
                degree = degree, 
                gamma = gamma,
                class_weight = class_weight)    

        else: 

            SVM = svm.SVC(C = C, 
                    kernel = kernel,
                    degree = degree, 
                    gamma = gamma,
                    )

    # Fit SVM Model
    SVM.fit(data['X_train_SVM'], data['Y_train_SVM'])

    if store_SVM_model:

        file_name = f'SVM'

        if remove_class_0:
            file_name = f'SVM_1234'

        if make_all_other_classes_1:
            file_name = f'SVM_01'

        print("make_all_other_classes_1: ",  make_all_other_classes_1)
        print("remove_class_0 :" , remove_class_0)

        postprocessing.store_to_pickle(data = SVM, 
                        output_path = output_path_models, 
                        timestamp = timestamp , 
                        file_name = file_name  )


    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(data['X_test_SVM'])

    # Use accuracy_score function to get the accuracy
    data['test_acc'] = np.round( accuracy_score(predictions_SVM, data['Y_test_SVM'])*100 , 4)

    print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, data['Y_test_SVM'])*100)

    # Print Confusion Matrix
    Pred_Y = SVM.predict(data['X_train_SVM'])
    data['conf_matrix'] = confusion_matrix(data['Y_test_SVM'], predictions_SVM)/len(predictions_SVM)

    # Calculate Label Accuracy
    data['label_acc'] = postprocessing.cal_label_accuracy(data['conf_matrix'], verbose  = 1)

    postprocessing.write_results_txt_SVM( output_file = output_path_parameters,  
                      timestamp = timestamp, 
                      test_acc = data['test_acc'] , 
                      label_acc = data['label_acc'], 
                      sent_tokenizer = sent_tokenizer, 
                      use_nltk_cleaning = use_nltk_cleaning, 
                      text_cleaning = text_cleaning , 
                      use_tfidf_tokenizer = use_tfidf_tokenizer, 
                      use_keras_tokenizer = use_keras_tokenizer, 
                      use_pretrained_embeddings = use_pretrained_embeddings,
                      use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
                      use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
                      imbalanced_classes = imbalanced_classes,
                      make_all_other_classes_1 = make_all_other_classes_1,
                      remove_class_0 = remove_class_0,
                      C = C ,
                      kernel = kernel,
                      degree = degree, 
                      gamma = gamma,
                      class_weight = class_weight, 
                      seed = seed)



def train_CNN(data, param_grid,
            sent_tokenizer, 
            use_nltk_cleaning, 
            text_cleaning, 
            use_tfidf_tokenizer, 
            use_keras_tokenizer, 
            use_pretrained_embeddings,
            use_glove_pretrained_embeddings_weights,
            use_tfidf_as_embedding_weights):

    #----------------------------------------------------------------#
    # Run CNN with Hyperparameter Optimization
    #----------------------------------------------------------------#

    model_output = modelling.hyperparameter_optimization( 
                                    X_train = data['X_train_CNN'], 
                                    Y_train = data['Y_train_CNN'], 
                                    X_test = data['X_test_CNN'], 
                                    Y_test = data['Y_test_CNN'] , 
                                    epochs = data['epochs'] , 
                                    batch_size = data['batch_size'],
                                    param_grid = param_grid,
                                    cv = data['cv'], 
                                    n_iter = data['n_iter'],
                                    verbose = False)

    # 5. Score Analysis

    # Generate Confusion Matrix
    conf_matrix = confusion_matrix(model_output['Y_pred'], data['Y_test_CNN'].argmax(axis=1)) / len(model_output['Y_pred'])

    # Calculate Label Accuracy
    model_output['label_acc'] = postprocessing.cal_label_accuracy(conf_matrix, verbose  = 1)

    # 6. Write Results to text file
    postprocessing.write_results_txt_CNN(output_file = data['output_file'],
                best_train_acc = model_output['best_train_acc'], 
                best_train_param = model_output['best_train_param'],
                test_acc = model_output['test_acc'], 
                label_acc = model_output['label_acc'] , 
                sent_tokenizer = sent_tokenizer, 
                use_nltk_cleaning = use_nltk_cleaning, 
                text_cleaning = text_cleaning , 
                use_tfidf_tokenizer = use_tfidf_tokenizer, 
                use_keras_tokenizer = use_keras_tokenizer, 
                use_pretrained_embeddings = use_pretrained_embeddings,
                use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
                use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
                epochs = data['epochs'],
                batch_size = data['batch_size'],
                num_words = data['num_words'], 
                cv = data['cv'] ,
                n_iter = data['n_iter']
    )



def train_bayes(data):

    Naive = naive_bayes.MultinomialNB()
    Naive.fit(data['X_train_SVM'], data['Y_train_SVM'])

    # predict the labels on validation dataset
    predictions_NB = Naive.predict(Test_X_Tfidf)

    # Use accuracy_score function to get the accuracy
    print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

    # Print Confusion Matrix
    Pred_Y = Naive.predict(Test_X_Tfidf)
    confusion_matrix(Test_Y, Pred_Y)/len(Pred_Y)


## Parameters

In [3]:
config = ConfigDict.read('config/config_param.yml')


In [4]:
config

{'params': {'model': {'running_CNN': True, 'running_SVM': True, 'seed': 123},
  'input_data': {'data': 'data/ML_data_2.0.xlsx',
   'sep': ',',
   'read_type': 'excel'},
  'output_data': {'output_path_vectorizer': 'results/vectorizer/',
   'output_path_model': 'results/model/',
   'output_parameters': 'results/parameters/',
   'store_tfidf_tokenizer': True,
   'store_keras_tokenizer': True,
   'store_SVM_model': True,
   'store_CNN_model': True},
  'tokenization_options': {'sent_tokenizer': False,
   'use_nltk_cleaning': True,
   'text_cleaning': False,
   'use_tfidf_tokenizer': True,
   'use_keras_tokenizer': True,
   'use_pretrained_embeddings': False,
   'use_glove_pretrained_embeddings_weights': True,
   'use_tfidf_as_embedding_weights': False,
   'imbalanced_classes': True,
   'make_all_other_classes_1': True,
   'remove_class_0': True},
  'CNN': {'epochs': 30,
   'batch_size': 10,
   'num_words': 5000,
   'cv': 4,
   'n_iter': 5,
   'seq_input_len': 40,
   'embedding_dim': 40,
   

## Run Model

In [5]:

# ----------------------------------------------------------------#
# 0. Parameters
# ----------------------------------------------------------------#

# Define Timestamp to store models and vectorizer
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# output paths 
output_path_vectorizer = config['params']['output_data']['output_path_vectorizer']
output_path_models = config['params']['output_data']['output_path_model']
output_path_parameters = config['params']['output_data']['output_parameters']

# Storage options
store_tfidf_tokenizer = config['params']['output_data']['store_tfidf_tokenizer']
store_keras_tokenizer = config['params']['output_data']['store_keras_tokenizer']
store_SVM_model = config['params']['output_data']['store_SVM_model']

# Set seed
seed = config['params']['model']['seed']
np.random.seed(seed)

# Current Date
current_time = datetime.now().strftime("%d-%m-%Y_%H_%M_%S")

# Input data
input_data = config['params']['input_data']['data']
read_type  = config['params']['input_data']['read_type']

# Models to run
running_CNN = config['params']['model']['running_CNN']
running_SVM = config['params']['model']['running_SVM']

# To test for only clases 0 and 1
make_all_other_classes_1 =  config['params']['tokenization_options']['make_all_other_classes_1']
remove_class_0 = config['params']['tokenization_options']['remove_class_0']


# Sentence Tokenizer
sent_tokenizer = config['params']['tokenization_options']['sent_tokenizer'] # TODO: Adjust for input to CNN

# Text Cleaning Options
use_nltk_cleaning = config['params']['tokenization_options']['use_nltk_cleaning']
text_cleaning = config['params']['tokenization_options']['text_cleaning']

# Word Tokenizer Options
use_tfidf_tokenizer = config['params']['tokenization_options']['use_tfidf_tokenizer'] # For SVM
use_keras_tokenizer = config['params']['tokenization_options']['use_keras_tokenizer'] # For CNN

# If set to FALSE then keras embedding space training is used instead
# Embedding Space possibilites are GloVe or TFIDF
use_pretrained_embeddings = config['params']['tokenization_options']['use_pretrained_embeddings']

# Only if use_pretrained_embeddings == True then select embedding vector space type
use_glove_pretrained_embeddings_weights = config['params']['tokenization_options']['use_glove_pretrained_embeddings_weights']
use_tfidf_as_embedding_weights = config['params']['tokenization_options']['use_tfidf_as_embedding_weights']

# Options for SVM
imbalanced_classes = config['params']['tokenization_options']['imbalanced_classes']
C = config['params']['SVM']['C']
kernel = config['params']['SVM']['kernel']
degree = config['params']['SVM']['degree']
gamma = config['params']['SVM']['gamma']

# Define Class Weights as empty dict
class_weight = {}


# Dictionary which will cotain all the model's variables
data = {}

# Initialize Model
data['epochs'] = config['params']['CNN']['epochs'] # NO. of optimization runs
data['batch_size'] = config['params']['CNN']['batch_size'] # No. of sentences batch to train
data['num_words'] = config['params']['CNN']['num_words'] # No. of words to use in the embedding space of GloVe or TFIDF
data['cv'] = config['params']['CNN']['cv'] # No. of Cross Validations
data['n_iter'] = config['params']['CNN']['n_iter'] # No. of Iterations
data['seq_input_len'] = config['params']['CNN']['seq_input_len'] # Length of the vector sentence ( no. of words per sentence)
data['embedding_dim'] = config['params']['CNN']['embedding_dim'] # Length of the word vector ( dimension in the embedding space)
data['nodes_hidden_dense_layer'] = config['params']['CNN']['nodes_hidden_dense_layer'] # No. of nodes for hidden Dense layer


data['filepath'] = config['params']['CNN']['filepath_GloVe'] # File path to GLoVe pretrained embedding words

# Hyperparameters for CNN
param_grid = dict(num_filters_cv = config['params']['CNN']['grid_search']['num_filters_cv'], # No of filter to use in convolution
                kernel_size_cv = config['params']['CNN']['grid_search']['kernel_size_cv'], # No of words to check per Convolution 
                vocab_size = config['params']['CNN']['grid_search']['vocab_size'], # Vocab size if keras embedding space training is wanted
                embedding_dim = config['params']['CNN']['grid_search']['embedding_dim'], 
                seq_input_len = config['params']['CNN']['grid_search']['seq_input_len'], 
                nodes_hidden_dense_layer = config['params']['CNN']['grid_search']['nodes_hidden_dense_layer'],
                use_pretrained_embeddings = config['params']['CNN']['grid_search']['use_pretrained_embeddings']
                )
# Small Test
param_grid = dict(num_filters_cv = [(64,16)],
                    kernel_size_cv = [(2,3)],
                    vocab_size = [5000], 
                    embedding_dim = [50],
                    seq_input_len = [50], 
                    nodes_hidden_dense_layer = [5],
                    use_pretrained_embeddings = [config['params']['CNN']['grid_search']['use_pretrained_embeddings']])


## Case 1: Model for Labels 0 and make 1, 2, 3, 4 equal to 1

In [6]:
config['params']['CNN']

{'epochs': 30,
 'batch_size': 10,
 'num_words': 5000,
 'cv': 4,
 'n_iter': 5,
 'seq_input_len': 40,
 'embedding_dim': 40,
 'nodes_hidden_dense_layer': 5,
 'filepath_GloVe': 'D:/Semillero Data Science/Deep Learning/pre-trained Word Embeddings/GloVe/glove.6B.50d.txt',
 'grid_search': {'num_filters_cv': [[64, 16], [64, 32], [128, 16], [128, 32], [256, 64], [256, 32], [256, 64], [512, 128], [512, 32]],
  'kernel_size_cv': [[2, 3], [2, 4], [3, 4], [3, 5]],
  'vocab_size': [3000, 4000, 5000, 6000],
  'embedding_dim': [20, 30, 40, 50],
  'seq_input_len': [50, 40, 30, 20, 10],
  'nodes_hidden_dense_layer': [5, 10, 15, 20, 40],
  'use_pretrained_embeddings': [True, False]}}

In [8]:
# Preprocess data
if make_all_other_classes_1:

    remove_class_0_case_1 = False

    class_weight = {0: config['params']['SVM']['class_weights_2']['0'],
                    1: config['params']['SVM']['class_weights_2']['1']}

    data, param_grid = preprocess(data_dir = config['params']['input_data']['data'], 
                            data = data, 
                            param_grid = param_grid, 
                            read_type = config['params']['input_data']['read_type'], 
                            sep = config['params']['input_data']['read_type'],
                            remove_class_0 = remove_class_0_case_1,
                            make_all_other_classes_1 = make_all_other_classes_1, 
                            running_CNN = running_CNN, 
                            running_SVM = running_SVM, 
                            timestamp = timestamp, 
                            output_path_vectorizer = output_path_vectorizer, 
                            store_tfidf_tokenizer = store_tfidf_tokenizer, 
                            store_keras_tokenizer = store_keras_tokenizer,
                            file_path_glove = data['filepath'])



        # Train and Calculate Accuracy for SVM
    # if running_SVM:

    #     train_SVM(data = data, 
    #         C = C, 
    #         kernel = kernel, 
    #         degree = degree, 
    #         gamma = gamma, 
    #         class_weight = class_weight,
    #         sent_tokenizer = sent_tokenizer, 
    #         use_nltk_cleaning = use_nltk_cleaning, 
    #         text_cleaning = text_cleaning, 
    #         use_tfidf_tokenizer = use_tfidf_tokenizer, 
    #         use_keras_tokenizer = use_keras_tokenizer, 
    #         use_pretrained_embeddings = use_pretrained_embeddings,
    #         use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
    #         use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
    #         imbalanced_classes = imbalanced_classes,
    #         make_all_other_classes_1 = make_all_other_classes_1,
    #         remove_class_0 = remove_class_0_case_1, 
    #         seed = seed, 
    #         store_SVM_model = store_SVM_model,
    #         timestamp = timestamp,
    #         output_path_models = output_path_models, 
    #         output_path_parameters = output_path_parameters)


    # Train and Calculate Accuracy for CNN
    if running_CNN:
        
        train_CNN(data = data, 
            param_grid = param_grid,
            sent_tokenizer = sent_tokenizer, 
            use_nltk_cleaning = use_nltk_cleaning, 
            text_cleaning = text_cleaning, 
            use_tfidf_tokenizer = False, 
            use_keras_tokenizer = use_keras_tokenizer, 
            use_pretrained_embeddings = use_pretrained_embeddings,
            use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
            use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights)



 The unique labels are  [0 1]
Creating Model...
Selecting Parameters...
Evaluating Model...


KeyError: 1557

In [37]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from tensorflow.python.client import device_lib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV

from postprocessing import cal_label_accuracy, store_to_pickle

from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from tensorflow.python.client import device_lib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV

from postprocessing import cal_label_accuracy, store_to_pickle

In [62]:
num_filters_cv = param_grid['num_filters_cv'][0]
kernel_size_cv = param_grid['kernel_size_cv'][0]
vocab_size = param_grid['vocab_size'][0]
embedding_dim = param_grid['embedding_dim'][0]
embedding_matrix = param_grid['embedding_matrix'][0]
seq_input_len = param_grid['seq_input_len'][0]
output_label = param_grid['output_label'][0]
nodes_hidden_dense_layer = param_grid['nodes_hidden_dense_layer'][0]
use_pretrained_embeddings = param_grid['use_pretrained_embeddings'][0][0]


In [77]:
param_grid['embedding_matrix'][0]

array([[ 0.33932   ,  0.46586001,  0.75602001, ..., -0.26905999,
        -1.21819997,  0.44033   ],
       [-0.66845   , -0.68803   , -0.12052   , ..., -0.39627999,
        -1.07819998,  0.20907   ],
       [ 0.13523   , -0.24144   ,  0.58442003, ...,  0.17660999,
        -0.58127999, -0.63757998],
       ...,
       [ 0.068305  , -0.85210001,  0.1919    , ...,  0.25569999,
         1.12349999,  0.56221002],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [64]:
CNN_model = modelling.create_model(num_filters_cv, kernel_size_cv, vocab_size, embedding_dim, embedding_matrix,
                 seq_input_len, output_label, nodes_hidden_dense_layer, use_pretrained_embeddings)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 40)            158840    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 49, 64)            5184      
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 47, 16)            3088      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 85        
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 12        
Total params: 167,209
Trainable params: 8,369
Non-trainable params: 158,840
____________________________________________

In [65]:
data['batch_size']

10

In [66]:
%time

history = CNN_model.fit(data['X_train_CNN'], data['Y_train_SVM'],
                    epochs = data['epochs'],
                    verbose = False,
                    validation_data = (data['X_test_CNN'], data['Y_test_CNN']),
                    batch_size = data['batch_size'])

loss, accuracy = model.evaluate(data['X_train_CNN'], data['Y_train_SVM'], verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(data['X_test_CNN'], data['Y_test_CNN'], verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Wall time: 0 ns


ValueError: in user code:

    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\engine\training.py:756 train_step
        y, y_pred, sample_weight, regularization_losses=self.losses)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:203 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\losses.py:152 __call__
        losses = call_fn(y_true, y_pred)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\losses.py:256 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\losses.py:1537 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\keras\backend.py:4833 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    C:\Users\User\Anaconda3\envs\class\lib\site-packages\tensorflow\python\framework\tensor_shape.py:1134 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 1) and (None, 2) are incompatible


In [33]:
data['verbose']

KeyError: 'verbose'

In [34]:
model = KerasClassifier(build_fn = modelling.create_model,
                        epochs = data['epochs'], 
                        batch_size = data['batch_size'],
                        verbose = False)

print("Selecting Parameters...")

# Make Random Search Cross Validation
grid = RandomizedSearchCV(estimator = model, 
                            param_distributions = param_grid,
                            cv = data['cv'], 
                            n_iter = data['n_iter'],
                            verbose = False)



Selecting Parameters...


In [35]:
print("Evaluating Model...")
# Fit Selected Model with Random Parameters
grid_result = grid.fit(data['X_train_CNN'], data['Y_train_CNN'], verbose = False)

Evaluating Model...


KeyError: 453

array([[1026,    3,   17, ...,    0,    0,    0],
       [  31, 1028, 1029, ...,    0,    0,    0],
       [1031,   10,  124, ...,    0,    0,    0],
       ...,
       [ 724,   64,  725, ...,    0,    0,    0],
       [ 486, 1267, 1834, ...,    0,    0,    0],
       [  52, 1798, 1341, ...,    0,    0,    0]])

array([[1, 0],
       [1, 0],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [23]:
param_grid

{'num_filters_cv': [(64, 16)],
 'kernel_size_cv': [(2, 3)],
 'vocab_size': [5000],
 'embedding_dim': [50],
 'seq_input_len': [50],
 'nodes_hidden_dense_layer': [5],
 'use_pretrained_embeddings': [True],
 'embedding_matrix': [[]],
 'output_label': [2],
 'corpus':                                                    text  label  label_orignal
 0     stable way business life many corporate purcha...      0              0
 1     dozens companies already learned supply demand...      0              0
 2     capabilities profitable international business...      0              0
 3       almost every kind manufacturer answer questions      0              0
 4         companies already responded growing pressures      0              0
 ...                                                 ...    ...            ...
 2001  twostep process buying leverage established pr...      1              2
 2002  main products noncritical category office supp...      1              1
 2003  key question respect

In [24]:
use_pretrained_embeddings

False

In [None]:
X_train = data['X_train_CNN'], 
                                    Y_train = data['Y_train_CNN'], 
                                    X_test = data['X_test_CNN'], 
                                    Y_test = data['Y_test_CNN'] , 
                                    epochs = data['epochs'] , 
                                    batch_size = data['batch_size'],
                                    param_grid = param_grid,
                                    cv = data['cv'], 
                                    n_iter = data['n_iter'],
                                    verbose = False)

## Case 2: Model for 1, 2, 3, 4  and exclude 0. Convert 1, 2, 3 , 4 to 0, 1, 2, 3

In [12]:

if remove_class_0:

    make_all_other_classes_1_case_2 = False
    
    # Remember that without the 0 , the other labels are reindexed
    class_weight = {0: config['params']['data']['SVM']['class_weights_1_2_3_4']['0'],
                    1: config['params']['data']['SVM']['class_weights_1_2_3_4']['1'],
                    2: config['params']['data']['SVM']['class_weights_1_2_3_4']['2'],
                    3: config['params']['data']['SVM']['class_weights_1_2_3_4']['3']}


    data, param_grid = preprocess(data_dir = config['params']['input_data']['data'], 
                            data = data, 
                            param_grid = param_grid, 
                            read_type = config['params']['input_data']['read_type'], 
                            sep = config['params']['input_data']['read_type'],
                            remove_class_0 = remove_class_0,
                            make_all_other_classes_1 = make_all_other_classes_1_case_2, 
                            running_CNN = running_CNN, 
                            running_SVM = running_SVM, 
                            timestamp = timestamp, 
                            output_path_vectorizer = output_path_vectorizer, 
                            store_tfidf_tokenizer = store_tfidf_tokenizer, 
                            file_path_glove = data['filepath'])

        # Train and Calculate Accuracy for SVM
    if running_SVM:

        train_SVM(data = data, 
            C = C, 
            kernel = kernel, 
            degree = degree, 
            gamma = gamma, 
            class_weight = class_weight,
            sent_tokenizer = sent_tokenizer, 
            use_nltk_cleaning = use_nltk_cleaning, 
            text_cleaning = text_cleaning, 
            use_tfidf_tokenizer = use_tfidf_tokenizer, 
            use_keras_tokenizer = use_keras_tokenizer, 
            use_pretrained_embeddings = use_pretrained_embeddings,
            use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
            use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
            imbalanced_classes = imbalanced_classes,
            make_all_other_classes_1 = make_all_other_classes_1_case_2,
            remove_class_0 = remove_class_0, 
            seed = seed, 
            store_SVM_model = store_SVM_model,
            timestamp = timestamp,
            output_path_models = output_path_models, 
            output_path_parameters = output_path_parameters)


    # Train and Calculate Accuracy for CNN
    # if running_CNN:
        
    #     train_CNN(data = data, 
    #         param_grid = param_grid,
    #         sent_tokenizer = sent_tokenizer, 
    #         use_nltk_cleaning = use_nltk_cleaning, 
    #         text_cleaning = text_cleaning, 
    #         use_tfidf_tokenizer = use_tfidf_tokenizer, 
    #         use_keras_tokenizer = use_keras_tokenizer, 
    #         use_pretrained_embeddings = use_pretrained_embeddings,
    #         use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
    #         use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights)


 The unique labels are  [3 0 2 1]
make_all_other_classes_1:  False
remove_class_0 : True
SVM Accuracy Score ->  71.85185185185186
Accuracy for label 0 :  78.26  %
Accuracy for label 1 :  60.0  %
Accuracy for label 2 :  80.0  %
Accuracy for label 3 :  70.27  %
Writting results...
Running SVM Modeling 
  
            Seed : 123

            Test Accuracy : 71.8519

            C : 1.0

            kernel : linear

            degree : 3
 
            gamma : auto

            class_weight : {0: 1, 1: 1, 2: 1, 3: 1}

            sent_tokenizer : False 
   
            use_nltk_cleaning: True
 
            text_cleaning: False
  
            make_all_other_classes_1: False
  
            remove_class_0: True 

            use_tfidf_tokenizer: True
 
            use_keras_tokenizer: False
 
            use_pretrained_embeddings: False
 
            use_glove_pretrained_embeddings_weights: False
 
            use_tfidf_as_embedding_weights: False
 
            imbalanced_classes: True
 
    

## Case 3: Model for all classes

In [13]:
if True:

    remove_class_0_case_3 = False
    make_all_other_classes_1_case_3 = False

    if config['params']['data']['SVM']['use_class_weights']:

        class_weight = {0: config['params']['data']['SVM']['class_weights']['0'],
                    1: config['params']['data']['SVM']['class_weights']['1'],
                    2: config['params']['data']['SVM']['class_weights']['2'],
                    3: config['params']['data']['SVM']['class_weights']['3'],
                    4: config['params']['data']['SVM']['class_weights']['4']}



        data, param_grid = preprocess(data_dir = config['params']['input_data']['data'], 
                            data = data, 
                            param_grid = param_grid, 
                            read_type = config['params']['input_data']['read_type'], 
                            sep = config['params']['input_data']['read_type'],
                            remove_class_0 = remove_class_0_case_3,
                            make_all_other_classes_1 = make_all_other_classes_1_case_3, 
                            running_CNN = running_CNN, 
                            running_SVM = running_SVM, 
                            timestamp = timestamp, 
                            output_path_vectorizer = output_path_vectorizer, 
                            store_tfidf_tokenizer = store_tfidf_tokenizer, 
                            file_path_glove = data['filepath'])

            # Train and Calculate Accuracy for SVM
        if running_SVM:

            train_SVM(data = data, 
                    C = C, 
                    kernel = kernel, 
                    degree = degree, 
                    gamma = gamma, 
                    class_weight = class_weight,
                    sent_tokenizer = sent_tokenizer, 
                    use_nltk_cleaning = use_nltk_cleaning, 
                    text_cleaning = text_cleaning, 
                    use_tfidf_tokenizer = use_tfidf_tokenizer, 
                    use_keras_tokenizer = use_keras_tokenizer, 
                    use_pretrained_embeddings = use_pretrained_embeddings,
                    use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
                    use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights,
                    imbalanced_classes = imbalanced_classes,
                    make_all_other_classes_1 = make_all_other_classes_1_case_3,
                    remove_class_0 = remove_class_0_case_3, 
                    seed = seed, 
                    store_SVM_model = store_SVM_model,
                    timestamp = timestamp,
                    output_path_models = output_path_models, 
                    output_path_parameters = output_path_parameters)


        # Train and Calculate Accuracy for CNN
        # if running_CNN:
            
        #     train_CNN(data = data, 
        #         param_grid = param_grid,
        #         sent_tokenizer = sent_tokenizer, 
        #         use_nltk_cleaning = use_nltk_cleaning, 
        #         text_cleaning = text_cleaning, 
        #         use_tfidf_tokenizer = use_tfidf_tokenizer, 
        #         use_keras_tokenizer = use_keras_tokenizer, 
        #         use_pretrained_embeddings = use_pretrained_embeddings,
        #         use_glove_pretrained_embeddings_weights = use_glove_pretrained_embeddings_weights,
        #         use_tfidf_as_embedding_weights = use_tfidf_as_embedding_weights)




 The unique labels are  [0 4 1 3 2]
make_all_other_classes_1:  False
remove_class_0 : False
SVM Accuracy Score ->  63.745019920318725
Accuracy for label 0 :  58.97  %
Accuracy for label 1 :  57.69  %
Accuracy for label 2 :  63.64  %
Accuracy for label 3 :  92.86  %
Accuracy for label 4 :  84.85  %
Writting results...
Running SVM Modeling 
  
            Seed : 123

            Test Accuracy : 63.745

            C : 1.0

            kernel : linear

            degree : 3
 
            gamma : auto

            class_weight : {0: 0.05, 1: 1, 2: 1, 3: 1, 4: 1}

            sent_tokenizer : False 
   
            use_nltk_cleaning: True
 
            text_cleaning: False
  
            make_all_other_classes_1: False
  
            remove_class_0: False 

            use_tfidf_tokenizer: True
 
            use_keras_tokenizer: False
 
            use_pretrained_embeddings: False
 
            use_glove_pretrained_embeddings_weights: False
 
            use_tfidf_as_embedding_weights: Fal

In [14]:
new_sentence  = pd.DataFrame({'text':["raw materials purchase is decisive for the company", 'framework agreement master contract preferred', 'twostep process                                        buying leverage established provoking leverage portfolio matrix' , 'companys top management promptly ordered change purchasing policy build alternative domestic sources'],
                            'label':[1,0,2,3]})

new_sentence

Unnamed: 0,text,label
0,raw materials purchase is decisive for the com...,1
1,framework agreement master contract preferred,0
2,twostep process ...,2
3,companys top management promptly ordered chang...,3


In [15]:
timestamp

'2021-01-17_22-37-56'

In [16]:
# Test Case 1

loaded_TFIDF_tokenizer = postprocessing.load_pickle(output_path = output_path_vectorizer , 
                                    timestamp = timestamp , 
                                    file_name = 'TFIDF_vectorizer')

SVM_model =  postprocessing.load_pickle(output_path = output_path_models , 
                        timestamp = timestamp , 
                        file_name = 'SVM_01') 

postprocessing.classify_new_sentences(new_sentence = new_sentence , model = SVM_model, vectorizer = loaded_TFIDF_tokenizer )

array([1, 1, 1, 1], dtype=int64)

In [18]:
# Test Case 2

loaded_TFIDF_tokenizer = postprocessing.load_pickle(output_path = output_path_vectorizer , 
                                    timestamp = timestamp , 
                                    file_name = 'TFIDF_vectorizer_1234')

SVM_model =  postprocessing.load_pickle(output_path = output_path_models , 
                        timestamp = timestamp , 
                        file_name = 'SVM_1234') 

postprocessing.classify_new_sentences(new_sentence = new_sentence , model = SVM_model, vectorizer = loaded_TFIDF_tokenizer )

array([3, 1, 1, 2], dtype=int64)

In [19]:
# Test Case 3

loaded_TFIDF_tokenizer = postprocessing.load_pickle(output_path = output_path_vectorizer , 
                                    timestamp = timestamp , 
                                    file_name = 'TFIDF_vectorizer')

SVM_model =  postprocessing.load_pickle(output_path = output_path_models , 
                        timestamp = timestamp , 
                        file_name = 'SVM') 

postprocessing.classify_new_sentences(new_sentence = new_sentence , model = SVM_model, vectorizer = loaded_TFIDF_tokenizer )

array([1, 2, 2, 3], dtype=int64)