#### Import Libraries

In [90]:
import warnings
warnings.filterwarnings("ignore")
import nltk
import keras
import itertools
import joblib
import tensorflow as tf
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from CustomTokenizer import CustomTokenizer
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.utils import resample, shuffle
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report, balanced_accuracy_score
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Conv2D, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional
from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras import layers, Input, Model
import re
pd.set_option('display.max_colwidth', 200)

#### Get Info from CSV

In [91]:
# Load the labelled training questions (pipe-separated file) and the
# unlabelled test questions; the rows of each frame are shuffled on load.
columns = ['Pregunta', 'Intencion']
df_train = shuffle(
    pd.read_csv('train.csv', sep='|', usecols=columns)
)
df_test = shuffle(
    pd.read_csv('test_santander.csv', usecols=['id', 'Pregunta'])
)

#### Helper Code

In [92]:
def show_accuracy(y, pred):
    """Print the balanced accuracy of ``pred`` against ``y``, rounded to 2 decimals.

    Args:
        y: True labels.
        pred: Predicted labels, aligned with ``y``.
    """
    score = round(balanced_accuracy_score(y, pred), 2)
    print('balanced_accuracy_score: ' + str(score))

# Fitted encoders, keyed by the name of the column they were fitted on, so
# that predictions can later be mapped back to the original labels.
labelEncoders = dict()


def labelEncoder(df, column, params=None):
    """Fit a ``LabelEncoder`` on ``df[column]`` and return the encoded values.

    The fitted encoder is stored in the module-level ``labelEncoders`` dict
    under ``column``, replacing any encoder previously stored for that name.

    Args:
        df: DataFrame holding the column to encode.
        column: Name of the column to encode.
        params: Unused; kept so existing call sites keep working. The default
            is ``None`` rather than a shared mutable ``{}``.

    Returns:
        The encoded values produced by the fitted encoder for ``df[column]``.
    """
    le = preprocessing.LabelEncoder()
    column_encoded = le.fit_transform(df[column])
    labelEncoders[column] = le
    return column_encoded

In [93]:
# Numeric part of the intent label: drop the first four characters of each
# value (the "Cat_" prefix in the labels shown below) and cast to int32.
df_train['Intencion_cat_label'] = df_train['Intencion'].str[4:].astype('int32')
# Integer class ids produced by the label encoder for the same column.
df_train['Intencion_encoded'] = labelEncoder(df_train, 'Intencion')

In [94]:
# Frequency distribution of the intent classes.
train_outcome = pd.crosstab(
    index=df_train["Intencion"],  # one row per intent label
    columns="count",              # single column, named "count"
)

train_outcome


col_0,count
Intencion,Unnamed: 1_level_1
Cat_0,55
Cat_1,493
Cat_10,8
Cat_100,42
Cat_101,18
...,...
Cat_95,25
Cat_96,355
Cat_97,69
Cat_98,48


In [95]:
# Raw question texts (features) and their encoded intent ids (targets).
X, y = df_train['Pregunta'].values, df_train['Intencion_encoded'].values

In [96]:
# Optional class balancing. When enabled, every intent class smaller than the
# largest one is upsampled (sampling with replacement) to the size of the
# largest class, and X / y are rebuilt from the balanced frame.
df_array = pd.DataFrame(columns=df_train.columns)
enableResample = False
if enableResample:
    # Size of the largest class. `.max()` is used instead of
    # `value_counts()[0]`: the index holds string labels, so an integer key
    # depends on a positional fallback that newer pandas no longer supports.
    max_value = df_train['Intencion'].value_counts().max()
    unique_cat = df_train['Intencion'].unique()
    balanced_parts = []
    for category in unique_cat:
        df_cat_filter = df_train[df_train['Intencion'] == category]
        if len(df_cat_filter) < max_value:
            # Upsample the minority class up to the majority-class size.
            # No random_state is passed, so the draw differs between runs.
            df_cat_filter = resample(
                df_cat_filter,
                replace=True,         # sample with replacement
                n_samples=max_value,  # match number in majority class
            )
        balanced_parts.append(df_cat_filter)
    # Combine majority and upsampled minority classes in a single concat;
    # DataFrame.append was removed in pandas 2.0.
    if balanced_parts:
        df_array = pd.concat(balanced_parts, ignore_index=True)
    X = df_array['Pregunta'].values
    y = df_array['Intencion_encoded'].values
    print(df_array.Intencion.value_counts())

#### Train/Test Split and TF-IDF Vectorizer

In [97]:
# Hold out 20% of the (shuffled) samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True)

# Word-level TF-IDF over unigrams and bigrams, with Spanish stop words,
# lower-casing, ASCII accent stripping and sublinear term-frequency scaling.
stopwords = nltk.corpus.stopwords.words('spanish')
tokenizer = CustomTokenizer()
tfidf_vect = TfidfVectorizer(
    lowercase=True,
    stop_words=stopwords,
    strip_accents='ascii',
    tokenizer=tokenizer,
    ngram_range=(1, 2),
    sublinear_tf=True,
    analyzer='word',
    # Raw string: "\w" is an invalid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    # NOTE(review): scikit-learn ignores token_pattern when a tokenizer is
    # supplied, so this pattern is likely unused - confirm.
    token_pattern=r"[\w']+",
)

In [98]:
# Cast both label arrays to the platform integer dtype.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

##### Keras

In [99]:
# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500
# Fit the vectorizer only for its learned vocabulary; the transformed matrix
# is not used here, so fit() replaces the discarded fit_transform() result.
tfidf_vect.fit(X_train)
vocab_size = 30982
# Number of TF-IDF features learned from the training texts. `vocabulary_`
# has one entry per feature and exists on every scikit-learn release, whereas
# get_feature_names() was removed in scikit-learn 1.2.
max_length = len(tfidf_vect.vocabulary_)

# define baseline model
def baseline_model(num_classes=353, embedding_dim=128, lstm_units=128):
    """Build and compile the bidirectional-LSTM intent classifier.

    The embedding layer is sized from the module-level ``vocab_size`` and
    ``max_length`` values, which must be defined before this is called.

    Args:
        num_classes: Width of the softmax output layer. Defaults to 353, the
            value previously hard-coded here.
        embedding_dim: Size of each embedding vector (default 128).
        lstm_units: Units of the LSTM wrapped by ``Bidirectional``
            (default 128).

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss,
        the Adam optimizer and an accuracy metric.
    """
    model = Sequential()
    # NOTE(review): the embedding is frozen (trainable=False) although no
    # pretrained weights are passed to it - confirm this is intended.
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, trainable=False))
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Dense(32, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [100]:
# Hyper-parameter grid; keys are prefixed with the pipeline step name.
keras_param_grid = {
    'keras__epochs': [100],
}

# Bound to `keras_clf` rather than `keras` so the imported `keras` module is
# not shadowed for the rest of the session.
keras_clf = KerasClassifier(build_fn=baseline_model, batch_size=32)
# NOTE(review): TfidfVectorizer already applies TF-IDF weighting, so the
# TfidfTransformer step re-weights its output a second time - confirm this
# is intended.
pileline = Pipeline(steps=[
    ('vect', tfidf_vect),
    ('TfidfTransformer', TfidfTransformer()),
    ('keras', keras_clf),
])
# 2-fold cross-validated search; the best configuration is refit on all of
# the data passed to fit().
keras_gs = GridSearchCV(pileline, param_grid=keras_param_grid, cv=2, refit=True)

In [None]:
# Early-stopping variant, kept for reference:
#early_stop = EarlyStopping(monitor='val_loss',patience=20)
#keras_gs.fit(X_train, y_train, keras__callbacks=[early_stop])
keras_gs.fit(X_train, y_train)
optimized_keras = keras_gs.best_estimator_
# The evaluation, persistence and submission cells below refer to the fitted
# pipeline as `model`, which was never assigned; bind it here so they do not
# fail with a NameError.
model = optimized_keras
keras_gs.best_params_

In [None]:
# Report the fitted model's score on the training split, then on the
# held-out split.
train_score = model.score(X_train, y_train)
print("Training set score for Keras: %f" % train_score)
test_score = model.score(X_test, y_test)
print("Testing  set score for Keras: %f" % test_score)

In [None]:
# Predictions for the held-out split produced by train_test_split.
pred_keras = model.predict(X_test)

In [None]:
# Compute the balanced accuracy.
# Balanced accuracy is used in binary and multiclass classification problems
# to deal with imbalanced datasets. It is defined as the average of the recall
# obtained on each class.
# The best value is 1 and the worst value is 0 when adjusted=False.
show_accuracy(y_test, pred_keras)

In [None]:
# Map encoded ids back to the original intent labels, then print the
# per-class precision / recall / F1 report.
encoder = labelEncoders['Intencion']
y_test_labels, pred_keras_labels = (
    encoder.inverse_transform(y_test),
    list(encoder.inverse_transform(pred_keras)),
)
report = classification_report(y_test_labels, pred_keras_labels)
print(report)

#### Save model to disk

In [None]:
# Serialize the fitted model object to disk with joblib.
# NOTE(review): pickling an estimator that wraps a Keras model may fail
# depending on the installed Keras/TensorFlow version - confirm.
filename = 'Keras_model.sav'
joblib.dump(value=model, filename=filename)

In [None]:
# Predict an encoded intent for every test question and store the result in
# a new column of the test frame.
df_test['Intencion'] = pred_keras = model.predict(df_test['Pregunta'])

In [None]:
# Decode the predicted ids back to the original intent labels.
df_test['Intencion_cat_label'] = (
    labelEncoders['Intencion'].inverse_transform(df_test['Intencion'])
)
# Keep only what follows the first four characters of each label.
df_test['Intencion_cat'] = df_test['Intencion_cat_label'].str.slice(start=4)

In [None]:
# Display three randomly chosen rows of the test frame as a quick check.
df_test.sample(3)

In [None]:
# Write the submission file: id and predicted category, comma-separated,
# without a header row or the index column.
df_test.to_csv(
    'submit_keras.csv',
    columns=['id', 'Intencion_cat'],
    sep=',',
    header=False,
    index=False,
    mode='w',
)