### Import Libraries

In [14]:
import warnings
warnings.filterwarnings("ignore")
import nltk
import pandas as pd 
import numpy as np
import io
import os
import json
import re
import numbers
import xgboost as xgb
import lightgbm as lgb
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Conv2D, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional
from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras import layers, Input, Model
from classes.Accuracy import Accuracy
from classes.ModelBuilder import ModelBuilder
from sklearn import preprocessing
from sklearn.utils import resample, shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import *
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
# Relax pandas' display limits: cell text up to 600 characters, and no cap on
# the number of columns or rows rendered.
for option_name, option_value in (
    ('display.max_colwidth', 600),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

import mlflow

### Select Classifier

In [15]:
# Model constants.
max_features = 20000
embedding_dim = 128      # width of each embedding vector (used by baseline_model)
sequence_length = 500
vocab_size = 30982       # number of rows in the embedding table (used by baseline_model)
max_length = 5890  # originally derived from len(tfidf_vect.get_feature_names())


def baseline_model(num_classes=352):
    """Build and compile the baseline Keras text classifier.

    Architecture: Embedding -> Bidirectional LSTM(128) -> Dense(32, relu)
    -> Dropout(0.5) -> Dense(num_classes, softmax), compiled with
    categorical cross-entropy, the Adam optimizer and an accuracy metric.

    Parameters
    ----------
    num_classes : int, default 352
        Number of units in the softmax output layer. It must match the
        number of distinct target labels seen during training.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    # NOTE(review): no pre-trained weights are supplied here, so with
    # trainable=False the embedding stays at its random initialisation.
    # Confirm this is intended.
    # NOTE(review): an Embedding layer consumes integer token ids, while the
    # training cell requests vect_type='TfidfVectorizer'. Confirm the features
    # ModelBuilder produces are compatible with this network.
    model.add(Embedding(vocab_size, embedding_dim, trainable=False))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(32, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

  and should_run_async(code)


In [16]:
# Estimator under evaluation. Exactly one `classifier = ...` line should be
# active; the alternatives below are kept as a quick-switch menu.
#
#   classifier = xgb.XGBClassifier(n_jobs=-1, verbose=2)
#   classifier = SVC(verbose=2)
#   classifier = LinearSVC(verbose=2)
#   classifier = RandomForestClassifier(n_jobs=-1, verbose=2)
#   classifier = BalancedRandomForestClassifier(n_jobs=-1, verbose=2)
#   classifier = KNeighborsClassifier(n_jobs=-1)
#   classifier = lgb.LGBMClassifier()
#   classifier = DecisionTreeClassifier()
classifier = KerasClassifier(batch_size=32, build_fn=baseline_model)

# ---------------------------------------------------------------
# Stacking / voting ensemble: uncomment this whole section to use it.
# ---------------------------------------------------------------
# # base (level-0) models
# level0 = [
#     ('knc', KNeighborsClassifier()),
#     ('lsvc', LinearSVC()),
#     ('svm', SVC(C=10, gamma=0.1)),
# ]
# # meta learner (level-1) model
# level1 = LogisticRegression()
# # stacking ensemble
# classifier = StackingClassifier(estimators=level0, final_estimator=level1, verbose=2)
# # voting ensemble (alternative)
# #classifier = VotingClassifier(estimators=level0, verbose=2)

### Helper Code

In [17]:
def add_mock_data(category, copies=10):
    """Oversample one intent category by duplicating its rows in ``df_train``.

    Every row of the module-level ``df_train`` whose ``Intencion_cat_label``
    equals ``category`` is appended ``copies`` more times, and the frame's
    index is rebuilt as 0..n-1.

    Parameters
    ----------
    category : int
        Value of ``Intencion_cat_label`` to duplicate.
    copies : int, default 10
        How many extra copies of the matching rows to append.

    Returns
    -------
    None
        The global ``df_train`` is rebound to the enlarged frame.
    """
    global df_train
    category_rows = df_train[df_train['Intencion_cat_label'] == category]
    if category_rows.empty or copies <= 0:
        # Nothing to duplicate; still renumber the index, as the append-based
        # version did through ignore_index=True.
        df_train = df_train.reset_index(drop=True)
        return
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_train = pd.concat([df_train] + [category_rows] * copies, ignore_index=True)

### Get Info from CSV

In [18]:
def _print_shapes(title):
    """Print a dashed banner around ``title`` followed by the train/test frame shapes."""
    rule = '-' * len(title)
    print(rule)
    print(title)
    print(rule)
    print('Train Shape: ' + str(df_train.shape))
    print('Test Shape: ' + str(df_test.shape))


# Load the labelled training questions ('|'-separated) and the unlabelled test
# questions. The shuffles are seeded so a fresh kernel reproduces the same row order.
columns = ['Pregunta', 'Intencion']
df_train = shuffle(pd.read_csv('data/train.csv', usecols=columns, sep='|'), random_state=42)
df_test = shuffle(pd.read_csv('data/test_santander.csv', usecols=['id','Pregunta']), random_state=42)

# The numeric label is whatever follows the first four characters of 'Intencion'.
df_train['Intencion_cat_label'] = df_train['Intencion'].str[4:].astype('int32')

_print_shapes('Before Resample: ')

# Duplicate the rows of the under-represented categories.
for minority_category in (104, 11, 13, 15, 24, 205):
    add_mock_data(minority_category)

_print_shapes('After Resample: ')

-----------------
Before Resample: 
-----------------
Train Shape: (20104, 3)
Test Shape: (6702, 2)
----------------
After Resample: 
----------------
Train Shape: (20254, 3)
Test Shape: (6702, 2)


In [19]:
# Raw question text (features) and integer intent labels (target) as NumPy arrays.
X, y = (df_train[column_name].values for column_name in ('Pregunta', 'Intencion_cat_label'))

  and should_run_async(code)


In [20]:
# Number of training rows per intent label, ordered by label.
grouped = (
    df_train
    .groupby('Intencion_cat_label')
    .count()
    .sort_values(by='Intencion_cat_label', ascending=True)
)
# Labels with fewer than 5 examples. The original cell computed this filter and
# discarded it; keep it in a named frame so it can be inspected.
grouped_minority = grouped[grouped['Intencion'] < 5]
grouped

### Train/Test Split

In [21]:
# Stratified 80/20 hold-out split. random_state is fixed so that
# Restart & Run All reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [22]:
#X_train = X_train.reshape(1,-1)
#y_train = y_train.reshape(1,-1)

  and should_run_async(code)


In [23]:
# Train, evaluate and record one experiment as a single MLflow run.
# NOTE(review): X_train / X_test / y_train / y_test are rebound to the values
# returned by GenerateTrainedModel, so re-running this cell without re-running
# the split cell feeds those returned values back in.
with mlflow.start_run():
    mlflow.log_param('Classifier', type(classifier).__name__)

    cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=42)
    modelBuilder = ModelBuilder()
    resampling = True
    mlflow.log_param('resampling', resampling)

    (optimized_model, model_best_params,
     X_train, X_test, y_train, y_test) = modelBuilder.GenerateTrainedModel(
        classifier, X_train, X_test, y_train, y_test,
        cv=cv, resampling=resampling, vect_type='TfidfVectorizer',
    )

    pred = optimized_model.predict(X_test)

    # Balanced accuracy is the average of the recall obtained on each class,
    # which makes it suitable for imbalanced binary and multiclass problems.
    # With adjusted=False the best value is 1 and the worst is 0.
    balanced_accuracy_score = Accuracy.get_balanced_accuracy_score(y_test, pred)
    accuracy_score = Accuracy.get_accuracy_score(y_test, pred)
    for metric_name, metric_value in (
        ("balanced_accuracy_score", balanced_accuracy_score),
        ("accuracy_score", accuracy_score),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Record every tuned hyper-parameter, then the fitted model itself.
    for param_name in model_best_params:
        mlflow.log_param(param_name, model_best_params[param_name])
    mlflow.sklearn.log_model(optimized_model, "model")

Preprocessing data...
Resampling data...
Training Model...
Fitting 6 folds for each of 1 candidates, totalling 6 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    6.5s remaining:    6.5s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    6.5s finished


InvalidArgumentError: indices[3] = [0,2449] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse]

### Save Accuracy to CSV File

In [24]:
# Append this run's scores as one row of the running accuracy log.
ACCURACY_FILENAME = 'data/accuracy.csv'
columns = ['model_name', 'accuracy_score', 'balanced_accuracy_score', 'model_best_params']
data = dict(
    model_name=classifier.__class__.__name__,
    accuracy_score=accuracy_score,
    balanced_accuracy_score=balanced_accuracy_score,
    # wrapped in a list so the params dict becomes a single cell
    model_best_params=[model_best_params],
)
df_accuracy = pd.DataFrame(data=data, columns=columns)

# First write creates the file with a header row; later writes append rows only.
fileexists = os.path.isfile(ACCURACY_FILENAME)
mode, header = ('a', False) if fileexists else ('w', True)
df_accuracy.to_csv(ACCURACY_FILENAME, mode=mode, header=header, columns=columns, index=False, sep=',')
df_accuracy

  and should_run_async(code)


NameError: name 'accuracy_score' is not defined

### Classification Report

In [25]:
# Build the classification report for the hold-out predictions.
# Depends on `y_test` and `pred` from the training cell, so it fails with a
# NameError whenever that cell did not finish.
# NOTE(review): the return type of get_classification_report is not visible here - confirm in classes/Accuracy.
report = Accuracy.get_classification_report(y_test, pred)

NameError: name 'pred' is not defined

### Save Test Predictions to CSV

In [26]:
# Predict an intent for every question in the unlabelled test set.
test_questions = df_test['Pregunta'].values
pred_det = optimized_model.predict(test_questions)
df_test['Intencion'] = pred_det

# Write the submission as headerless "id,Intencion" rows, in a file named
# after the classifier class.
SUBMIT_FILE = 'data/submit_{}.csv'.format(classifier.__class__.__name__)
df_test[['id', 'Intencion']].to_csv(SUBMIT_FILE, mode='w', header=False, index=False, sep=',')

df_test.sample(3)

NameError: name 'optimized_model' is not defined