### Import Libraries

In [9]:
# (Re)load IPython's autoreload extension for this kernel.
# NOTE(review): no `%autoreload <mode>` line follows in this cell, so edits to
# the local modules imported below may not be picked up automatically —
# confirm whether `%autoreload 2` was intended.
%reload_ext autoreload

In [28]:
import warnings
warnings.filterwarnings("ignore")
import nltk
import pandas as pd 
import numpy as np
import io
import os
import json
import xgboost as xgb
import lightgbm as lgb
from CustomTokenizer import CustomTokenizer
from Accuracy import Accuracy
from ModelBuilder import ModelBuilder
from sklearn import preprocessing
from sklearn.utils import resample, shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# Notebook-wide pandas display settings: long question text is shown up to
# 600 characters per cell, and neither columns nor rows are truncated.
for _option_name, _option_value in (
    ('display.max_colwidth', 600),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(_option_name, _option_value)

### Select Classifier

In [11]:
# Estimator used for this run. It is handed to ModelBuilder.GenerateTrainedModel
# further down, and its class name is reused in the accuracy log and in the
# submission filename.
classifier = SVC()

### Load Train and Test Data from CSV

In [12]:
# Load the labelled training set ('|'-separated) and the unlabelled test set.
# Both frames are shuffled with a fixed seed so that a fresh
# "Restart Kernel -> Run All" yields the same row order (and therefore the
# same downstream split and metrics) every time.
columns = ['Pregunta', 'Intencion', 'Intencion_cat_label', 'Intencion_encoded']
df_train = shuffle(
    pd.read_csv('data/train_encoded.csv', usecols=columns, sep='|'),
    random_state=42,
)
# NOTE(review): shuffling the test set also shuffles the row order of the
# submission file written at the end of the notebook — confirm that is wanted.
df_test = shuffle(
    pd.read_csv('data/test_santander.csv', usecols=['id','Pregunta']),
    random_state=42,
)
print('Train Shape: ' + str(df_train.shape))
print('Test Shape: ' + str(df_test.shape))

Train Shape: (20104, 4)
Test Shape: (6702, 2)


### Prepare Features and Labels (Optional Upsampling)

In [13]:
# Model input is the raw question text; the target is the pre-encoded intent
# id column read from the training CSV.
X, y = df_train['Pregunta'].values, df_train['Intencion_encoded'].values

In [14]:
# Optional class balancing: when enabled, every intent with fewer rows than
# the largest intent is upsampled (with replacement) to the majority size,
# and X / y are rebuilt from the balanced frame.
enableResample = False
df_array = pd.DataFrame(columns=df_train.columns)
if enableResample:
    # Size of the most frequent class. `.max()` avoids integer-position
    # indexing (`[0]`) on a Series whose index holds string labels.
    max_value = df_train['Intencion'].value_counts().max()

    # Collect the per-class frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the frame on every iteration.
    balanced_parts = []
    for category in df_train['Intencion'].unique():
        df_cat_filter = df_train[df_train['Intencion'] == category]
        if len(df_cat_filter) < max_value:
            # Upsample the minority class up to the majority size.
            df_cat_filter = resample(
                df_cat_filter,
                replace=True,         # sample with replacement
                n_samples=max_value,  # match number in majority class
                random_state=42,      # reproducible results
            )
        balanced_parts.append(df_cat_filter)

    # Combine majority and upsampled minority classes.
    df_array = pd.concat(balanced_parts, ignore_index=True)
    X = df_array['Pregunta'].values
    y = df_array['Intencion_encoded'].values
    print(df_array.Intencion.value_counts())

### Train/Validation Split

In [15]:
# Hold out 20% of the labelled data for evaluation. A fixed random_state makes
# the split (and every metric computed from it) reproducible across re-runs.
# NOTE(review): the split is not stratified; with many rare intents some
# classes can end up with zero held-out rows. `stratify=y` would address that
# but raises if any class has a single sample — confirm before enabling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
# Force integer labels in case y arrived with a non-integer dtype.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [16]:
# Fit the selected classifier through the project's ModelBuilder; the call is
# unpacked into the fitted model and the best parameters it reports.
# NOTE(review): the held-out split is also passed to GenerateTrainedModel —
# confirm it is used there for scoring only, not for parameter selection.
modelBuilder = ModelBuilder()
optimized_model, model_best_params = modelBuilder.GenerateTrainedModel(
    classifier, X_train, X_test, y_train, y_test
)

# Predictions on the held-out split, reused by the metric and report cells.
pred = optimized_model.predict(X_test)

# Balanced accuracy is used in binary and multiclass problems to deal with
# imbalanced datasets: it is the average of the recall obtained on each class.
# The best value is 1 and the worst value is 0 when adjusted=False.
balanced_accuracy_score = Accuracy.get_balanced_accuracy_score(y_test, pred)
accuracy_score = Accuracy.get_accuracy_score(y_test, pred)

Training Model...
Finished Training Model.
Best Params for SVC: {'clf__C': 1000, 'clf__gamma': 0.1}

--------------------------------------------------------
-- Saving Best Parameters for SVC on model_best_params/SVC_best_params.json --
--------------------------------------------------------

--------------------------------------------------------
-- Summary --
--------------------------------------------------------
Training set score for SVC 0.997202
Testing  set score for SVC 0.631684
--------------------------------------------------------

--------------------------------------------------------
-- Saving model on models/SVC_model.sav --
--------------------------------------------------------

--------------------------------------------------------
-- Summary --
--------------------------------------------------------
balanced_accuracy_score: 0.48

--------------------------------------------------------
-- Summary --
--------------------------------------------------------
ac

### Save Accuracy to CSV File

In [25]:
# Append this run's scores to a running CSV log, one row per run. The file is
# created with a header row the first time and appended to afterwards.
ACCURACY_FILENAME = 'data/accuracy.csv'
columns = ['model_name', 'accuracy_score', 'balanced_accuracy_score', 'model_best_params']

data = {
    'model_name': classifier.__class__.__name__,
    'accuracy_score': accuracy_score,
    'balanced_accuracy_score': balanced_accuracy_score,
    # Wrapped in a list so the params dict is stored as a single cell value.
    'model_best_params': [model_best_params],
}
df_accuracy = pd.DataFrame(data=data, columns=columns)

fileexists = os.path.isfile(ACCURACY_FILENAME)
header = not fileexists               # write the header only for a new file
mode = 'a' if fileexists else 'w'     # append to an existing log, else create
df_accuracy.to_csv(
    ACCURACY_FILENAME,
    mode=mode,
    header=header,
    columns=columns,
    index=False,
    sep=',',
)
df_accuracy

Unnamed: 0,model_name,accuracy_score,balanced_accuracy_score,model_best_params
0,SVC,0.631684,0.482126,"{'clf__C': 1000, 'clf__gamma': 0.1}"


### Classification Report

In [26]:
# Fit a LabelEncoder on the training intent names so integer class ids can be
# mapped back to readable labels for the per-class report.
# NOTE(review): this assumes the encoder reproduces the same id -> label
# mapping as the 'Intencion_encoded' column in the training CSV — confirm.
encoder = preprocessing.LabelEncoder().fit(df_train['Intencion'])

y_test_labels = encoder.inverse_transform(y_test)
pred_labels = list(encoder.inverse_transform(pred))
report = Accuracy.get_classification_report(y_test_labels, pred_labels)


--------------------------------------------------------
-- Summary --
--------------------------------------------------------
              precision    recall  f1-score   support

       Cat_0       0.75      0.69      0.72        13
       Cat_1       0.69      0.85      0.76       103
      Cat_10       0.00      0.00      0.00         0
     Cat_100       0.13      0.33      0.19         6
     Cat_101       0.50      0.50      0.50         4
     Cat_102       1.00      1.00      1.00         6
     Cat_103       0.62      0.40      0.48        20
     Cat_105       1.00      1.00      1.00         1
     Cat_106       0.75      0.50      0.60        12
     Cat_107       1.00      0.83      0.91         6
     Cat_108       0.60      0.50      0.55         6
     Cat_109       0.33      0.11      0.17         9
     Cat_110       0.67      0.67      0.67        15
     Cat_112       0.49      0.69      0.57        58
     Cat_114       1.00      1.00      1.00         7
     C

### Save Test Predictions to CSV

In [27]:
# Predict an intent id for every test question, map it back to its label and
# write the (id, category number) pairs to a header-less submission file named
# after the classifier.
SUBMIT_FILE = 'data/submit_{}.csv'.format(classifier.__class__.__name__)

pred_det = optimized_model.predict(df_test['Pregunta'])
df_test['Intencion'] = pred_det
df_test['Intencion_cat_label'] = encoder.inverse_transform(df_test['Intencion'])
# Drop the first four characters of the label (the sample output shows labels
# of the form 'Cat_163'), leaving only the numeric part.
df_test['Intencion_cat'] = df_test['Intencion_cat_label'].str.slice(start=4)

df_test.to_csv(
    SUBMIT_FILE,
    mode='w',
    header=False,
    columns=['id','Intencion_cat'],
    index=False,
    sep=',',
)

df_test.sample(3)

Unnamed: 0,id,Pregunta,Intencion,Intencion_cat_label,Intencion_cat
6461,6461,quiero liquidar una posición en bonos,66,Cat_163,163
102,102,siguen enviando resumen de deuda cuando ya di de baja la tarjeta,15,Cat_112,112
1827,1827,darle alto tarjeta,257,Cat_337,337
