#### Import Libraries

In [10]:
import warnings
warnings.filterwarnings("ignore")
import nltk
import joblib
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from googletrans import Translator
from classes.CustomTokenizer import CustomTokenizer
from classes.Accuracy import Accuracy
from classes.ModelBuilder import ModelBuilder
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.utils import resample, shuffle
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report, balanced_accuracy_score
# Notebook-wide pandas display settings: allow up to 600 characters per cell
# and pass None for the column/row display limits.
display_options = {
    'display.max_colwidth': 600,
    'display.max_columns': None,
    'display.max_rows': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

#### Get Info from CSV

In [11]:
# Load the labelled training questions and the test questions, shuffling the
# rows of each frame.
# NOTE(review): train.csv is read with '|' as separator while
# test_santander.csv relies on the pandas default; confirm both match the files.
RANDOM_SEED = 42  # fixed seed so the shuffled row order is the same on every run
columns = ['Pregunta', 'Intencion']
df_train = shuffle(
    pd.read_csv('data/train.csv', usecols=columns, sep='|'),
    random_state=RANDOM_SEED,
)
df_test = shuffle(
    pd.read_csv('data/test_santander.csv', usecols=['id', 'Pregunta']),
    random_state=RANDOM_SEED,
)

#### Helper Code

In [12]:
# Fitted encoders, keyed by the name of the column they were fitted on.
labelEncoders = dict()
def labelEncoder(df, column, params=None):
    """Fit a LabelEncoder on ``df[column]`` and return the encoded values.

    The fitted encoder is stored in the module-level ``labelEncoders`` dict
    under ``column`` so the label mapping can be looked up afterwards.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to encode.
        params: Unused; kept so existing calls keep working. Defaults to
            ``None`` rather than a shared mutable ``{}``.

    Returns:
        The encoded values produced by ``LabelEncoder.fit_transform``.
    """
    le = preprocessing.LabelEncoder()
    # fit_transform is the one-call equivalent of fit() followed by transform().
    column_encoded = le.fit_transform(df[column])
    labelEncoders[column] = le
    return column_encoded

#### Encoding categories

In [13]:
# Numeric part of the intent label: drop the first four characters of
# 'Intencion' (e.g. 'Cat_277' -> '277') and store the result as int32.
df_train['Intencion_cat_label'] = (
    df_train['Intencion']
    .str[4:]
    .astype('int32')
)
# Integer class ids produced by the labelEncoder helper.
df_train['Intencion_encoded'] = labelEncoder(df_train, 'Intencion')

#### Translations

In [16]:
translator = Translator()


def translate_sentences(sentences, src, dest, max_retries=3, retry_delay=2.0):
    """Translate every sentence with the module-level ``translator``.

    Each sentence is sent individually. A failed request is retried up to
    ``max_retries`` times with a growing pause, so that one transient network
    error does not discard a multi-hour run; the last failure is re-raised.

    Args:
        sentences: Iterable of strings to translate.
        src: Source language code passed to ``translator.translate``.
        dest: Destination language code passed to ``translator.translate``.
        max_retries: Maximum number of attempts per sentence.
        retry_delay: Base pause in seconds, multiplied by the attempt number.

    Returns:
        List with the translated text of each sentence, in input order.

    Raises:
        Exception: Whatever the translator raised on the final attempt.
    """
    import time  # local import keeps this cell self-contained

    translated = []
    for sent in tqdm(sentences):
        for attempt in range(1, max_retries + 1):
            try:
                translated.append(translator.translate(sent, src=src, dest=dest).text)
                break
            except Exception:
                # Out of attempts: surface the error instead of hiding it.
                if attempt == max_retries:
                    raise
                time.sleep(retry_delay * attempt)
    return translated


# Back-translation: Spanish -> French -> Spanish.
sentences = df_train['Pregunta'].values
translations_fr = translate_sentences(sentences, src="es", dest="fr")
print(f'Amount sentences fr: {len(translations_fr)}')

translations_es_back = translate_sentences(translations_fr, src="fr", dest="es")
print(f'Amount sentences es: {len(translations_es_back)}')

# The translations are in the same order as df_train['Pregunta'], so they can
# be attached to df_train positionally.
df_train["Pregunta_es"] = translations_es_back
translations_es_back[:2]

78%|███████▊  | 15775/20104 [2:52:50<47:26,  1.52it/s]


ReadTimeout: The read operation timed out

In [None]:
# Write the training frame to a '|'-separated CSV without the index.
# NOTE(review): header=False writes the file without column names; confirm
# that whatever reads train_preprocessed.csv does not expect a header row.
mode='w'
header=False
df_train.to_csv('data/train_preprocessed.csv', mode=mode, header=header, index=False, sep='|')

In [6]:
# Question texts and encoded intent ids as plain arrays.
X = df_train['Pregunta'].values
y = df_train['Intencion_encoded'].values

# Unigram TF-IDF over the questions, tokenized by CustomTokenizer, with
# accents stripped to ASCII and sublinear term-frequency scaling.
vect = TfidfVectorizer(
    tokenizer=CustomTokenizer(),
    sublinear_tf=True,
    analyzer='word',
    strip_accents='ascii',
    ngram_range=(1, 1),
)
x_vect = vect.fit_transform(X)
x_vect.shape

KeyboardInterrupt: 

In [7]:
# Show the training rows whose question text starts with 'aa'.
df_train[df_train['Pregunta'].str.startswith('aa')]

Unnamed: 0,Pregunta,Intencion,Intencion_cat_label,Intencion_encoded
10656,aa me sirve solamente para american airlines?,Cat_277,277,191
17616,aadvantage,Cat_277,277,191
7669,aadvantage venir resumen tarjeta credito,Cat_277,277,191


In [6]:
# Process every training question with CustomTokenizer (stemming disabled).
tokenizer = CustomTokenizer()
words = [
    tokenizer.processAll(sentence, stem=False)
    for sentence in df_train['Pregunta']
]

# Flatten the per-question results and count how often each word occurs.
freq_dist = nltk.FreqDist(np.concatenate(words, axis=0))
freq_df = pd.DataFrame(
    list(freq_dist.items()),
    columns=["Word", "Frequency"],
)

# Number of distinct words that occur exactly 1, 2, 3 or more than 3 times.
frequency = freq_df['Frequency']
print('Coincidences for 1: {}'.format((frequency == 1).sum()))
print('Coincidences for 2: {}'.format((frequency == 2).sum()))
print('Coincidences for 3: {}'.format((frequency == 3).sum()))
print('Coincidences for more than 3: {}'.format((frequency > 3).sum()))
print('FreqDist')
print(freq_df.sort_values(by='Frequency', ascending=True))

Coincidences for 1: 1314
Coincidences for 2: 722
Coincidences for 3: 326
Coincidences for more than 3: 1888
FreqDist
                    Word  Frequency
4249           observada          1
3497      exclusivamente          1
3498                cita          1
1846              debera          1
2827            hospital          1
3499            bastante          1
992              estirar          1
990             transito          1
2343            gallegos          1
2828            argarica          1
3502        consecutivas          1
3503             elevado          1
3504              rinden          1
982              cabildo          1
1852       desadherirmar          1
980               madrid          1
2830                  dj          1
3505              valida          1
3496      refinanciarlar          1
3495           respondio          1
1843             propias          1
3493          dedicarmer          1
3479               corro          1
1827               

In [9]:
from autocorrect import Speller
# Spot-check the Spanish autocorrect on 40 randomly sampled questions by
# printing each question before and after correction.
spell = Speller('es', threshold=2)
sampled_questions = df_train.sample(40)['Pregunta']
for original in sampled_questions:
    print('Before autocorrect: {}'.format(original))
    corrected = spell(original)
    print('After autocorrect: {}'.format(corrected))

#sentences = [spell(sentence) for sentence in df_train['Pregunta']]

Original number of words: 289144
After applying threshold: 289144
Before autocorrect: obtener clave hacer deja
After autocorrect: obtener clave hacer deja
Before autocorrect: cuantosbilletes de  se pueden depositar en cajero automatico?
After autocorrect: cuantosbilletes de  se pueden depositar en cajero automatico?
Before autocorrect: quiero discutir sobre consumos dudosos de mis tarjetas de credito visa
After autocorrect: quiero discutir sobre consumos dudosos de mis tarjetas de credito visa
Before autocorrect: cálculo de prestamo
After autocorrect: cálculo de prestamo
Before autocorrect: para poder transferir me pide que acepte terminos y condiciones
After autocorrect: para poder transferir me pide que acepte terminos y condiciones
Before autocorrect: no llegar tarjeta nueva diciembre recibir
After autocorrect: no llegar tarjeta nueva diciembre recibir
Before autocorrect: como puedo asociar el tokken a mi cuenta? ya tengo el ticket que me dio el cajero
After autocorrect: como puedo 

In [2]:
# Tokenize a sample question with CustomTokenizer.word_tokenize to inspect
# the tokens it yields.
tokenizer = CustomTokenizer()
tokenizer.word_tokenize('cómo ago para tener el prestamo')

['cómo', 'ago', 'para', 'tener', 'el', 'prestamo']

In [5]:
from spellchecker import SpellChecker

# Ask the Spanish SpellChecker which tokens of the sample question it does
# not recognise.
# NOTE(review): `tokenizer` is not created in this cell; it must already exist
# from a previously executed cell.
spell = SpellChecker(language='es', case_sensitive=True)
    
words = tokenizer.word_tokenize('cómo ago para tener el prestamo')
spell.unknown(words)

set()

In [4]:
# Inspect what CustomTokenizer.conjugate_verb returns for the sample question.
# NOTE(review): `tokenizer` must already exist from a previously executed cell.
tokenizer.conjugate_verb('cómo ago para tener el prestamo')

['tener']

In [None]:
# Count the non-null values of every column per intent ('Intencion'), keeping
# 'Intencion' as a regular column, and preview the first five intents.
df_categories = df_train.groupby(by='Intencion', sort=True, as_index=False).count()
df_categories.head(5)

In [None]:
# Frequency distribution of classes
#train_outcome = pd.crosstab(index=df_train["Intencion"],  # Make a crosstab
#                              columns="count")      # Name the count column


# Horizontal bar chart with one bar per intent. df_categories comes from
# groupby('Intencion').count(), so its 'Pregunta' column holds the number of
# questions per intent.
sns.catplot(y='Intencion', x='Pregunta', kind="bar", data=df_categories, aspect=1, height=20)

In [None]:
# Number of rows and columns in the training frame.
df_train.shape

In [None]:
# Question texts (X) and encoded intent ids (y) as plain arrays.
# NOTE(review): requires the 'Intencion_encoded' column, which is added by the
# "Encoding categories" cell.
X = df_train['Pregunta'].values
y = df_train['Intencion_encoded'].values

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
# TF-IDF vectorizer over unigrams and bigrams, tokenized by CustomTokenizer,
# with NLTK's Spanish stop words and accents stripped to ASCII.
# NOTE(review): strip_accents='ascii' presumably runs before stop-word
# filtering, so accented entries in the NLTK list may never match; confirm and
# normalise the list if needed.
stopwords = nltk.corpus.stopwords.words('spanish')
tokenizer = CustomTokenizer()
tfidf_vect = TfidfVectorizer(lowercase=True, 
                             stop_words=stopwords, 
                             ngram_range = (1,2), 
                             strip_accents='ascii', 
                             tokenizer=tokenizer,
                             analyzer='word',
                             sublinear_tf=True,
                             # Raw string: "\w" in a normal literal is an
                             # invalid escape sequence. The pattern itself is
                             # unchanged.
                             # NOTE(review): scikit-learn appears to ignore
                             # token_pattern when a custom tokenizer is
                             # supplied; confirm whether it can be dropped.
                             token_pattern=r"[\w']+")

In [None]:
# Preview five random training rows.
df_train.sample(5)

In [None]:
# Show every training row that belongs to one of the listed intents, ordered
# by intent name.
cat_list = [
    'Cat_10', 'Cat_100', 'Cat_101', 'Cat_102', 'Cat_103',
    'Cat_105', 'Cat_106', 'Cat_107', 'Cat_108', 'Cat_109',
]
in_selected_categories = df_train['Intencion'].isin(cat_list)
df_train.loc[in_selected_categories].sort_values(by='Intencion')

In [None]:
import joblib
import pandas as pd 
import numpy as np
from classes.CustomTokenizer import CustomTokenizer
from sklearn.utils import resample, shuffle
# Show how CustomTokenizer.processAll transforms five random training questions.
# NOTE(review): this reloads and reshuffles df_train from disk, replacing any
# columns that earlier cells added to it.
columns = ['Pregunta', 'Intencion']
tokenizer = CustomTokenizer()
df_train = shuffle(pd.read_csv('data/train.csv', usecols=columns, sep='|'))
samples = df_train.sample(5)
for sentence in samples['Pregunta'].values:
    # The raw sentence is the "before"; the processed result is the "after".
    print('before: ' + sentence)
    words = tokenizer.processAll(sentence)
    print('after: ' + str(words))

In [None]:
# Preview ten random training rows.
df_train.sample(10)

In [None]:
#import cProfile, pstats, io
#from pstats import SortKey
#pr = cProfile.Profile()
#pr.enable()

#words = []
#for index, row in df_train[:30].iterrows():
#    #print(row['Pregunta'])
#    words.append(tokenizer.processAll(row['Pregunta']))    
#df_train[:30]['words'] = words

#pr.disable()
#s = io.StringIO()
#sortby = SortKey.CUMULATIVE
#ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
#ps.print_stats()
#print(s.getvalue())

In [3]:
from sklearn.utils import shuffle
# Load the stored model scores, shuffle the rows, and display them ranked by
# balanced accuracy (best first).
accuracy_raw = pd.read_csv('data/accuracy.csv', sep=',')
df_accuracy = shuffle(accuracy_raw)
df_accuracy.sort_values(by='balanced_accuracy_score', ascending=False)

Unnamed: 0,model_name,accuracy_score,balanced_accuracy_score,model_best_params
29,BalancedRandomForestClassifier,0.695847,0.581593,{}
25,BalancedRandomForestClassifier,0.679184,0.577214,{}
18,RandomForestClassifier,0.715245,0.574226,"{'clf__class_weight': 'balanced_subsample', 'c..."
27,BalancedRandomForestClassifier,0.688386,0.571778,{}
28,BalancedRandomForestClassifier,0.674459,0.565911,{}
17,RandomForestClassifier,0.699329,0.558379,"{'clf__class_weight': 'balanced_subsample', 'c..."
19,RandomForestClassifier,0.709774,0.5583,"{'clf__class_weight': 'balanced_subsample', 'c..."
16,RandomForestClassifier,0.688137,0.548287,"{'clf__class_weight': 'balanced_subsample', 'c..."
5,SVC,0.667496,0.539431,"{'clf__C': 1000, 'clf__gamma': 0.1, 'vect__ngr..."
8,SVC,0.640637,0.533743,"{'clf__C': 1500, 'clf__class_weight': 'balance..."
