In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
import matplotlib.gridspec as gridspec

from sklearn import neighbors, datasets
from sklearn import naive_bayes
from sklearn import svm
from sklearn import tree

from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn import model_selection
from sklearn.preprocessing import (
    KBinsDiscretizer,
    LabelEncoder,
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    PowerTransformer,
    RobustScaler,
    StandardScaler,
)
    

# Datos

In [4]:
df1 = pd.read_csv('tp-2020-2c-train-cols1.csv')
df2 = pd.read_csv('tp-2020-2c-train-cols2.csv')

In [5]:
df = (pd.merge(left=df2, right=df1, how="left", on="id_usuario", validate="one_to_one"))

In [90]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tipo_de_sala   801 non-null    object 
 1   nombre         801 non-null    object 
 2   id_usuario     801 non-null    int64  
 3   genero         801 non-null    object 
 4   edad           641 non-null    float64
 5   amigos         801 non-null    int64  
 6   parientes      801 non-null    int64  
 7   id_ticket      801 non-null    object 
 8   precio_ticket  801 non-null    int64  
 9   fila           177 non-null    object 
 10  nombre_sede    799 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 69.0+ KB


# Preprocesamiento

In [91]:
def onehot(df, columna):
    newdf = df.copy(deep=True)
    
    ohe = OneHotEncoder(drop='first') #Para mitigar problemas de colinealidad en estimadores lineales
    enc = ohe.fit(df[[columna]].astype(str))
    cat = list(enc.categories_)
    
    enc = enc.transform(df[[columna]].astype(str)).todense().astype(int)
    enc = pd.DataFrame(enc)
    cat = np.delete(cat, 0)
    
    enc.columns = cat
    return enc

def imputar(df):
    imputer = SimpleImputer()
    return imputer.fit_transform(df)

def escalar(df):
    scaler = StandardScaler()
    return scaler.fit_transform(df)

def prepro_1(df):
    return df

def prepro_2(df):
    return df

def prepro_3(data, targets, columnas_a_encodear):
    newdf = df.copy(deep=True)
    
    for col in columnas_a_encodear:
        encoded = onehot(data, col)
        newdf = pd.concat([newdf, encoded], axis=1)
        newdf.drop(labels=col, axis=1, inplace=True)
    
    newdf.drop(labels=['nombre', 'id_ticket'], axis=1, inplace=True)
    newdf = imputar(newdf)
    newdf = escalar(newdf)
    _targets = targets['volveria'].to_numpy(copy=True)
    
    return newdf, _targets

In [122]:
df_X, df_y = prepro_3(df2, df1, ['tipo_de_sala','genero','fila', 'nombre_sede'])
X_train, X_test, y_train, y_test = model_selection.train_test_split(df_X, df_y, test_size=0.4, random_state=42)

# Entrenando modelos

### 3.1 KNN

Entrenamiento con valores por defecto.

In [138]:
clf_knn = neighbors.KNeighborsClassifier()
clf_knn.fit(X_train, y_train)
pred = clf_knn.predict(X_test)
score = (y_test == pred)
score.mean()

0.9906542056074766

Busqueda de hiperparámetros mediante GridSearch

In [139]:
knn_grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights' : ['distance', 'uniform'],
    'metric' : ['manhattan', 'euclidean']
}

gs = model_selection.GridSearchCV(
    clf_knn,
    knn_grid_params,
    verbose = 3,
    cv = 3,
    n_jobs = -1
)

In [140]:
gs_results = gs.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  41 out of  48 | elapsed:    1.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    1.5s finished


In [141]:
gs_results.best_score_

0.9833333333333334

In [142]:
gs_results.best_estimator_

KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance')

In [143]:
gs_results.best_params_

{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

In [144]:
scores = model_selection.cross_val_score(clf_knn, df_X, df_y, cv=5)
scores

array([0.97515528, 0.975     , 0.99375   , 1.        , 0.99375   ])

### 3.2 Naive Bayes

#### 3.2.1 Gaussiano

In [94]:
clf_nb_gauss = naive_bayes.GaussianNB()
clf_nb_gauss.fit(X_train, y_train)
pred = clf_nb_gauss.predict(X_test)
score = (y_test == pred)
score.mean()

1.0

#### 3.2.2 Multinomial

In [145]:
nb_mult_alpha = 1.0
clf_nb_mult = naive_bayes.MultinomialNB()
#clf_nb_mult.fit(X_train, y_train)
#pred = clf_nb_mult.predict(X_test)
score = (y_test == pred)
score.mean()

0.9906542056074766

#### 3.2.3 Complemento

In [146]:
nb_comp_alpha = 1.0
clf_nb_comp = naive_bayes.ComplementNB()
#clf_nb_comp.fit(X_train, y_train)
#pred = clf_nb_comp.predict(X_test)
score = (y_test == pred)
score.mean()

0.9906542056074766

#### 3.2.4 Bernoulli

Entendemos que como los datos no siguen una distribucion bernoulli multivariada, el metodo no aplica.

### 3.3 SVM

#### 3.3.1 Comun

Entrenamiento con valores por defecto

In [147]:
clf_svc = svm.SVC()
clf_svc.fit(X_train, y_train)
pred = clf_svc.predict(X_test)
score = (y_test == pred)
score.mean()

0.9906542056074766

Busqueda de hiperparametros mediante GridSearch

In [162]:
svc_grid_params = {
    'kernel': ['rbf','linear'],
    'C' : [0.1, 1, 10]
}

gs = model_selection.GridSearchCV(
    clf_svc,
    svc_grid_params,
    verbose = 3,
    cv = 3,
    n_jobs = -1
)

In [163]:
gs_results = gs.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.1s finished


In [164]:
gs_results.best_params_

{'C': 0.1, 'kernel': 'linear'}

In [165]:
gs_results.best_score_

1.0

In [108]:
scores = model_selection.cross_val_score(clf_svc, df_X, df_y, cv=5)
scores

array([0.99378882, 1.        , 1.        , 1.        , 1.        ])

#### 3.3.2 Lineal

In [98]:
clf_linear_svc = svm.LinearSVC()
clf_linear_svc.fit(X_train, y_train)
pred = clf_linear_svc.predict(X_test)
score = (y_test == pred)
score.mean()

1.0

In [109]:
scores = model_selection.cross_val_score(clf_linear_svc, df_X, df_y, cv=5)
scores

array([1., 1., 1., 1., 1.])

### 3.4 Regresor Lineal (Logistic Regression)

In [99]:
from sklearn.linear_model import LogisticRegression
clf_logreg = LogisticRegression()
clf_logreg.fit(X_train, y_train)
pred = clf_logreg.predict(X_test)
score = (y_test == pred)
score.mean()

1.0

In [110]:
scores = model_selection.cross_val_score(clf_logreg, df_X, df_y, cv=5)
scores

array([1., 1., 1., 1., 1.])

### 3.5 Decision Tree

In [100]:
clf_tree = tree.DecisionTreeClassifier(random_state=0, max_depth=2)
clf_tree.fit(X_train, y_train)
pred = clf_tree.predict(X_test)
score = (y_test == pred)
score.mean()

1.0

In [111]:
scores = model_selection.cross_val_score(clf_tree, df_X, df_y, cv=5)
scores

array([1., 1., 1., 1., 1.])

### 3.6 Redes Neuronales

In [4]:
from tensorflow import keras
from tensorflow.keras import layers

### 3.7 Ensambles

In [77]:
from sklearn import ensemble

#### 3.7.1 RandomForest

In [101]:
random_forest = ensemble.RandomForestClassifier()
random_forest.fit(X_train, y_train)
pred = random_forest.predict(X_test)
score = (y_test == pred)
score.mean()

1.0

#### 3.7.2 Gradient Boost