In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
import matplotlib.gridspec as gridspec

from sklearn import neighbors, datasets
from sklearn import naive_bayes
from sklearn import svm
from sklearn import tree

from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn import model_selection
from sklearn.preprocessing import (
    KBinsDiscretizer,
    LabelEncoder,
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    PowerTransformer,
    RobustScaler,
    StandardScaler,
)
    

# Datos

In [145]:
# Read the two CSV exports: `data` holds the per-user columns, `targets` is
# the table later joined onto it by `id_usuario`.
data, targets = (
    pd.read_csv('tp-2020-2c-train-cols2.csv'),
    pd.read_csv('tp-2020-2c-train-cols1.csv'),
)

In [112]:
# Left-join the targets onto the feature table by user id; `validate` makes
# pandas raise if `id_usuario` is not unique on both sides.
df = data.merge(
    targets,
    how="left",
    on="id_usuario",
    validate="one_to_one",
)

In [113]:
# `df2` is not defined in any cell above, so this line fails on a fresh kernel.
# The 11-column summary recorded for this cell (no `volveria` column) matches
# the raw feature table `data`, so inspect that frame instead.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tipo_de_sala   801 non-null    object 
 1   nombre         801 non-null    object 
 2   id_usuario     801 non-null    int64  
 3   genero         801 non-null    object 
 4   edad           641 non-null    float64
 5   amigos         801 non-null    int64  
 6   parientes      801 non-null    int64  
 7   id_ticket      801 non-null    object 
 8   precio_ticket  801 non-null    int64  
 9   fila           177 non-null    object 
 10  nombre_sede    799 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 69.0+ KB


In [62]:
# Count missing values per column of the merged frame.
df.isna().sum()

tipo_de_sala       0
nombre             0
id_usuario         0
genero             0
edad             160
amigos             0
parientes          0
id_ticket          0
precio_ticket      0
fila             624
nombre_sede        2
volveria           0
dtype: int64

# Preprocesamiento

In [89]:
# Settings read by the encoder / scaler / selector factory functions.
encoding_params = {
    'onehot_drop': 'first',  # forwarded to OneHotEncoder(drop=...)
}
scaling_params = {
    'standard_withmean': False,  # forwarded to StandardScaler(with_mean=...)
}
selection_params = {
    'vt_threshold': 0,  # intended threshold for the VarianceThreshold selector
    # RFE needs a real estimator object (one exposing coef_ or
    # feature_importances_ after fit). The previous placeholder string
    # 'estimator' could be stored but would fail as soon as RFE was fitted.
    'rfe_estimator': tree.DecisionTreeClassifier(random_state=0),
}

# Categorical columns that the preprocessing step one-hot encodes.
columnas_a_encodear = ['tipo_de_sala', 'genero', 'nombre_sede']

In [78]:
# Encoders
def OrdinalEncoderWrapper(encoding_params):
    """Build a default OrdinalEncoder; `encoding_params` is accepted but not read."""
    return OrdinalEncoder()


def OneHotWrapper(encoding_params):
    """Build a OneHotEncoder whose `drop` policy is `encoding_params['onehot_drop']`."""
    drop_policy = encoding_params['onehot_drop']
    return OneHotEncoder(drop=drop_policy)


def LabelEncoderWrapper(encoding_params):
    """Build a default LabelEncoder; `encoding_params` is accepted but not read."""
    return LabelEncoder()


# Registry of encoder instances, looked up by short name. Each instance is
# created once here, so every lookup returns the same object.
encoders = dict(
    ordinal=OrdinalEncoderWrapper(encoding_params),
    label=LabelEncoderWrapper(encoding_params),
    onehot=OneHotWrapper(encoding_params),
)

# Scalers
def StandardScalerWrapper(scaling_params):
    """Build a StandardScaler with `with_mean` set from `scaling_params['standard_withmean']`."""
    center = scaling_params['standard_withmean']
    return StandardScaler(with_mean=center)


def MinMaxScalerWrapper(scaling_params):
    """Build a default MinMaxScaler; `scaling_params` is accepted but not read."""
    return MinMaxScaler()


def RobustScalerWrapper(scaling_params):
    """Build a default RobustScaler; `scaling_params` is accepted but not read."""
    return RobustScaler()


def PowerTransformerWrapper(scaling_params):
    """Build a default PowerTransformer; `scaling_params` is accepted but not read."""
    return PowerTransformer()


def NormalizerWrapper(scaling_params):
    """Build a default Normalizer; `scaling_params` is accepted but not read."""
    return Normalizer()


# Registry of scaler instances, looked up by short name.
scalers = dict(
    standard=StandardScalerWrapper(scaling_params),
    minmax=MinMaxScalerWrapper(scaling_params),
    robust=RobustScalerWrapper(scaling_params),
    power=PowerTransformerWrapper(scaling_params),
    normalizer=NormalizerWrapper(scaling_params),
)

# Selectors
def VarianceThresholdWrapper(selection_params):
    """Build a VarianceThreshold using `selection_params['vt_threshold']`.

    The configured threshold used to be ignored. It is now forwarded, falling
    back to 0.0 (the value VarianceThreshold uses by default) when the key is
    absent.
    """
    return VarianceThreshold(threshold=selection_params.get('vt_threshold', 0.0))


def RFEWrapper(selection_params):
    """Build an RFE around `selection_params['rfe_estimator']`.

    The value is handed to RFE untouched. RFE expects an estimator object, so
    a non-estimator value is only rejected later, when the selector is fitted.
    """
    return RFE(selection_params['rfe_estimator'])


def FeatureHasherWrapper(selection_params):
    """Build a default FeatureHasher; `selection_params` is accepted but not read."""
    return FeatureHasher()


# Registry of selector instances, looked up by short name.
selectors = {
    'var_thres': VarianceThresholdWrapper(selection_params),
    'rfe': RFEWrapper(selection_params),
    'feature_hasher': FeatureHasherWrapper(selection_params),
}

In [188]:
# Helper functions
# (borrowed from the course practice notebook)
def droppear_nulos_por_columna(data):
    """Return `data` restricted to columns whose null fraction is below 30%.

    A column whose null fraction is exactly 0.30 or higher is dropped. The
    input frame is not modified.
    """
    max_null_fraction = 0.30
    null_fraction = data.isna().mean()
    keep_mask = null_fraction < max_null_fraction
    return data[null_fraction[keep_mask].index]

def droppear_filas_sin_sede(data):
    """Return a copy of `data` without the rows whose `nombre_sede` is null.

    The result is re-indexed 0..n-1. Previously the output of
    `reset_index(drop=True)` was discarded, so the returned frame kept gaps in
    its index and misaligned when concatenated with freshly built frames.
    The input frame is not modified.
    """
    sin_sede = data.loc[data['nombre_sede'].isna()].index
    return data.drop(sin_sede, inplace=False).reset_index(drop=True)
    

def prepro_1(X, y, encoder, scaler, selector):
    """Preprocess the feature table and extract the label vector.

    Steps, in order: drop columns with too many nulls, drop the identifier
    columns, mean-impute `edad`, one-hot encode the columns listed in
    `columnas_a_encodear` (only when `encoder == 'onehot'`), and scale.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain `id_ticket`, `nombre`, `id_usuario`, `edad` and the
        columns named in `columnas_a_encodear`. Not modified.
    y : pandas.DataFrame
        Must contain a `volveria` column. Not modified.
    encoder, scaler, selector : str
        Keys into the module-level `encoders`, `scalers` and `selectors`
        registries. An unknown key raises KeyError. The selector is looked up
        but not applied yet.

    Returns
    -------
    tuple
        `(X_scaled, labels)`: the scaler's output and `y['volveria']` as a
        NumPy array.

    Notes
    -----
    NOTE(review): the imputer, encoder and scaler are fitted on everything
    passed in. When this runs before the train/test split, test rows
    influence the fitted statistics.
    """
    _X = droppear_nulos_por_columna(X.copy(deep=True))
    # Non-inplace drop: avoids mutating a frame derived by column selection.
    _X = _X.drop(columns=['id_ticket', 'nombre', 'id_usuario'])
    _X[['edad']] = SimpleImputer(strategy='mean').fit_transform(_X[['edad']])

    # encoding
    _encoder = encoders[encoder]
    if encoder == 'onehot':
        for col in columnas_a_encodear:
            # Nulls become the literal category 'nan' after the str cast.
            values = _X[[col]].astype(str)
            _encoder.fit(values)
            # Assumes the encoder was built with drop='first': the first
            # category has no output column, so its label is skipped.
            kept_categories = _encoder.categories_[0][1:]
            dummies = pd.DataFrame(
                _encoder.transform(values).todense().astype(int),
                columns=kept_categories,
                # Reuse _X's index so concat pairs rows by label even when
                # _X does not carry a default 0..n-1 index.
                index=_X.index,
            )
            _X = pd.concat([_X.drop(columns=col), dummies], axis=1)

    # selection (kept as a key check only; the selector is not applied yet)
    _selector = selectors[selector]

    # scaling
    _scaler = scalers[scaler]
    _X = _scaler.fit_transform(_X)

    return _X, y['volveria'].to_numpy(copy=True)


In [189]:
# Take features and labels from the merged frame so each label is paired with
# its feature row through `id_usuario`, not through CSV row order.
df_X, df_y = prepro_1(
    df.drop(columns=['volveria']),
    df[['volveria']],
    'onehot',
    'minmax',
    'var_thres',
)
df_X

array([[0.88062327, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.39683338, 0.125     , 0.16666667, ..., 0.        , 1.        ,
        0.        ],
       [0.36778559, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.43453129, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.36778559, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.36778559, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ]])

In [195]:
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_X,
    df_y,
    test_size=0.3,
    random_state=42,
)

# Entrenando modelos

### 3.1 KNN

Entrenamiento con valores por defecto.

In [196]:
# KNN with default hyper-parameters; the cell shows the fraction of hold-out
# rows predicted correctly.
clf_knn = neighbors.KNeighborsClassifier()
pred = clf_knn.fit(X_train, y_train).predict(X_test)
score = pred == y_test
score.mean()

0.8423236514522822

Búsqueda de hiperparámetros mediante GridSearch

In [201]:
# Candidate KNN hyper-parameters (4 x 2 x 2 = 16 combinations).
knn_grid_params = dict(
    n_neighbors=[3, 5, 11, 19],
    weights=['distance', 'uniform'],
    metric=['euclidean', 'manhattan'],
)

# Exhaustive search with 3-fold cross-validation, using all available cores.
gs = model_selection.GridSearchCV(
    estimator=clf_knn,
    param_grid=knn_grid_params,
    cv=3,
    n_jobs=-1,
    verbose=3,
)

In [202]:
# Run the KNN grid search on the training split and keep the fitted search object.
gs_results = gs.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [203]:
# Best cross-validation score found by the KNN grid search.
gs_results.best_score_

0.810802522377475

In [204]:
# KNN estimator configured with the winning hyper-parameters.
gs_results.best_estimator_

KNeighborsClassifier(metric='manhattan', n_neighbors=19)

In [205]:
# Hyper-parameter combination that achieved the best score.
gs_results.best_params_

{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}

In [206]:
# 5-fold cross-validation of the default-parameter KNN (`clf_knn`, not the
# grid-search winner) over the full preprocessed dataset.
scores = model_selection.cross_val_score(
    estimator=clf_knn,
    X=df_X,
    y=df_y,
    cv=5,
)
scores

array([0.7826087, 0.7875   , 0.83125  , 0.8125   , 0.8125   ])

### 3.2 Naive Bayes

#### 3.2.1 Gaussiano

In [207]:
# Gaussian naive Bayes with default settings; the cell shows hold-out accuracy.
clf_nb_gauss = naive_bayes.GaussianNB()
pred = clf_nb_gauss.fit(X_train, y_train).predict(X_test)
score = pred == y_test
score.mean()

0.6473029045643154

#### 3.2.2 Multinomial

In [208]:
# Multinomial naive Bayes; the cell shows hold-out accuracy.
# `nb_mult_alpha` was defined but never handed to the model. It is now passed
# as the smoothing parameter, so editing the variable changes the model.
nb_mult_alpha = 1.0
clf_nb_mult = naive_bayes.MultinomialNB(alpha=nb_mult_alpha)
clf_nb_mult.fit(X_train, y_train)
pred = clf_nb_mult.predict(X_test)
score = (y_test == pred)
score.mean()

0.7551867219917012

#### 3.2.3 Complemento

In [209]:
# Complement naive Bayes; the cell shows hold-out accuracy.
# `nb_comp_alpha` was defined but never handed to the model. It is now passed
# as the smoothing parameter, so editing the variable changes the model.
nb_comp_alpha = 1.0
clf_nb_comp = naive_bayes.ComplementNB(alpha=nb_comp_alpha)
clf_nb_comp.fit(X_train, y_train)
pred = clf_nb_comp.predict(X_test)
score = (y_test == pred)
score.mean()

0.7510373443983402

#### 3.2.4 Bernoulli

Entendemos que, como los datos no siguen una distribución Bernoulli multivariada, el método no aplica.

### 3.3 SVM

#### 3.3.1 Común

Entrenamiento con valores por defecto

In [210]:
# Support vector classifier with default settings; the cell shows hold-out accuracy.
clf_svc = svm.SVC()
pred = clf_svc.fit(X_train, y_train).predict(X_test)
score = pred == y_test
score.mean()

0.8506224066390041

Búsqueda de hiperparámetros mediante GridSearch

In [211]:
# Candidate SVC hyper-parameters (2 kernels x 3 values of C = 6 combinations).
svc_grid_params = dict(
    kernel=['rbf', 'linear'],
    C=[0.1, 1, 10],
)

# Exhaustive search with 3-fold cross-validation, using all available cores.
# Note that this rebinds `gs`, replacing the earlier KNN search object.
gs = model_selection.GridSearchCV(
    estimator=clf_svc,
    param_grid=svc_grid_params,
    cv=3,
    n_jobs=-1,
    verbose=3,
)

In [212]:
# Run the SVC grid search on the training split; this rebinds `gs_results`.
gs_results = gs.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [213]:
# Hyper-parameter combination that achieved the best score in the SVC search.
gs_results.best_params_

{'C': 10, 'kernel': 'rbf'}

In [214]:
# Best cross-validation score found by the SVC grid search.
gs_results.best_score_

0.8143196672608437

In [215]:
# 5-fold cross-validation of the default-parameter SVC (`clf_svc`, not the
# grid-search winner) over the full preprocessed dataset.
scores = model_selection.cross_val_score(
    estimator=clf_svc,
    X=df_X,
    y=df_y,
    cv=5,
)
scores

array([0.77018634, 0.825     , 0.85625   , 0.825     , 0.825     ])

#### 3.3.2 Lineal

In [216]:
# Linear SVC with default settings; the cell shows hold-out accuracy.
clf_linear_svc = svm.LinearSVC()
pred = clf_linear_svc.fit(X_train, y_train).predict(X_test)
score = pred == y_test
score.mean()

0.8174273858921162

In [217]:
# 5-fold cross-validation of the linear SVC over the full preprocessed dataset.
scores = model_selection.cross_val_score(
    estimator=clf_linear_svc,
    X=df_X,
    y=df_y,
    cv=5,
)
scores

array([0.7826087, 0.79375  , 0.8375   , 0.8      , 0.8      ])

### 3.4 Regresión Logística (Logistic Regression)

In [99]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; the cell shows hold-out accuracy.
# NOTE(review): the recorded result for this cell comes from execution
# In [99], which predates the current preprocessing cell (In [188]) and split
# (In [195]). Re-run on a fresh kernel before trusting the stored score.
clf_logreg = LogisticRegression()
pred = clf_logreg.fit(X_train, y_train).predict(X_test)
score = pred == y_test
score.mean()

1.0

In [110]:
# 5-fold cross-validation of the logistic regression over the full dataset.
# NOTE(review): the recorded output is from execution In [110], older than the
# current preprocessing cell (In [188]). Re-run before trusting it.
scores = model_selection.cross_val_score(
    estimator=clf_logreg,
    X=df_X,
    y=df_y,
    cv=5,
)
scores

array([1., 1., 1., 1., 1.])

### 3.5 Decision Tree

In [221]:
# Depth-2 decision tree with a fixed seed; the cell shows hold-out accuracy.
clf_tree = tree.DecisionTreeClassifier(max_depth=2, random_state=0)
pred = clf_tree.fit(X_train, y_train).predict(X_test)
score = pred == y_test
score.mean()

0.8215767634854771

In [222]:
# 5-fold cross-validation of the depth-2 tree over the full preprocessed dataset.
scores = model_selection.cross_val_score(
    estimator=clf_tree,
    X=df_X,
    y=df_y,
    cv=5,
)
scores

array([0.7826087, 0.75625  , 0.825    , 0.7625   , 0.79375  ])

### 3.6 Redes Neuronales

In [4]:
from tensorflow import keras
from tensorflow.keras import layers

### 3.7 Ensambles

In [219]:
from sklearn import ensemble

#### 3.7.1 RandomForest

In [220]:
# Random forest with default hyper-parameters; the cell shows hold-out accuracy.
# The forest is randomized, so it is seeded (same seed as the decision tree
# cell) to make the score repeatable across runs.
random_forest = ensemble.RandomForestClassifier(random_state=0)
random_forest.fit(X_train, y_train)
pred = random_forest.predict(X_test)
score = (y_test == pred)
score.mean()

0.8340248962655602

#### 3.7.2 Gradient Boost