In [1]:
# Modulos a cargar
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Data: hotel.csv, read directly from the public GitHub repository
df = pd.read_csv("https://raw.githubusercontent.com/robintux/Datasets4StackOverFlowQuestions/master/hotel.csv")
# DataFrame.info() writes its report to stdout and returns None, so it is called
# bare: wrapping it in print() would append a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [2]:
# Documentation of the DecisionTreeClassifier class
help(DecisionTreeClassifier)

Help on class DecisionTreeClassifier in module sklearn.tree._classes:

class DecisionTreeClassifier(sklearn.base.ClassifierMixin, BaseDecisionTree)
 |  DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
 |  
 |  A decision tree classifier.
 |  
 |  Read more in the :ref:`User Guide <tree>`.
 |  
 |  Parameters
 |  ----------
 |  criterion : {"gini", "entropy", "log_loss"}, default="gini"
 |      The function to measure the quality of a split. Supported criteria are
 |      "gini" for the Gini impurity and "log_loss" and "entropy" both for the
 |      Shannon information gain, see :ref:`tree_mathematical_formulation`.
 |  
 |  splitter : {"best", "random"}, default="best"
 |      The strategy used to choose the split at each node. Supported
 |      strategies are "best" to 

In [4]:
# Management decision: drop every column that contains missing values.
# The columns are found by testing for nulls directly rather than by taking a
# hard-coded "top 4" of the null ranking, so the step stays correct if the
# dataset ends up with more (or fewer) incomplete columns.
ColumnasEliminar = df.columns[df.isnull().any()]
data = df.drop(columns = ColumnasEliminar)
# data : all variables without missing values

# Keep only the quantitative variables
data1 = data.select_dtypes(include = ["float64", "int64"])
# data1 : all variables of quantitative nature

# Independent variables (X) and the dependent variable (y)
y = data1["is_canceled"]
X = data1.drop(columns = "is_canceled")

In [8]:
# Function that fits a decision-tree model; its input is the value assigned to
# the `criterion` argument of the DecisionTreeClassifier class.
def Estabilidad_DT(criterio, train_size = 0.90, random_state = None):
  """Run one train/test experiment with a decision tree on the global X, y.

  The data is split with stratification on y, a DecisionTreeClassifier is
  fitted on the training part and evaluated on both parts.

  Parameters
  ----------
  criterio : str
      Value passed as `criterion` to DecisionTreeClassifier.
  train_size : float, default=0.90
      Value passed as `train_size` to train_test_split.
  random_state : int or None, default=None
      Seed forwarded to both the split and the tree. With the default None
      every call draws a new random split, which is what a stability study
      over repeated calls relies on; pass an int to reproduce a single run.

  Returns
  -------
  tuple (float, float)
      (training score as returned by `score`, i.e. on a 0-1 scale,
       test accuracy multiplied by 100).
  """
  # Stratified partition
  Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size = train_size,
                                                  stratify = y,
                                                  random_state = random_state)

  # Instantiate and fit the model on the training subset
  ModelDT_Base = DecisionTreeClassifier(criterion = criterio,
                                        random_state = random_state)
  ModelDT_Base.fit(Xtrain, ytrain)

  # For a classifier, score() is the mean accuracy (not an R^2), hence the name
  Acc_Train_DT_Base = ModelDT_Base.score(Xtrain, ytrain)

  # KPI: accuracy on the held-out subset, expressed as a percentage
  y_pronostico_dt_base = ModelDT_Base.predict(Xtest)
  Acc_DT_Base = metrics.accuracy_score(ytest, y_pronostico_dt_base)*100

  return (Acc_Train_DT_Base, Acc_DT_Base)

In [9]:
# Quick check: a single run with the "gini" criterion
Estabilidad_DT("gini")

(0.9896604033466417, 80.69352542088953)

In [None]:
arg_criterion = ["gini", "entropy", "log_loss"]

# One inner list per criterion, stored in the same order as arg_criterion
ListaR2_c = []
ListaAcc_c = []

for c in arg_criterion:
  # 1500 independent runs for this criterion; each run yields a
  # (training score, test accuracy) pair
  corridas = [Estabilidad_DT(c) for _ in range(1500)]
  # Transpose the list of pairs into one list per indicator
  ListaR2_Crit, ListaAcc_crit = (list(serie) for serie in zip(*corridas))
  ListaR2_c.append(ListaR2_Crit)
  ListaAcc_c.append(ListaAcc_crit)

# ListaR2_c, ListaAcc_c : lists made of 3 lists, each one holding 1500 elements.
# TODO: compute the variance of each of the lists inside ListaR2_c and ListaAcc_c

In [10]:
# Once the analysis in the previous cell is finished,
# and being aware of the values these quality indicators can take,
# conclude that the results obtained are not satisfactory enough.

# Next step: perform a hyperparameter sweep

# Load the class that will carry out the hyperparameter sweep
from sklearn.model_selection import GridSearchCV


In [None]:
# Hyperparameters of DecisionTreeClassifier that will be tuned
# (see help(DecisionTreeClassifier)):
  # criterion : ["gini", "entropy", "log_loss"]
  # splitter  : ["best", "random"]
  # max_depth : int
  # ccp_alpha : non-negative float, default=0.0

# Seed shared by the split and the estimator so the sweep can be reproduced
SEMILLA = 42

# Hyperparameter grid
DictHP_DT = {
    # "log_loss" is left out: according to the class documentation it measures
    # the same Shannon information gain as "entropy", so it would only repeat
    # every fit already done for "entropy".
    "criterion" : ["gini", "entropy"],
    "splitter" : ["best", "random"],
    "max_depth" : [5,10,15,25,30,45,55,75,99, 120],
    # Cost-complexity pruning strength. For a two-class target the impurity is
    # at most 0.5 (gini) or 1 (entropy), so an alpha above that prunes the tree
    # down to its root; the former linspace(0.0001, 20, 200) therefore spent
    # nearly all of its 200 points on the same single-node tree. The grid now
    # covers "no pruning" (0.0) plus a log-spaced range of small alphas.
    "ccp_alpha" : np.concatenate(([0.0], np.logspace(-5, -1, 20)))
}

# Base model
Mod_DT = DecisionTreeClassifier(random_state = SEMILLA)

# Configure the hyperparameter sweep
Mod_DT_GS = GridSearchCV(estimator = Mod_DT,
                         param_grid = DictHP_DT,
                         cv = 4,
                         scoring = "accuracy",
                         verbose = 1,  # progress summary only; 3 logs every single fit
                         n_jobs=-1)

# Heavy step: fitting the grid search
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, train_size = 0.90,stratify = y,
                                                random_state = SEMILLA)
# fit() returns the fitted search object itself, so Historia_GS and Mod_DT_GS
# refer to the same object
Historia_GS = Mod_DT_GS.fit(Xtrain,ytrain)