In [16]:
#import relevant packages
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import ElasticNet, LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
#cargar datos
url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
df = pd.read_csv(url)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
#ver tipos de datos y detectar valores nulos
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
#comparar frecuencias de valores de variable objetivo
conteo_outcome = df['Outcome'].value_counts(normalize=True)
conteo_outcome

Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64

In [5]:
#comparar frecuencias de valores de variable 'Pregnancies'
df['Pregnancies'].value_counts()

Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
17      1
15      1
Name: count, dtype: int64

In [63]:
#separar variables predictoras y variable objetivo
y=df['Outcome']
X = df.drop(['Outcome'], axis=1)

In [64]:
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]

vif_data

Unnamed: 0,feature,VIF
0,Pregnancies,3.275748
1,Glucose,16.725078
2,BloodPressure,14.619512
3,SkinThickness,4.008696
4,Insulin,2.063689
5,BMI,18.408884
6,DiabetesPedigreeFunction,3.195626
7,Age,13.492985


In [65]:
#separar registros en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# #grilla de decision tree
# def grid_dt(X_train, y_train):
#     model = DecisionTreeClassifier(random_state=42)
#     class_weight =  [{0:0.65, 1:0.35}, None, 'balanced']
#     criterion = ['gini', 'entropy', 'log_loss']
#     max_depth = [2,3,5,7,10,20,30,40]
#     min_samples_split = [40, 50, 60, 70, 80, 90, 100]
#     min_samples_leaf = [8, 10, 15, 20, 25, 30, 35]
#     max_leaf_nodes = [2,5,10,20]
#     ccp_alpha = [0.001, 0.01, 0.1, 1, 10]
#     splitter = ['best', 'random']
#     grid = dict(criterion=criterion,
#                 class_weight= class_weight,
#                 max_depth = max_depth,
#                 min_samples_split = min_samples_split,
#                 min_samples_leaf = min_samples_leaf,
#                 splitter=splitter,
#                 ccp_alpha = ccp_alpha,
#                 max_leaf_nodes = max_leaf_nodes
#                 )
#     cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=42)
#     grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
#                            scoring='roc_auc',error_score='raise')
#     grid_result = grid_search.fit(X_train, y_train)
#     return  grid_result.best_estimator_

In [85]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

def grid_dt(X_train, y_train):
    # Initialize the model
    model = DecisionTreeClassifier(random_state=42)

    # Define the hyperparameter grid
    param_grid = {
        'class_weight': [{0: 0.65, 1: 0.35}, None, 'balanced'],
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 3, 5, None],  # None allows the tree to grow fully
        'min_samples_split': [80, 90, 100, 120],
        'min_samples_leaf': [30, 40, 50],
        'ccp_alpha': [0.001, 0.01, 0.1],  # Pruning parameter
        'max_features': [3, 5, 8]
    }

    # Set up cross-validation
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=42)

    # Perform grid search
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        error_score='raise'
    )

    # Fit the grid search
    grid_search.fit(X_train, y_train)

    # Return the best estimator
    return grid_search.best_estimator_

In [86]:
#identificar valores de mejor modelo
best_dt = grid_dt(X_train, y_train)
best_dt

In [87]:
#prediccion con mejor modelo
y_pred_dt = best_dt.predict(X_test)
y_pred_dt

array([1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0])

In [89]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.83      0.69      0.75        99
           1       0.57      0.75      0.65        55

    accuracy                           0.71       154
   macro avg       0.70      0.72      0.70       154
weighted avg       0.74      0.71      0.71       154

