In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn import metrics, preprocessing, tree
from sklearn.metrics import  f1_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [2]:
# Ignore Warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/jorge-robledo11/Datasets/main/Datasets/Social_Network_Ads.csv')
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
data.shape

(400, 5)

In [5]:
X = data.iloc[:,[2,3]].values
y = data.iloc[:,4].values

In [6]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(X)

In [7]:
X = pd.DataFrame(X)
y = pd.DataFrame(y)

## K - Fold Cross Validation

In [8]:
# Obtenemos nuestros datos de entrenamiento y test por medio de K-Fold

kf = KFold(n_splits=5, shuffle=True, random_state=0)

for train, test in kf.split(X):

    X_train = X.iloc[train]
    y_train = y.iloc[train]
    X_test = X.iloc[test]
    y_test = y.iloc[test]

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

320
80
320
80


In [9]:
# Obtenemos nuestra grilla de parámetros

parameters = {'max_depth': [1,2,3,4,5],
              'min_samples_leaf': [1,2,3,4,5],
              'min_samples_split': [2,3,4,5],
              'criterion': ['gini','entropy']}

In [10]:
"""
1. Definimos el clasificador e instanciamos
2. Hacemos validación cruzada y sacamos la media de los resultados
"""

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()

In [11]:
"""
1. Entrenamos nuestro clasificador con validación cruzada y los parámetros conseguidos
2. Printeamos los resultados de las medias de las validaciones cruzadas de todas las combinaciones posibles de modelos 
con los parámetros
3. Tomamos el mejor modelo de los resultados obtenidos
"""

clf_cv = GridSearchCV(clf, parameters, cv=kf, n_jobs=-1, scoring='f1_macro').fit(X_train, y_train)
print(clf_cv.cv_results_['mean_test_score'])
best_model = clf_cv.best_estimator_

[0.79362428 0.79362428 0.79362428 0.79362428 0.79362428 0.79362428
 0.79362428 0.79362428 0.79362428 0.79362428 0.79362428 0.79362428
 0.79362428 0.79362428 0.79362428 0.79362428 0.79362428 0.79362428
 0.79362428 0.79362428 0.8880848  0.8880848  0.8880848  0.8880848
 0.8880848  0.8880848  0.8880848  0.8880848  0.8880848  0.8880848
 0.8880848  0.8880848  0.8880848  0.8880848  0.8880848  0.8880848
 0.8880848  0.8880848  0.8880848  0.8880848  0.88104714 0.88104714
 0.88104714 0.88104714 0.88104714 0.88104714 0.88104714 0.88104714
 0.88104714 0.88104714 0.88104714 0.88104714 0.88104714 0.88104714
 0.88104714 0.88104714 0.88104714 0.88104714 0.88104714 0.88104714
 0.87509558 0.87509558 0.87509558 0.87861577 0.87509558 0.87509558
 0.87509558 0.87861577 0.88880678 0.88880678 0.88880678 0.88880678
 0.88880678 0.88880678 0.88880678 0.88880678 0.88880678 0.88880678
 0.88880678 0.88880678 0.87264248 0.86895024 0.87949153 0.87949153
 0.87266832 0.87610833 0.87610833 0.87949153 0.86078616 0.8607861

In [12]:
# Hacemos validación cruzada con nuestro mejor modelo y sacamos la media de la mejor validación cruzada

scores = cross_val_score(best_model, X_train, y_train, cv=kf, scoring='f1_macro').mean()
scores

0.8950354039513796

In [13]:
# Entrenamos el modelo con los datos de entrenamiento y printeamos el score con los resultados tanto de train como de test

best_model.fit(X_train, y_train)
print(best_model.score(X_train, y_train))
print(best_model.score(X_test, y_test))

0.915625
0.95


In [14]:
y_pred = best_model.predict(X_test)
y_pred

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0])

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4ec84517-f553-446b-9032-1da3132bd62a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>