In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.Class.unique()

array([0, 1])

In [4]:
df= df.drop(columns='Time')

In [6]:
df['Amount']=(df['Amount']- np.mean(df['Amount']))/np.std(df.Amount)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [7]:
# El problema es muy desbalanceado tranajaremos con una muestra para resolver el problema
df_ones=df[df['Class']==1] # Filtro de caracteristica
print(df_ones.shape)

df_zeros=df[df['Class']==0] # Filtro de NO caracteristica
df_zeros= df_zeros.sample(3*df_ones.shape[0]) # Tamaño de muestra 3 veces el de la caracteristica
print(df_zeros.shape)

# Concatenar
df_final=pd.DataFrame(np.concatenate([df_ones, df_zeros],axis=0), columns=df.columns)
print(df_final.shape)
df_final.head()

(492, 30)
(1476, 30)
(1968, 30)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,-0.353229,1.0
1,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,-0.838587,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,1.761758,1.0
2,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,-1.525412,...,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,0.606031,1.0
3,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,-4.801637,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,-0.117342,1.0
4,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,-2.447469,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,-0.349231,1.0


In [8]:
df.Class.unique()

array([0, 1])

In [9]:
df_final.isnull().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [10]:
# Separar en X y y
y= df_final.Class
X= df_final.drop(columns='Class', axis=1)
print(X.shape, y.shape)

(1968, 29) (1968,)


In [11]:
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)

(1377, 29) (591, 29)


# Seleccion Manual

In [27]:
model= xgb.XGBClassifier(learning_rate=0.001)

#Lista de hiperparametros

params_1 = {'max_depth': 1}
params_2 = {'max_depth': 2}
params_3 = {'max_depth': 3}
params_4 = {'max_depth': 4}
params_5 = {'max_depth': 5}
params_6 = {'max_depth': 6}

In [28]:
# Modelo 1
model.set_params(**params_1).fit(X_train, y_train)
print(f'Accuracy para Modelo 1 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Modelo 2
model.set_params(**params_2).fit(X_train, y_train)
print(f'Accuracy para Modelo 2 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Modelo 3
model.set_params(**params_3).fit(X_train, y_train)
print(f'Accuracy para Modelo 3 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Modelo 4
model.set_params(**params_4).fit(X_train, y_train)
print(f'Accuracy para Modelo 4 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Modelo 5
model.set_params(**params_5).fit(X_train, y_train)
print(f'Accuracy para Modelo 5 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Modelo 5
model.set_params(**params_6).fit(X_train, y_train)
print(f'Accuracy para Modelo 6 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')

Accuracy para Modelo 1 = 0.94924
Accuracy para Modelo 2 = 0.95939
Accuracy para Modelo 3 = 0.96108
Accuracy para Modelo 4 = 0.95431
Accuracy para Modelo 5 = 0.95601
Accuracy para Modelo 6 = 0.95431


# Grid Search

In [32]:
params_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3,5,6,7],
        }

In [34]:
%%time
grid_cv = GridSearchCV(model, params_grid, scoring="accuracy", n_jobs=-1, cv=3)
grid_cv.fit(X_train, y_train)

print("Mejores Parametros", grid_cv.best_params_)
print("Mejor CV score", grid_cv.best_score_)
print(f'Accuracy del modelo = {round(accuracy_score(y_test, grid_cv.predict(X_test)), 5)}')

Mejores Parametros {'colsample_bytree': 0.8, 'gamma': 5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 1.0}
Mejor CV score 0.9600580973129992
Accuracy del modelo = 0.95939
CPU times: user 3.97 s, sys: 547 ms, total: 4.52 s
Wall time: 1min 18s


# Randomized Search CV

In [40]:
%%time
grid_cv = RandomizedSearchCV(model, params_grid, scoring="accuracy", n_jobs=-1, cv=3)
grid_cv.fit(X_train, y_train)

print("Mejores parametros", grid_cv.best_params_)
print("Mejor score de CV", grid_cv.best_score_)
print(f'Accuracy del modelo = {round(accuracy_score(y_test, grid_cv.predict(X_test)), 5)}')

Mejores parametros {'subsample': 0.8, 'min_child_weight': 1, 'max_depth': 3, 'gamma': 2, 'colsample_bytree': 0.6}
Mejor score de CV 0.9578794480755266
Accuracy del modelo = 0.96447
CPU times: user 702 ms, sys: 125 ms, total: 828 ms
Wall time: 1.41 s


# Halving Grid Search

In [41]:
%%time
halving_cv = HalvingGridSearchCV(model, params_grid, scoring="accuracy", factor=3)
halving_cv.fit(X_train, y_train)

print("Mejores parametros", halving_cv.best_params_)
print("Mejor Score CV", halving_cv.best_score_)
print(f'Accuracy del modelo = {round(accuracy_score(y_test, halving_cv.predict(X_test)), 5)}')

Mejores parametros {'colsample_bytree': 0.6, 'gamma': 5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8}
Mejor Score CV 0.9496884735202492
Accuracy del modelo = 0.96785
CPU times: user 11min 23s, sys: 2min 41s, total: 14min 4s
Wall time: 2min 8s


# Halving Randomized Search

In [42]:
%%time
halving_cv = HalvingRandomSearchCV(model, params_grid, scoring="accuracy", factor=3)
halving_cv.fit(X_train, y_train)

print("Mejores parametros", halving_cv.best_params_)
print("Mejor CV score", halving_cv.best_score_)

Mejores parametros {'subsample': 0.6, 'min_child_weight': 1, 'max_depth': 5, 'gamma': 1, 'colsample_bytree': 0.8}
Mejor CV score 0.9627898926964347
CPU times: user 1min 32s, sys: 23.1 s, total: 1min 56s
Wall time: 18.6 s


# Ejemplo Cancer


In [43]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [44]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [45]:
#Convertimos en dataframe
df = pd.DataFrame(np.c_[data['data'], data['target']],
                  columns= np.append(data['feature_names'], ['target']))

In [46]:
#Visualizamos el objeto
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [47]:
#Como son muchos atributos nos vamos a quedar unicamente con algunos de ellos
features= list(df.columns[0:10])
features

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension']

In [48]:
#A lo que ya tenemos le agregamos la variable: target
data = df[features + ['target']]
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.0


In [49]:
#Separamos en X e y como así también en Train y Test
X = data.drop(['target'],axis=1)
y = data['target']

# Dividimos los datos en Train y Test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [50]:
knn = KNeighborsClassifier()

In [51]:
knn = KNeighborsClassifier()
#Definicion de Hyperparámetros
param_grid = {'n_neighbors':np.arange(1, 10),
              'weights': ['uniform', 'distance'], 
              'leaf_size':[1,3,5,7,10],
              'algorithm':['auto', 'kd_tree']}

#Utilizamos la grilla definida anteriormente...
model = GridSearchCV(knn, param_grid=param_grid, cv=5)

In [52]:
model.fit(X_train, y_train)

In [53]:
print("Mejores parametros: "+str(model.best_params_))


Mejores parametros: {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 6, 'weights': 'distance'}


In [54]:
print("Mejor Score: "+str(model.best_score_)+'\n')

Mejor Score: 0.8849247606019152



In [67]:
scores = pd.DataFrame(model.cv_results_)
scores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001523,0.000134,0.001673,6.8e-05,auto,6,1,distance,"{'algorithm': 'auto', 'leaf_size': 6, 'n_neigh...",0.895349,0.858824,0.858824,0.823529,0.8,0.847305,0.032792,83
1,0.00169,0.00025,0.002304,0.00097,kd_tree,4,8,distance,"{'algorithm': 'kd_tree', 'leaf_size': 4, 'n_ne...",0.895349,0.882353,0.941176,0.823529,0.870588,0.882599,0.03802,4
2,0.00132,0.000119,0.003206,0.000246,kd_tree,6,3,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 6, 'n_ne...",0.895349,0.894118,0.941176,0.811765,0.870588,0.882599,0.042162,4
3,0.00109,0.000113,0.00253,0.000106,kd_tree,7,9,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 7, 'n_ne...",0.883721,0.870588,0.941176,0.823529,0.870588,0.877921,0.037685,32
4,0.001199,9.8e-05,0.001685,0.000511,auto,2,7,distance,"{'algorithm': 'auto', 'leaf_size': 2, 'n_neigh...",0.895349,0.882353,0.941176,0.823529,0.858824,0.880246,0.03904,28


In [57]:
prediction = model.predict(X_test)

In [58]:
print('Exactitud:', accuracy_score(y_test, prediction))

Exactitud: 0.9090909090909091


In [59]:
# Matriz de Confusion
cm = confusion_matrix(y_test,prediction)
print("Matriz de confusión:")
print(cm)

Matriz de confusión:
[[47  7]
 [ 6 83]]


# Ramdon search

In [60]:
# Grilla para Random Search
param_dist = {'n_neighbors':sp.stats.randint(1, 10),
              'weights': ['uniform', 'distance'], 
              'leaf_size':sp.stats.randint(1, 10),
              'algorithm':['auto', 'kd_tree']}

#Aplicamos la grilla al modelo
model = RandomizedSearchCV(knn, param_dist, n_iter=100, random_state=0, cv=5)

In [61]:
model.fit(X_train, y_train)

In [62]:
model.fit(X_train, y_train)

In [63]:
print("Mejores parametros: "+str(model.best_params_))
print("Mejor Score: "+str(model.best_score_)+'\n')

Mejores parametros: {'algorithm': 'kd_tree', 'leaf_size': 7, 'n_neighbors': 6, 'weights': 'distance'}
Mejor Score: 0.8849247606019152



In [64]:
prediction = model.predict(X_test)

In [65]:
print('Exactitud:', accuracy_score(y_test, prediction))

Exactitud: 0.9090909090909091


In [66]:
# Matriz de Confusion
cm = confusion_matrix(y_test,prediction)
print("Matriz de confusión:")
print(cm)

Matriz de confusión:
[[47  7]
 [ 6 83]]
