# Lista 7

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Wisconsin Diagnostic Breast Cancer (WDBC)

In [2]:
df =  pd.read_csv('wdbc.data', header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
0     569 non-null int64
1     569 non-null object
2     569 non-null float64
3     569 non-null float64
4     569 non-null float64
5     569 non-null float64
6     569 non-null float64
7     569 non-null float64
8     569 non-null float64
9     569 non-null float64
10    569 non-null float64
11    569 non-null float64
12    569 non-null float64
13    569 non-null float64
14    569 non-null float64
15    569 non-null float64
16    569 non-null float64
17    569 non-null float64
18    569 non-null float64
19    569 non-null float64
20    569 non-null float64
21    569 non-null float64
22    569 non-null float64
23    569 non-null float64
24    569 non-null float64
25    569 non-null float64
26    569 non-null float64
27    569 non-null float64
28    569 non-null float64
29    569 non-null float64
30    569 non-null float64
31    569 non-null float64
dtypes: float64(30), int64(1), obj

Como não defini o nome das colunas coloco uma variável com índice da label

In [5]:
label_col = 1

# Mapeando label

In [6]:
df[label_col].unique()

array(['M', 'B'], dtype=object)

In [7]:
label_map = {'M':0,'B':1}
df[label_col] = df[label_col].map(label_map)

# Separando em treino e teste / estratificando 

In [8]:
X = df.drop(label_col, axis=1).astype('float64')
y = df[label_col]

x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=df[label_col], shuffle=True, test_size=0.2, random_state=42)

In [9]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((455, 31), (114, 31), (455,), (114,))

# Standardizando features

In [10]:
stds = StandardScaler()

x_train_std = stds.fit_transform(x_train)
x_test_std = stds.transform(x_test)

# Definindo hiperparâmetros que serão usados no GridSearch

In [11]:
params = {'estimator__penalty': ['None','l1','l2','elasticnet'], 'estimator__alpha':[0.0001, 0.001, 0.1, 0.5]}

In [12]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [13]:
pipeline = Pipeline([('transformer', StandardScaler()),('estimator', Perceptron(max_iter=1000, tol=1e-3))])

gscv = GridSearchCV(pipeline, params, cv=cv, scoring='accuracy', iid=True, return_train_score=True)

In [14]:
gscv.fit(x_train, y_train);

# Exibindo tabela de resultados

In [15]:
cols = ['param_estimator__alpha','param_estimator__penalty','mean_train_score','mean_test_score']

In [16]:
pd.DataFrame(gscv.cv_results_)[cols].sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,param_estimator__alpha,param_estimator__penalty,mean_train_score,mean_test_score
2,0.0001,l2,0.982416,0.973626
3,0.0001,elasticnet,0.982416,0.973626
6,0.001,l2,0.978034,0.969231
7,0.001,elasticnet,0.978034,0.969231
5,0.001,l1,0.986813,0.964835
0,0.0001,,0.979123,0.96044
1,0.0001,l1,0.98462,0.96044
4,0.001,,0.979123,0.96044
8,0.1,,0.979123,0.96044
12,0.5,,0.979123,0.96044


## Criando um modelo com os melhores hiperparâmetros

In [17]:
model = Perceptron(max_iter=1000, tol=1e-3, alpha=0.0001, penalty='l2')

## Treinando modelo

In [18]:
model.fit(x_train_std, y_train);

## Calculando acurácia

In [19]:
y_pred = model.predict(x_test_std)

np.sum(y_pred == y_test) / y_test.shape[0]

0.9385964912280702