In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.preprocessing import StandardScaler

In [19]:
data = pd.read_csv('all/train.csv',delimiter=',')


Os atributos preditores, listados a seguir, são de ordem quantitativa:

1. Age (idade, em anos)
2. BMI (índice de massa corporal, kg/m2)
3. Glucose (mg/dL)
4. Insulin (insulina, µU/mL)
5. HOMA
6. Leptin (ng/mL)
7. Adiponectin (µg/mL)
8. Resistin (ng/mL)
9. MCP-1(pg/dL)

O atributo alvo, Classification, denota os seguintes resultados:

1. Pessoa saudável
2. Pessoa acometida por câncer de mama

In [20]:
data.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,114,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72,1
1,104,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45,2
2,90,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73,1
3,8,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049,2
4,9,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499,1


In [21]:
data_novo = data.dropna(axis=0)

In [22]:
data_novo.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,114,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72,1
1,104,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45,2
2,90,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73,1
3,8,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049,2
4,9,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499,1


In [23]:
#separando id e target
data_id = data_novo['id']
y = data_novo['Classification']

In [24]:
X = data_novo.drop('Classification',axis=1)

In [25]:
X.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,114,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72
1,104,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45
2,90,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73
3,8,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049
4,9,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499


In [26]:
X.count()

id             92
Age            92
BMI            92
Glucose        92
Insulin        92
HOMA           92
Leptin         92
Adiponectin    92
Resistin       92
MCP.1          92
dtype: int64

In [27]:
X = X.drop('id',axis=1)

In [28]:
X.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72
1,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45
2,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73
3,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049
4,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499


In [29]:
X.shape[0]

92

In [30]:
# Criação do modelo

In [31]:
param1 = {'hidden_layer_sizes':[(5,4),(4,5),(3,6),(9,),(8,1),(6,3),(1,8)],
          'activation':['relu','tanh','logistic'],'learning_rate_init':[0.01,0.1,0.001]}

In [32]:
model = MLPClassifier(solver='lbfgs')

In [33]:
clf = GridSearchCV(model,param1,cv=5)

In [35]:
scaler = StandardScaler()  
# Don't cheat - fit only on training data
scaler.fit(X)  
X = scaler.transform(X)  
# apply same transformation to test data
#x_test = scaler.transform(x_test)  

In [36]:
X

array([[-5.87680465e-01, -1.15278739e+00, -6.32991592e-01,
        -6.91029932e-01, -5.74386951e-01, -2.06745373e-02,
         3.94564566e+00, -9.20197994e-01, -1.04244854e+00],
       [ 1.14443038e+00,  5.43874060e-01,  3.12411979e+00,
        -2.89406347e-01,  1.67547890e-02,  1.24460148e+00,
        -1.92881888e-02, -2.44819027e-01, -1.33364129e+00],
       [-1.94862470e+00, -9.86133071e-01, -7.43494868e-01,
        -5.72608682e-01, -5.05565906e-01, -3.49180035e-01,
         1.90933046e+00, -7.74039882e-01, -6.57400011e-01],
       [-1.54652754e-01, -7.13301878e-01, -5.22488316e-01,
        -6.07524038e-01, -5.16006520e-01, -9.72418546e-01,
        -9.19801471e-01, -3.56075867e-01,  3.56643168e-01],
       [ 7.73263770e-01,  1.47005042e+00,  3.06286254e-01,
        -4.21696753e-01, -3.37559033e-01,  2.99056051e+00,
        -4.81674863e-01,  5.40636027e+00, -8.15922858e-01],
       [-7.73263770e-01, -1.40778858e+00, -5.22488316e-01,
        -2.36742351e-01, -2.81469737e-01, -6.861630

In [37]:
clf.fit(X,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'hidden_layer_sizes': [(5, 4), (4, 5), (3, 6), (9,), (8, 1), (6, 3), (1, 8)], 'activation': ['relu', 'tanh', 'logistic'], 'learning_rate_init': [0.01, 0.1, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [38]:
pd.DataFrame(clf.best_params_)

Unnamed: 0,activation,hidden_layer_sizes,learning_rate_init
0,relu,5,0.01
1,relu,4,0.01


In [39]:
clf.best_score_

0.7065217391304348

In [40]:
predict = clf.predict(X)

In [41]:
accuracy_score(y,predict)

0.9891304347826086

In [42]:
#test

In [43]:
data_test = pd.read_csv('all/test.csv',delimiter=',')

In [44]:
data_test.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,100,62,22.6562,92,3.482,0.7902,9.8648,11.2362,10.6955,703.973
1,78,29,23.01,82,5.663,1.1454,35.59,26.72,4.58,174.8
2,77,75,25.7,94,8.079,1.8733,65.926,3.7412,4.4968,206.802
3,113,44,27.8876,99,9.208,2.2486,12.6757,5.4782,23.0331,407.206
4,86,75,23.0,83,4.952,1.0138,17.127,11.579,7.0913,318.302


In [46]:
id_test = data_test['id']
data_test = data_test.drop('id',axis=1)

In [47]:
data_test.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,62,22.6562,92,3.482,0.7902,9.8648,11.2362,10.6955,703.973
1,29,23.01,82,5.663,1.1454,35.59,26.72,4.58,174.8
2,75,25.7,94,8.079,1.8733,65.926,3.7412,4.4968,206.802
3,44,27.8876,99,9.208,2.2486,12.6757,5.4782,23.0331,407.206
4,75,23.0,83,4.952,1.0138,17.127,11.579,7.0913,318.302


In [48]:
scaler = StandardScaler()  
# Don't cheat - fit only on training data
scaler.fit(data_test)  
data_test = scaler.transform(data_test)  
# apply same transformation to test data
#x_test = scaler.transform(x_test)  

In [50]:
pred_test = clf.predict(data_test)

In [51]:
pred_test

array([1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1])

In [52]:
data_out = {
    'id':id_test,
    'Classification':pred_test
}

In [53]:
classi_out = pd.DataFrame(data_out,columns=['id','Classification'])

In [54]:
classi_out

Unnamed: 0,id,Classification
0,100,1
1,78,1
2,77,1
3,113,1
4,86,1
5,84,1
6,89,2
7,56,1
8,31,2
9,48,2
