In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv('all/train.csv',delimiter=',')


Os atributos preditores, listados a seguir, são de ordem quantitativa:

1. Age (idade, em anos)
2. BMI (índice de massa corporal, kg/m2)
3. Glucose (mg/dL)
4. Insulin (insulina, µU/mL)
5. HOMA
6. Leptin (ng/mL)
7. Adiponectin (µg/mL)
8. Resistin (ng/mL)
9. MCP-1(pg/dL)

O atributo alvo, Classification, denota os seguintes resultados:

1. Pessoa saudável
2. Pessoa acometida por câncer de mama

In [3]:
data.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,114,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72,1
1,104,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45,2
2,90,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73,1
3,8,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049,2
4,9,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499,1


In [4]:
data_novo = data.dropna(axis=0)

In [5]:
data_novo.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,114,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72,1
1,104,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45,2
2,90,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73,1
3,8,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049,2
4,9,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499,1


In [6]:
#separando id e target
data_id = data_novo['id']
y = data_novo['Classification']

In [7]:
X = data_novo.drop('Classification',axis=1)

In [8]:
X.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,114,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72
1,104,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45
2,90,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73
3,8,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049
4,9,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499


In [9]:
X.count()

id             92
Age            92
BMI            92
Glucose        92
Insulin        92
HOMA           92
Leptin         92
Adiponectin    92
Resistin       92
MCP.1          92
dtype: int64

In [10]:
X = X.drop('id',axis=1)

In [11]:
X.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,47,22.03,84,2.869,0.59,26.65,38.04,3.32,191.72
1,75,30.48,152,7.01,2.6283,50.53,10.06,11.73,99.45
2,25,22.86,82,4.09,0.8273,20.45,23.67,5.14,313.73
3,54,24.2188,86,3.73,0.7913,8.6874,3.7052,10.3446,635.049
4,69,35.0927,101,5.646,1.4066,83.4821,6.797,82.1,263.499


In [12]:
X.shape[0]

92

In [13]:
# Criação do modelo

In [14]:
param1 = {'hidden_layer_sizes':[(5,4),(4,5),(3,6),(9,),(8,1),(6,3),(1,8)],
          'activation':['relu','tanh','logistic'],'learning_rate_init':[0.01,0.1,0.001]}

In [15]:
model = MLPClassifier(solver='lbfgs')

In [16]:
clf = GridSearchCV(model,param1,cv=3)

In [35]:
cols1 = list(X.columns)
pca = PCA(n_components=3,whiten=True)
dados_pca = pca.fit_transform(X[cols1].values)
dados_pca = pd.DataFrame(dados_pca,columns=['comp1','comp2','comp3'])

In [36]:
dados_pca.head()

Unnamed: 0,comp1,comp2,comp3
0,-1.037363,-0.638602,0.684995
1,-1.326151,2.425318,-1.118042
2,-0.654195,-1.048118,1.362947
3,0.354275,-0.932294,-0.402165
4,-0.808327,2.65897,1.895473


In [37]:
#scaler = StandardScaler()  
# Don't cheat - fit only on training data
#scaler.fit(dados_pca)  
#X = scaler.transform(dados_pca)  
# apply same transformation to test data
#x_test = scaler.transform(x_test)  

In [38]:
clf.fit(X,y)

GridSearchCV(cv=3, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'hidden_layer_sizes': [(5, 4), (4, 5), (3, 6), (9,), (8, 1), (6, 3), (1, 8)], 'activation': ['relu', 'tanh', 'logistic'], 'learning_rate_init': [0.01, 0.1, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
pd.DataFrame(clf.best_params_)

Unnamed: 0,activation,hidden_layer_sizes,learning_rate_init
0,relu,9,0.01


In [40]:
clf.best_score_

0.6413043478260869

In [41]:
predict = clf.predict(X)

In [42]:
accuracy_score(y,predict)

0.5543478260869565

# test

In [43]:
data_test = pd.read_csv('all/test.csv',delimiter=',')

In [44]:
data_test.head()

Unnamed: 0,id,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,100,62,22.6562,92,3.482,0.7902,9.8648,11.2362,10.6955,703.973
1,78,29,23.01,82,5.663,1.1454,35.59,26.72,4.58,174.8
2,77,75,25.7,94,8.079,1.8733,65.926,3.7412,4.4968,206.802
3,113,44,27.8876,99,9.208,2.2486,12.6757,5.4782,23.0331,407.206
4,86,75,23.0,83,4.952,1.0138,17.127,11.579,7.0913,318.302


In [45]:
id_test = data_test['id']
data_test = data_test.drop('id',axis=1)

In [46]:
data_test.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,62,22.6562,92,3.482,0.7902,9.8648,11.2362,10.6955,703.973
1,29,23.01,82,5.663,1.1454,35.59,26.72,4.58,174.8
2,75,25.7,94,8.079,1.8733,65.926,3.7412,4.4968,206.802
3,44,27.8876,99,9.208,2.2486,12.6757,5.4782,23.0331,407.206
4,75,23.0,83,4.952,1.0138,17.127,11.579,7.0913,318.302


In [48]:
pred_test = clf.predict(data_test)

In [49]:
pred_test

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [50]:
data_out = {
    'id':id_test,
    'Classification':pred_test
}

In [51]:
classi_out = pd.DataFrame(data_out,columns=['id','Classification'])

In [52]:
classi_out

Unnamed: 0,id,Classification
0,100,2
1,78,2
2,77,2
3,113,2
4,86,2
5,84,2
6,89,2
7,56,2
8,31,2
9,48,2
