In [1]:
# Import libraries
import pandas as pd #biblioteca para manipulação de dados
import numpy as np #biblioteca para utilizacao de vetores e matrizes
import matplotlib.pyplot as plt #bibloteca para plotar graficos

In [16]:
# Load the raw dataset from the comma-separated file into the `df` DataFrame
df = pd.read_csv('data/data.csv', sep=',')

In [17]:
# Preview the first rows (head() returns five by default) to sanity-check the parsed columns
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,Class
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


### Attribute Information
V1. Age of the patient. Any patient whose age exceeded 89 is listed as being of age "90".

V2. Gender of the patient

V3. Total Bilirubin

V4. Direct Bilirubin

V5. Alkphos: Alkaline Phosphatase

V6. SGPT: Alanine Aminotransferase

V7. SGOT: Aspartate Aminotransferase

V8. Total Proteins

V9. Albumin

V10. A/G Ratio: Albumin-to-Globulin ratio

In [18]:
# Report the table dimensions as (number of rows, number of columns)
print('Samples and Features:', (len(df.index), len(df.columns)))

Samples and Features: (583, 11)


In [19]:
# Summary statistics; by default describe() only covers the numeric columns,
# so the still-textual V2 column is not part of the result
df.describe()

Unnamed: 0,V1,V3,V4,V5,V6,V7,V8,V9,V10,Class
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.318492,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.947064,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [20]:
# Dictionary that maps each gender label to its numeric code
name_to_class = {
    'Female': 0,
    'Male': 1
}

# Encode V2 only while it still holds the raw labels. Series.map yields NaN for
# every value that is missing from the mapping, so blindly re-running this cell
# on the already-encoded column would replace the whole column with NaN.
if not df['V2'].isin(list(name_to_class.values())).all():
    unexpected = set(df['V2'].unique()) - set(name_to_class)
    if unexpected:
        # Fail loudly instead of silently handing NaN to the models later on
        raise ValueError('Unexpected values in V2: {}'.format(unexpected))
    df['V2'] = df['V2'].map(name_to_class)

# Check the result
df.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,Class
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [21]:
# Store the labels in an array
y = np.array(df['Class'])

# Remove the label column from the original dataframe
df = df.drop('Class', axis = 1)

# Save the feature order only after the label column is gone, so that
# feature_list lines up one-to-one with the columns of the feature matrix
# (taking it before the drop would also list 'Class' as a feature)
feature_list = list(df.columns)

In [22]:
# Inspect which columns remain in df at this point
df.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10'], dtype='object')

In [23]:
# Materialise the feature table as a NumPy array for scikit-learn
X = np.array(df.values)

In [24]:
# Importando modelos
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint
from sklearn.metrics import f1_score, make_scorer

In [25]:
# Stratified 10-fold cross-validation
cv_strat = StratifiedKFold(n_splits=10)

# F1 scorer; f1_score defaults to pos_label=1, so F1 is measured for Class == 1
f1 = make_scorer(f1_score)

# Search space: three kernels, and C sampled uniformly from [1, 11]
distribSVC = {
    'kernel': ['sigmoid', 'poly', 'rbf'],
    'C': uniform(loc=1, scale=10),
}

svc_classif = SVC()

# Randomized search: 5 sampled configurations, each scored with the CV above
rd_svc = RandomizedSearchCV(
    estimator=svc_classif,
    param_distributions=distribSVC,
    n_iter=5,
    scoring=f1,
    cv=cv_strat,
    random_state=5762,
)
rd_svc.fit(X, y)

# Best sampled hyperparameters and their mean cross-validated F1
print(rd_svc.best_params_)
print(rd_svc.best_score_)

{'C': 3.996689355015553, 'kernel': 'poly'}
0.832818081808181


In [26]:
# Dump the complete cv_results_ dictionary of the SVC search (fit/score timings
# and the sampled parameters, among other entries) for every candidate tried
print('Resumo de todos os resultados encontrados:\n\n', rd_svc.cv_results_)

Resumo de todos os resultados encontrados:

 {'mean_fit_time': array([0.53558717, 0.37390606, 0.01241815, 0.30183454, 0.01080263]), 'std_fit_time': array([0.27597957, 0.21344638, 0.00142818, 0.22016555, 0.00091975]), 'mean_score_time': array([0.00126843, 0.00169272, 0.00196898, 0.00153062, 0.00133736]), 'std_score_time': array([4.53771128e-05, 5.36561198e-04, 4.12790002e-04, 7.04151249e-04,
       8.81195230e-05]), 'param_C': masked_array(data=[9.070941686647695, 3.996689355015553, 2.96908678529896,
                   3.038065179879575, 6.043494590721352],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['poly', 'poly', 'rbf', 'poly', 'rbf'],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 9.070941686647695, 'kernel': 'poly'}, {'C': 3.996689355015553, 'kernel': 'poly'}, {'C': 2.96908678529896, 'kernel': 'rbf'}, {'C': 3.0380

In [27]:
# Look at the best result found by the randomized search (RandomizedSearchCV,
# not a grid search): best mean F1 and the hyperparameters that produced it
print('Melhor resultado f1:', rd_svc.best_score_)
print('\n\nMelhor configuração de hiperparâmetros:', rd_svc.best_params_)

# Full hyperparameter configuration of the best estimator
print( '\n\nConfigurações de todos os hiperparâmetros do melhor estimado encontrado pelo RandomizedSearchCV: \n', rd_svc.best_estimator_)

Melhor resultado f1: 0.832818081808181


Melhor configuração de hiperparâmetros: {'C': 3.996689355015553, 'kernel': 'poly'}


Configurações de todos os hiperparâmetros do melhor estimado encontrado pelo RandomizedSearchCV: 
 SVC(C=3.996689355015553, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale',
    kernel='poly', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [31]:
# Cross-validation scheme: stratified k-fold with 10 folds
cv_strat = StratifiedKFold(n_splits=10)

# Scoring strategy built from the F1 metric
f1 = make_scorer(f1_score)

# Hyperparameter search space for the random forest
distributions1 = {
    'n_estimators': randint(10, 1000),  # integers drawn from [10, 1000)
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

# Base classifier with a fixed seed
classifier1 = RandomForestClassifier(random_state=5762)

# Randomized (not grid) search: 5 sampled configurations, evaluated with the
# cross-validation scheme and scorer defined above
random_cv1 = RandomizedSearchCV(
    estimator=classifier1,
    param_distributions=distributions1,
    n_iter=5,
    scoring=f1,
    cv=cv_strat,
    random_state=5762,
)

# Fit the search; kept as the last expression so the fitted object is displayed
random_cv1.fit(X, y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
                   error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_s...
                     

In [33]:
# Dump the complete cv_results_ dictionary of the random-forest search
# (fit/score timings and the sampled parameters, among other entries)
print('Resumo de todos os resultados encontrados:\n\n', random_cv1.cv_results_)

Resumo de todos os resultados encontrados:

 {'mean_fit_time': array([1.28246496, 1.36341739, 0.16592247, 0.77853243, 0.8945883 ]), 'std_fit_time': array([0.05087068, 0.07413543, 0.04115126, 0.00519931, 0.17706472]), 'mean_score_time': array([0.0735739 , 0.0672986 , 0.00845034, 0.04518454, 0.05367661]), 'std_score_time': array([4.66719733e-03, 3.12413956e-03, 7.66848482e-05, 5.78400960e-03,
       3.35764708e-02]), 'param_bootstrap': masked_array(data=[False, False, False, True, False],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_criterion': masked_array(data=['gini', 'entropy', 'entropy', 'gini', 'entropy'],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[797, 740, 85, 488, 417],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'bootstrap': False, 'criter

In [32]:
# Look at the best result found by the randomized search (RandomizedSearchCV,
# not a grid search): best mean F1 and the hyperparameters that produced it
print('Melhor resultado f1:', random_cv1.best_score_)
print('\n\nMelhor configuração de hiperparâmetros:', random_cv1.best_params_)

# Full hyperparameter configuration of the best estimator
print( '\n\nConfigurações de todos os hiperparâmetros do melhor estimado encontrado pelo RandomizedSearchCV: \n', random_cv1.best_estimator_)

Melhor resultado f1: 0.7984322518413547


Melhor configuração de hiperparâmetros: {'bootstrap': True, 'criterion': 'gini', 'n_estimators': 488}


Configurações de todos os hiperparâmetros do melhor estimado encontrado pelo RandomizedSearchCV: 
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=488,
                       n_jobs=None, oob_score=False, random_state=5762,
                       verbose=0, warm_start=False)
