# <font color = 'green'> Machine Learning com Python </font>

# 1 - Primeiros Passos

## 1.1 - Importação da Base de Dados

In [5]:
# Importando módulos necessários
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Importando a base de dados
exams = pd.read_csv('/home/joeldspy/Modelos/datasets/exams.csv')

In [9]:
# Visualizando a dimensão da base de dados
exams.shape

(569, 35)

In [8]:
# Visualizando os primeiros registros
exams.head(3)

Unnamed: 0,id,diagnostico,exame_1,exame_2,exame_3,exame_4,exame_5,exame_6,exame_7,exame_8,...,exame_24,exame_25,exame_26,exame_27,exame_28,exame_29,exame_30,exame_31,exame_32,exame_33
0,842302,M,17.99,10.38,122.8,103.78,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.786,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,103.78,1326.0,0.08474,0.07864,0.0869,...,158.8,1956.0,0.1238,0.1866,0.2416,0.786,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,103.78,1203.0,0.1096,0.1599,0.1974,...,152.5,1709.0,0.1444,0.4245,0.4504,0.786,0.243,0.3613,0.08758,


In [13]:
# Variável constante
pd.value_counts(exams['exame_4'])

103.78    569
Name: exame_4, dtype: int64

In [21]:
# Variável nula
(exams.isna().sum() / len(exams)) * 100

id              0.000000
diagnostico     0.000000
exame_1         0.000000
exame_2         0.000000
exame_3         0.000000
exame_4         0.000000
exame_5         0.000000
exame_6         0.000000
exame_7         0.000000
exame_8         0.000000
exame_9         0.000000
exame_10        0.000000
exame_11        0.000000
exame_12        0.000000
exame_13        0.000000
exame_14        0.000000
exame_15        0.000000
exame_16        0.000000
exame_17        0.000000
exame_18        0.000000
exame_19        0.000000
exame_20        0.000000
exame_21        0.000000
exame_22        0.000000
exame_23        0.000000
exame_24        0.000000
exame_25        0.000000
exame_26        0.000000
exame_27        0.000000
exame_28        0.000000
exame_29        0.000000
exame_30        0.000000
exame_31        0.000000
exame_32        0.000000
exame_33       73.637961
dtype: float64

## 1.2 - Fazendo um Rápido Modelo

In [24]:
# Separando entre dados de entrada e saída
x = exams.drop(['diagnostico', 'id', 'exame_4', 'exame_33'], axis=1)
y = exams['diagnostico']

In [25]:
# Fazendo subset de treino e teste
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=100)

<div align='center'>
<img src='https://serokell.io/files/vz/vz1f8191.Ensemble-of-decision-trees.png' width=700px>
</div>

In [28]:
# Criando um modelo de Floresta Aleátoria
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)

In [32]:
# Treinando o modelo
rfc.fit(x_train, y_train)

In [38]:
# Predizendo variáveis
y_pred = rfc.predict(x_test)

In [46]:
# Validando o modelo
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_test)
print(f'Com RandomForestClassifier e um total de {x.shape[1]} variáveis treinadas, temos uma acurácia de {accuracy:.2f}')

Com RandomForestClassifier e um total de 31 variáveis treinadas, temos uma acurácia de 0.98


## 1.3 - Estabelecendo um Modelo de Baseline

In [49]:
# Criando um modelo Dummy para baseline
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier()

dummy.fit(x_train, y_train)

y_pred = dummy.predict(x_test)

accuracy = accuracy_score(y_pred, y_test)

print(f'Com DummyClassifier e um total de {x.shape[1]} variáveis treinadas, temos uma acurácia de {accuracy:.2f}')

Com DummyClassifier e um total de 31 variáveis treinadas, temos uma acurácia de 0.63


# 2 - Melhorando Nosso Modelo

In [64]:
# Criando objeto para escolha das melhores variáveis
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
skb = SelectKBest(chi2, k=5)

In [65]:
# Separando entre treino e teste normalizados
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=100, stratify=y)

In [69]:
# Selecionando as melhores variáveis
skb.fit(x_train, y_train)
x_train = skb.transform(x_train)
x_test = skb.transform(x_test)

In [71]:
# Validando o novo modelo com apenas 5 variáveis
rfc = RandomForestClassifier(n_estimators=100)

rfc.fit(x_train, y_train)

y_pred = rfc.predict(x_test)

accuracy = accuracy_score(y_pred, y_test)

print(f'Com RandomForestClassifier e um total de {x.shape[1]} variáveis treinadas, temos uma acurácia de {accuracy:.2f}')

Com RandomForestClassifier e um total de 31 variáveis treinadas, temos uma acurácia de 0.94


In [75]:
# Variáveis escolhidas pelo SelectKBest
x.columns[skb.get_support()]

Index(['exame_3', 'exame_5', 'exame_15', 'exame_24', 'exame_25'], dtype='object')