# Machine Learning aplicado - Introdução

## Processo inteiro - primeira vez

Usando o conjunto de dados [Wine](https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-dataset)

In [None]:
from sklearn.datasets import load_wine

# Load the Wine toy dataset; the returned object is dict-like.
data = load_wine()
# Last expression of the cell, so the notebook displays the available fields.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [None]:
import pandas as pd

# Tabular view of the feature matrix, one named column per feature.
pd.DataFrame(
    data=data['data'],
    columns=data['feature_names'],
)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [None]:
# Show the class names, then the per-sample target vector, one per line.
print(data['target_names'], data['target'], sep='\n')

['class_0' 'class_1' 'class_2']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [None]:
from sklearn.neighbors import KNeighborsClassifier

  # Train a model
clf = KNeighborsClassifier()
# Every sample is used for fitting here; the original author's remark below
# presumably points at the missing train/test split — TODO confirm.
clf.fit(data['data'], data['target']) # note: there is still one thing we have not covered

KNeighborsClassifier()

In [None]:
from sklearn.metrics import classification_report

# Generate predictions for the same samples that were passed to clf.predict's
# input below, then print the per-class precision/recall/F1 summary.
y_pred = clf.predict(data['data'])
print(classification_report(data['target'], y_pred))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88        59
           1       0.80      0.75      0.77        71
           2       0.67      0.71      0.69        48

    accuracy                           0.79       178
   macro avg       0.78      0.78      0.78       178
weighted avg       0.79      0.79      0.79       178



## Processo inteiro - segunda vez

In [None]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the labels.
data = load_wine()
X, y = data['data'], data['target']

# Split the data: 30% of the samples are held out, with a fixed seed so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit on the training portion only (fit returns the estimator itself).
clf = KNeighborsClassifier().fit(X_train, y_train)

# Evaluate on the held-out portion.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        19
           1       0.75      0.71      0.73        21
           2       0.53      0.57      0.55        14

    accuracy                           0.74        54
   macro avg       0.73      0.73      0.73        54
weighted avg       0.74      0.74      0.74        54



# Validação Cruzada 10-folds

Métricas possíveis para o parâmetro [scoring](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter) de `cross_validate` (o mesmo parâmetro é aceito por `cross_val_score`)

https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [None]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
# train_test_split is used below, so it is imported here as well: the cell
# re-declares everything else it needs and should not rely on an earlier cell.
from sklearn.model_selection import cross_validate, train_test_split

data = load_wine()
X = data['data']
y = data['target']

# Split the data: 30% held out for the final test, fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Cross-validation on the training data only; the test set stays untouched.
# Both accuracy and macro-averaged F1 are computed, on the training folds too.
# NOTE(review): the section heading mentions 10 folds but cv=5 is used here —
# confirm which value is intended.
cross_validate(KNeighborsClassifier(),
               X_train,
               y_train,
               cv=5,
               scoring=['accuracy', 'f1_macro'],
               return_train_score=True)

{'fit_time': array([0.00201273, 0.00078321, 0.00071764, 0.00078297, 0.00078535]),
 'score_time': array([0.00348687, 0.00300026, 0.00256562, 0.00263   , 0.00254893]),
 'test_accuracy': array([0.64, 0.68, 0.6 , 0.64, 0.75]),
 'train_accuracy': array([0.82828283, 0.78787879, 0.76767677, 0.77777778, 0.82      ]),
 'test_f1_macro': array([0.62835596, 0.68582202, 0.56725146, 0.64460784, 0.69551657]),
 'train_f1_macro': array([0.81199642, 0.78218433, 0.75807752, 0.76720721, 0.81965573])}

In [None]:
# Train and test: fit on the training split, report on the held-out split.
clf = KNeighborsClassifier().fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        19
           1       0.75      0.71      0.73        21
           2       0.53      0.57      0.55        14

    accuracy                           0.74        54
   macro avg       0.73      0.73      0.73        54
weighted avg       0.74      0.74      0.74        54



## Pré-processamento
- One hot encoding (dummy)
- Ordinal encoding
- MinMaxScaler

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
import numpy as np

In [None]:
# Read the drug classification CSV straight from its raw GitHub URL.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/hype-usp/Grupos-de-estudos/main/Drug%20Classification/drug.csv"
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


## OneHot

In [None]:
# One-hot encode the categorical columns. With drop='first' the first category
# of each feature is omitted, so a feature with k categories yields k-1 columns.
encoder = OneHotEncoder(drop='first')

# Fit and transform in one step; the column list is written only once.
encoded_categories = encoder.fit_transform(df[["Sex", "BP", "Cholesterol"]])

# The encoder returns a sparse matrix by default, hence .toarray(); the column
# labels come from the fitted encoder.
df_categories = pd.DataFrame(
    encoded_categories.toarray(),
    columns=encoder.get_feature_names_out(),
)
df_categories

Unnamed: 0,Sex_M,BP_LOW,BP_NORMAL,Cholesterol_NORMAL
0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0
2,1.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0
...,...,...,...,...
195,0.0,1.0,0.0,0.0
196,1.0,1.0,0.0,0.0
197,1.0,0.0,1.0,0.0
198,1.0,0.0,1.0,1.0


## OrdinalEncoder

In [None]:
# Ordinal encoding with an explicit category order per column; the order of
# the lists must match the order of the columns passed to fit_transform.
encoder = OrdinalEncoder(
    categories=[
        ["LOW", "NORMAL", "HIGH"],  # BP
        ["NORMAL", "HIGH"],         # Cholesterol
        ["F", "M"],                 # Sex
    ]
)
encoded_categories = encoder.fit_transform(df[["BP", "Cholesterol", "Sex"]])

# Wrap the encoded array in a labelled frame (this rebinds df_categories).
df_categories = pd.DataFrame(
    encoded_categories,
    columns=['BP', 'Cholesterol', 'Sex'],
)
df_categories

Unnamed: 0,BP,Cholesterol,Sex
0,2.0,1.0,0.0
1,0.0,1.0,1.0
2,0.0,1.0,1.0
3,1.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
195,0.0,1.0,0.0
196,0.0,1.0,1.0
197,1.0,1.0,1.0
198,1.0,0.0,1.0


## Escala de variáveis numéricas

In [None]:
# Rescale the numeric columns with MinMaxScaler and rebuild a labelled frame.
scaler = MinMaxScaler()

df_numeric = pd.DataFrame(
    scaler.fit_transform(df[['Age', 'Na_to_K']]),
    columns=['Age', 'Na_to_K'],
)
df_numeric

Unnamed: 0,Age,Na_to_K
0,0.135593,0.596848
1,0.542373,0.213397
2,0.542373,0.120239
3,0.220339,0.047814
4,0.779661,0.368191
...,...,...
195,0.694915,0.165676
196,0.016949,0.179405
197,0.627119,0.113359
198,0.135593,0.242385


In [None]:
# Feature matrix: the encoded categorical columns placed side by side with the
# scaled numeric ones. Target: the 'Drug' column of the original frame.
drug_X = pd.concat(
    [df_categories, df_numeric],
    axis='columns',
)
drug_y = df['Drug']

## Aplicação

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report

In [None]:
# Reload the Wine dataset and pull out the feature matrix and the labels.
data = load_wine()
X, y = data['data'], data['target']

In [None]:
# Fit the scaler on the training rows only, so that the min/max statistics of
# the held-out rows do not leak into preprocessing (data leakage).
# This preliminary split must use the same arguments as the train_test_split
# call that follows in the notebook: with an identical random_state and an
# input of the same length, it selects the same rows for training.
X_train_raw = train_test_split(X, random_state=42)[0]

scaler = MinMaxScaler()
scaler.fit(X_train_raw)

# Scale every row with the training-set statistics; held-out rows may
# therefore fall slightly outside the [0, 1] range.
X = scaler.transform(X)

In [None]:
# Hold out part of the samples for the final evaluation. No test_size is
# passed, so scikit-learn's default proportion applies; the fixed seed keeps
# the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# 10-fold cross-validation on the training split, scored with macro-averaged
# F1; training-fold scores are kept so both can be compared.
cv_results = cross_validate(
    KNeighborsClassifier(),
    X_train,
    y_train,
    cv=10,
    scoring=['f1_macro'],
    return_train_score=True,
)

# Average the per-fold scores into a single train/validation pair.
train_f1 = cv_results['train_f1_macro'].mean()
validation_f1 = cv_results['test_f1_macro'].mean()
print("Train F1:", train_f1, "| Validation F1:", validation_f1)

# Final model: fit on the whole training split, report on the held-out split.
clf = KNeighborsClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

Train F1: 0.9716903669184674 | Validation F1: 0.9557263107263108
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       1.00      0.83      0.91        18
           2       0.86      1.00      0.92        12

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.94      0.93      0.93        45

