# Captura dos dados

In [3]:
from sklearn.datasets import load_wine, load_diabetes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, f1_score, r2_score
from sklearn.model_selection import cross_validate, train_test_split

In [None]:
data = load_wine()
X = data['data']
y = data['target']

# Separação dos dados
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# KNN

In [None]:
# Validação cruzada com os dados de treino
k_list = [1,3,5,7,9,15]

for k in k_list:
  knn_results = cross_validate(KNeighborsClassifier(n_neighbors=k), 
                              X_train, 
                              y_train, 
                              cv=10, 
                              scoring=['f1_macro'], 
                              return_train_score=True)
  print("K:", 
        k, 
        "| Train F1:", 
        knn_results['train_f1_macro'].mean(),
        "| Validation F1:", 
        knn_results['test_f1_macro'].mean()
  )

  # Test
  clf = KNeighborsClassifier(n_neighbors=k) # melhor k do cross validation aqui
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print("Test:", f1_score(y_test, y_pred, average='macro'))
  print()

K: 1 | Train F1: 1.0 | Validation F1: 0.7241234691234691
Test: 0.7880952380952381

K: 3 | Train F1: 0.8279918786411427 | Validation F1: 0.6501226551226551
Test: 0.7260560990364894

K: 5 | Train F1: 0.7764584044308321 | Validation F1: 0.6317978317978318
Test: 0.7260560990364894

K: 7 | Train F1: 0.7748408406098384 | Validation F1: 0.6527946127946128
Test: 0.747008547008547

K: 9 | Train F1: 0.7390964509328753 | Validation F1: 0.6823665223665224
Test: 0.7115740740740741

K: 15 | Train F1: 0.711169974640615 | Validation F1: 0.6910365560365561
Test: 0.7464048873216954



# Naive Bayes

## Multinomial

In [None]:
import pandas as pd
data = pd.DataFrame({
    "Dinheiro": [2,1,0,0,0,1],
    "Convite": [0,1,0,2,1,1],
    "Amigo": [0,2,2,0,2,0],
    "Parabéns": [1,0,1,1,0,0],
    "classe": ["Spam", "Spam", "Spam", "Não Spam", "Não Spam", "Não Spam"]
})
data

Unnamed: 0,Dinheiro,Convite,Amigo,Parabéns,classe
0,2,0,0,1,Spam
1,1,1,2,0,Spam
2,0,0,2,1,Spam
3,0,2,0,1,Não Spam
4,0,1,2,0,Não Spam
5,1,1,0,0,Não Spam


In [None]:
X = data[["Dinheiro", "Convite", "Amigo", "Parabéns"]]
y = data["classe"]

In [None]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=1)
clf.fit(X, y)

MultinomialNB(alpha=1)

In [None]:
y_test = pd.DataFrame({
    "Dinheiro": [1],
    "Convite": [1],
    "Amigo": [0],
    "Parabéns": [0],
})

clf.predict(y_test)

array(['Não Spam'], dtype='<U8')

## Gaussian

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

data = load_wine()
X = scaler.fit_transform(data['data'])
y = data['target']

# Separação dos dados
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.naive_bayes import GaussianNB

cv_results = cross_validate(GaussianNB(), 
                            X_train, 
                            y_train, 
                            cv=10, 
                            scoring=['f1_macro'], 
                            return_train_score=True)

print("| Train f1_macro:", 
      cv_results['train_f1_macro'].mean(),
      "| Validation f1_macro:", 
      cv_results['test_f1_macro'].mean()
)

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test:",  f1_score(y_test, y_pred, average='macro'))
print()

| Train f1_macro: 0.9774366517727371 | Validation f1_macro: 0.9608850408850408
Test: 1.0



# Regressão Linear

Usando o dataset [diabetes](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset)

In [None]:
data = load_diabetes()
X = data['data']
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [None]:
# import sklearn
# sorted(sklearn.metrics.SCORERS.keys())

In [None]:
cv_results = cross_validate(LinearRegression(), 
                            X_train, 
                            y_train, 
                            cv=10, 
                            scoring=['r2', 'neg_mean_squared_error'], 
                            return_train_score=True)

print("| Train R2:", 
      cv_results['train_r2'].mean(),
      "| Validation R2:", 
      cv_results['test_r2'].mean()
)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print("Test:", r2_score(y_test, y_pred))
print()

| Train R2: 0.525808388286437 | Validation R2: 0.39925729042152164
Test: 0.4772920174157329



# Regressão Logística

In [2]:
data = load_wine()
X = data['data']
y = data['target']

# Separação dos dados
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [6]:
# Validação cruzada com os dados de treino
penalties = ['none', 'l2']

for penalty in penalties:
  lr_results = cross_validate(LogisticRegression(solver='newton-cg', penalty=penalty), 
                              X_train, 
                              y_train, 
                              cv=10, 
                              scoring=['f1_macro'], 
                              return_train_score=True)
  print("penalty:", 
        penalty, 
        "| Train F1:", 
        lr_results['train_f1_macro'].mean(),
        "| Validation F1:", 
        lr_results['test_f1_macro'].mean()
  )

penalty: none | Train F1: 1.0 | Validation F1: 0.9142159692159693
penalty: l2 | Train F1: 0.9933137285249961 | Validation F1: 0.9341125541125541


In [7]:
# Test
clf = LogisticRegression(solver='newton-cg', penalty='l2')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test:", f1_score(y_test, y_pred, average='macro'))
print()

Test: 1.0

