In [1]:
# importando bibliotecas necessárias
import pandas as pd
import numpy as np

# graphs
import matplotlib.pyplot as plt
import seaborn as sns

# models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
def print_metrics(model_name: str, y_test, y_pred, y_test_pca, y_pred_pca):
  return (f'''
Acurácia do Modelo {model_name} sem PCA: {accuracy_score(y_test, y_pred):.4f}
Acurácia do Modelo {model_name} com PCA: {accuracy_score(y_test_pca, y_pred_pca):.4f}

Precisão do Modelo {model_name} sem PCA: {precision_score(y_test, y_pred, average='macro'):.4f}
Precisão do Modelo {model_name} com PCA: {precision_score(y_test_pca, y_pred_pca, average='macro'):.4f}

Recall (Sensibilidade) do Modelo {model_name} sem PCA: {recall_score(y_test, y_pred, average='macro'):.4f}
Recall (Sensibilidade) do Modelo {model_name} com PCA: {recall_score(y_test_pca, y_pred_pca, average='macro'):.4f}
''')

In [16]:
# aqui a gente cala a boca do pandas e do sklearn
# importa o filtro
from warnings import simplefilter
from pandas.core.common import SettingWithCopyWarning
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning, FitFailedWarning

# monta o filtro
simplefilter("ignore", category=SettingWithCopyWarning)
simplefilter("ignore", category=ConvergenceWarning)
simplefilter("ignore", category=UndefinedMetricWarning)
simplefilter("ignore", category=FitFailedWarning)

In [4]:
dataset = pd.read_csv('aula2305/star_classification.csv')
dataset.head()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


In [5]:
X = dataset.drop(['obj_ID', 'run_ID', 'rerun_ID', 'field_ID', 'spec_obj_ID', 'plate', 'MJD', 'fiber_ID', 'cam_col', 'class'], axis=1)
y = dataset['class']

## convertendo class para inteiro
y = y.replace({'GALAXY': 0, 'QSO': 1, 'STAR': 2})

In [6]:
# normalizando os dados
X_std = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3, random_state=42)

In [9]:
#normalizando entre 0 e 1
X_normal = MinMaxScaler(feature_range=(0, 1), copy=True, clip=False).fit_transform(X)
X_train_normal, X_test_normal, y_train_normal, y_test_normal = train_test_split(X_normal, y, test_size=0.3, random_state=42)

In [10]:
# pca
pca = PCA(n_components=7)
X_pca = pca.fit_transform(X_normal)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

In [None]:
## rodar o minmaxscaler no pca de novo
X_normal = MinMaxScaler(feature_range=(0, 1), copy=True, clip=False).fit_transform(X)
X_train_normal, X_test_normal, y_train_normal, y_test_normal = train_test_split(X_normal, y, test_size=0.3, random_state=42)

In [20]:
# cross-validation
five_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(GaussianNB(), X_normal, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia Gaussian NB KF5 sem PCA: {scores.mean():.3f}, {scores.std():.3f}')

scores = cross_val_score(GaussianNB(), X_pca, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia Gaussian NB KF5 com PCA: {scores.mean():.3f}, {scores.std():.3f}')

Acurácia Gaussian NB KF5 sem PCA: 0.779, 0.071
Acurácia Gaussian NB KF5 com PCA: 0.746, 0.036


In [21]:
# cross-validation
five_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(MultinomialNB(), X_normal, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia MultinomialNB KF5 sem PCA: {scores.mean():.3f}, {scores.std():.3f}')

scores = cross_val_score(MultinomialNB(), X_pca, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia MultinomialNB KF5 com PCA: {scores.mean():.3f}, {scores.std():.3f}')

Acurácia MultinomialNB KF5 sem PCA: 0.595, 0.000
Acurácia MultinomialNB KF5 com PCA: nan, nan


In [26]:
# cross-validation
five_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(BernoulliNB(), X_normal, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia BernoulliNB KF5 sem PCA: {scores.mean():.3f}, {scores.std():.3f}')

scores = cross_val_score(BernoulliNB(), X_pca, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia BernoulliNB KF5 com PCA: {scores.mean():.3f}, {scores.std():.3f}')

Acurácia BernoulliNB KF5 sem PCA: 0.594, 0.000
Acurácia BernoulliNB KF5 com PCA: 0.748, 0.001


In [25]:
# cross-validation
five_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(ComplementNB(), X_normal, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia ComplementNB KF5 sem PCA: {scores.mean():.3f}, {scores.std():.3f}')

scores = cross_val_score(ComplementNB(), X_pca, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia ComplementNB KF5 com PCA: {scores.mean():.3f}, {scores.std():.3f}')

Acurácia ComplementNB KF5 sem PCA: 0.531, 0.003
Acurácia ComplementNB KF5 com PCA: nan, nan


In [24]:
# cross-validation
five_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(LogisticRegression(), X_normal, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia LogisticRegression KF5 sem PCA: {scores.mean():.3f}, {scores.std():.3f}')

scores = cross_val_score(LogisticRegression(), X_pca, y, cv=five_kf, scoring='accuracy')
print(f'Acurácia LogisticRegression KF5 com PCA: {scores.mean():.3f}, {scores.std():.3f}')

Acurácia LogisticRegression KF5 sem PCA: 0.927, 0.004
Acurácia LogisticRegression KF5 com PCA: 0.927, 0.002


### GridSearchCV


In [28]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC

scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}
gs = GridSearchCV(
    SVC(random_state=42),
    param_grid={"kernel": ["rbf", "linear", "poly", "sigmoid"], "C": [0.1, 1, 10]},
    scoring=scoring,
    refit="AUC",
    n_jobs=2,
    return_train_score=True,
)
gs.fit(X_train_pca, y_train_pca)
results = gs.cv_results_


Traceback (most recent call last):
  File "/home/joao/GitHub/Blue/trilha_dados/dados_mod1/.projeto2/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joao/GitHub/Blue/trilha_dados/dados_mod1/.projeto2/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/joao/GitHub/Blue/trilha_dados/dados_mod1/.projeto2/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 349, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/home/joao/GitHub/Blue/trilha_dados/dados_mod1/.projeto2/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/joao/GitHub/Blue/trilha_dados/dados_mod1/.

In [36]:
gs.best_params_

{'C': 0.1, 'kernel': 'rbf'}

In [38]:
gs.best_index_

0

In [39]:
results

{'mean_fit_time': array([ 92.49505363, 101.16523967, 102.69431963, 338.75588436,
         88.7710238 , 120.56264524, 163.36213851, 248.82410383,
         36.36293378,  48.10774732, 124.53932872, 173.44021678]),
 'std_fit_time': array([ 3.49007943,  2.6988809 ,  0.3748836 , 79.1638648 , 11.5882947 ,
        11.5790713 ,  6.69593146, 20.19203914,  1.96217827,  2.34079588,
         2.27664365, 19.63218003]),
 'mean_score_time': array([0.01599903, 0.00096684, 0.00103226, 0.0013114 , 0.00108037,
        0.001337  , 0.00198264, 0.00092607, 0.00078421, 0.00075369,
        0.00075054, 0.00065765]),
 'std_score_time': array([2.95687764e-02, 2.37844893e-04, 1.86581061e-04, 2.87247694e-04,
        1.88090144e-04, 4.09508279e-04, 1.53270037e-03, 2.94348275e-04,
        1.98259200e-04, 1.64695468e-04, 7.33466385e-05, 2.28213288e-05]),
 'param_C': masked_array(data=[0.1, 0.1, 0.1, 0.1, 1, 1, 1, 1, 10, 10, 10, 10],
              mask=[False, False, False, False, False, False, False, False,
          

In [43]:
results['mean_test_score']

KeyError: 'mean_test_score'