# <font color='green'> Validação de Modelos </font>

# 1 - Validação Simples

In [35]:
# Importando bibliotecas necessárias
import pandas as pd
import numpy as np

In [36]:
# Importando base de dados
cars = pd.read_csv('/home/joeldspy/Modelos/datasets/cars.csv').iloc[:, 1:]

In [37]:
# Visualizando os 5 primeiros registros da base de dados
cars.head()

Unnamed: 0,prc,sld,age,kpy
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.5,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.1129


In [4]:
# Separando dados entre dados de entrada e saída
x = cars.drop(columns=['sld'])
y = cars['sld']

In [38]:
# Estabelencendo uma baseline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

np.random.seed(100)

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)

dummy = DummyClassifier()

dummy.fit(x_train, y_train)

y_pred = dummy.predict(x_test)

acr = accuracy_score(y_pred, y_test)

print(f'Com o modelo DummyClassifier, a acurácia foi de {acr:.2f}')

Com o modelo DummyClassifier, a acurácia foi de 0.58


In [39]:
# Gerando um modelo inicial
from sklearn.tree import DecisionTreeClassifier

np.random.seed(100)

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)

dtc = DecisionTreeClassifier(max_depth=2)

dtc.fit(x_train, y_train)

y_pred = dtc.predict(x_test)

acr = accuracy_score(y_pred, y_test)

print(f'Com o modelo DecisionTreeClassifier, a acurácia foi de {acr:.2f}')

Com o modelo DecisionTreeClassifier, a acurácia foi de 0.76


# 2 - Aplicando Validação Cruzada

In [71]:
# Aplicando a validação de cruzada
from sklearn.model_selection import cross_validate

np.random.seed(100)

model = DecisionTreeClassifier(max_depth=2)

results = cross_validate(model, x, y, cv = 5, return_train_score=False)

sigma = results['test_score'].std()
mean = results['test_score'].mean()

interval = [mean - 2 * sigma, mean + 2 * sigma]

interval

[0.7438200143061588, 0.771779985693841]

In [73]:
def print_results(results):
    sigma = results['test_score'].std()
    mean = results['test_score'].mean()

    interval = [mean - 2 * sigma, mean + 2 * sigma]

    print(interval)

In [74]:
# Aplicando a validação de cruzada com aleatoriedade
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

np.random.seed(100)

model = DecisionTreeClassifier(max_depth=2)

cv = KFold(n_splits=10, shuffle=True)

results = cross_validate(model, x, y, cv = cv, return_train_score=False)

print_results(results)

[0.7374864577190486, 0.7781135422809514]


In [76]:
# Aplicando a validação de cruzada com algum azar
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

bad_luck = cars.sort_values(by='sld')

x = bad_luck.drop(columns=['sld'])
y = bad_luck['sld']

np.random.seed(100)

model = DecisionTreeClassifier(max_depth=2)

cv = KFold(n_splits=10, shuffle=False)

results = cross_validate(model, x, y, cv = cv, return_train_score=False)

print_results(results)

[0.3429273688939625, 0.8138726311060376]


In [82]:
# Aplicando a validação de cruzada com algum azar porém, aleatorizando os dados
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

bad_luck = cars.sort_values(by='sld')

x = bad_luck.drop(columns=['sld'])
y = bad_luck['sld']

np.random.seed(100)

model = DecisionTreeClassifier(max_depth=2)

cv = KFold(n_splits=10, shuffle=True)

results = cross_validate(model, x, y, cv = cv, return_train_score=False)

print_results(results)

[0.7374864577190486, 0.7781135422809514]


In [83]:
# Aplicando a validação de cruzada com algum azar porém, usando validação estratificada
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

bad_luck = cars.sort_values(by='sld')

x = bad_luck.drop(columns=['sld'])
y = bad_luck['sld']

np.random.seed(100)

model = DecisionTreeClassifier(max_depth=2)

cv = StratifiedKFold(n_splits=10, shuffle=True)

results = cross_validate(model, x, y, cv = cv, return_train_score=False)

print_results(results)

[0.7390225001655409, 0.7767774998344591]


# 3 - Criando uma Nova Coluna dos dados

In [117]:
cars.head(3)

Unnamed: 0,prc,sld,age,kpy,model
0,30941.02,1,18,35085.22134,Mobi
1,40557.96,1,20,12622.05362,Mobi
2,89627.5,0,12,11440.79806,Fox


In [118]:
cars['model'] = cars['age'] + np.random.randint(-2, 3, len(cars))

In [119]:
cars.head(3)

Unnamed: 0,prc,sld,age,kpy,model
0,30941.02,1,18,35085.22134,17
1,40557.96,1,20,12622.05362,20
2,89627.5,0,12,11440.79806,12


In [120]:
cars['model'].unique()

array([17, 20, 12,  4,  9, 18, 11,  3, 15, 13,  6, 19,  7, 21, 10, 14, 16,
        8,  1, 22,  5,  2,  0, -1])

In [121]:
cars['model'] = cars['model'] + abs(cars['model'].min()) + 1

In [122]:
car_models = {
    1: 'Fiesta',
    2: 'Gol',
    3: 'Corsa',
    4: 'Uno',
    5: 'Fit',
    6: 'Corolla',
    7: 'Focus',
    8: 'Voyage',
    9: 'Celta',
    10: 'Palio',
    11: 'City',
    12: 'Yaris',
    13: 'Ecosport',
    14: 'Fox',
    15: 'Onix',
    16: 'Siena',
    17: 'Civic',
    18: 'Etios',
    19: 'Hilux',
    20: 'Polo',
    21: 'Cruze',
    22: 'Mobi',
    23: 'Accord',
    24: 'Hilux'
}

cars['name'] = cars.model.replace(car_models)

In [124]:
cars.head()

Unnamed: 0,prc,sld,age,kpy,model,name
0,30941.02,1,18,35085.22134,19,Hilux
1,40557.96,1,20,12622.05362,22,Mobi
2,89627.5,0,12,11440.79806,14,Fox
3,95276.14,0,3,43167.32682,6,Corolla
4,117384.68,1,4,12770.1129,6,Corolla


In [139]:
# Aplicando a validação de cruzada com algum azar porém, usando validação por grupo
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GroupKFold

bad_luck = cars.sort_values(by='sld')

x = bad_luck.drop(columns=['sld', 'name', 'model'])
y = bad_luck['sld']

np.random.seed(100)

model = DecisionTreeClassifier(max_depth=2)

cv = GroupKFold(n_splits=10)

results = cross_validate(model, x, y, cv = cv, return_train_score=False, groups=cars.model)

print_results(results)

[0.7290359908703958, 0.7866242360966524]
