In [1]:
import pandas as pd
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings raised by the cells below.
warnings.simplefilter('ignore')

# Show up to 23 columns when a DataFrame is displayed.
pd.options.display.max_columns = 23

# Load the raw recipe table.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this exact location exists.
arquivo = pd.read_csv(
    'D:/Github/data-science/machine-learning/segundo-modulo/bagging/recipedata.csv',
    encoding="ISO-8859-1",
)

In [2]:
# Keep only the recipes whose StyleID is one of the twelve listed styles.
# .copy() makes `selecao` an independent DataFrame instead of a slice of
# `arquivo`, so the column drops and assignments performed on it afterwards
# act on its own data and never on a view of the raw table.
selecao = arquivo.loc[
    arquivo['StyleID'].isin([7, 10, 134, 9, 4, 30, 86, 12, 92, 6, 175, 39])
].copy()

In [3]:
# Remove the identifier, free-text and priming columns from the selection.
# The result is assigned back instead of using inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this
# cell does not raise KeyError.
selecao = selecao.drop(
    columns=['BeerID', 'Name', 'URL', 'Style', 'UserId', 'PrimingMethod', 'PrimingAmount'],
    errors='ignore',
)

In [4]:
# Preview the first three rows of the filtered selection.
selecao.head(n=3)

Unnamed: 0,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp
2,7,18.93,1.063,1.018,5.91,59.25,8.98,22.71,60,,70.0,,Specific Gravity,extract,,
3,7,22.71,1.061,1.017,5.8,54.48,8.5,26.5,60,,70.0,,Specific Gravity,All Grain,,
5,10,24.61,1.055,1.013,5.58,40.12,8.0,29.34,70,1.047,79.0,,Specific Gravity,All Grain,1.0,


In [5]:
# Encode the sugar-scale label as an integer code in a single pass:
# 'Specific Gravity' -> 0, 'Plato' -> 1. Any other value is left unchanged.
selecao['SugarScale'] = selecao['SugarScale'].replace(
    {'Specific Gravity': 0, 'Plato': 1}
)

In [6]:
# One-hot encode the text column 'BrewMethod' into one indicator column per
# brewing method. dtype=int keeps the indicators as 0/1 integers on every
# pandas version (pandas >= 2.0 would otherwise produce booleans).
brewmethod_encode = pd.get_dummies(selecao['BrewMethod'], dtype=int)

# Build the modelling table: every column of `selecao` except the original
# text column, followed by the indicator columns.
# `selecao` itself is not modified (it keeps 'BrewMethod'), so this cell can
# be re-run without raising KeyError.
concatenado = pd.concat(
    [selecao.drop(columns='BrewMethod'), brewmethod_encode],
    axis=1,
)

In [7]:
# Fill missing PitchRate values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `concatenado['PitchRate']` is chained assignment, which does not update
# `concatenado` when pandas Copy-on-Write is active.
concatenado['PitchRate'] = concatenado['PitchRate'].fillna(
    concatenado['PitchRate'].mean()
)

# Fill the missing values that remain in the other columns with each
# column's median.
# NOTE(review): both statistics are computed on the full table before the
# cross-validation cells below, so held-out folds influence the imputed
# values; consider imputing inside a scikit-learn Pipeline instead.
concatenado = concatenado.fillna(concatenado.median())

In [8]:
# Preview the first five rows of the encoded and imputed table.
concatenado.head(n=5)

Unnamed: 0,StyleID,Size(L),OG,FG,ABV,IBU,Color,BoilSize,BoilTime,BoilGravity,Efficiency,MashThickness,SugarScale,PitchRate,PrimaryTemp,All Grain,BIAB,Partial Mash,extract
2,7,18.93,1.063,1.018,5.91,59.25,8.98,22.71,60,1.047,70.0,1.5,0,0.677102,20.0,0,0,0,1
3,7,22.71,1.061,1.017,5.8,54.48,8.5,26.5,60,1.047,70.0,1.5,0,0.677102,20.0,1,0,0,0
5,10,24.61,1.055,1.013,5.58,40.12,8.0,29.34,70,1.047,79.0,1.5,0,1.0,20.0,1,0,0,0
6,86,22.71,1.072,1.018,7.09,268.71,6.33,30.28,90,1.047,75.0,1.5,0,0.677102,20.0,1,0,0,0
9,86,20.82,1.08,1.017,8.22,93.02,8.29,28.39,60,1.058,70.0,1.5,0,0.677102,21.11,1,0,0,0


In [9]:
# Split the table into predictors and target: every column except 'StyleID'
# is a feature, and 'StyleID' is the class to predict.
x = concatenado.drop(columns='StyleID')
y = concatenado['StyleID']

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Baseline model: a single decision tree with default hyper-parameters.
# random_state is fixed because the tree builder breaks ties between equally
# good splits at random; without a seed the score changes between runs.
modelo = DecisionTreeClassifier(random_state=42)

# 3-fold stratified cross-validation (class proportions preserved per fold).
skfold = StratifiedKFold(n_splits=3)
resultado = cross_val_score(modelo, x, y, cv=skfold, n_jobs=-1)

# Mean accuracy over the three folds.
print(resultado.mean())

0.47160117434507676


In [14]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try for the decision tree.
minimos_split = np.arange(2, 9)    # min_samples_split: 2..8
maximo_nivel = np.arange(5, 12)    # max_depth: 5..11
minimo_leaf = np.arange(1, 9)      # min_samples_leaf: 1..8
valores_grid = {
    'min_samples_split': minimos_split,
    'min_samples_leaf': minimo_leaf,
    'max_depth': maximo_nivel,
}

# Model to tune. random_state is fixed so that every grid point is evaluated
# with the same tie-breaking and the selected parameters are reproducible.
modelo = DecisionTreeClassifier(random_state=42)

# Exhaustive search over the grid with 3-fold cross-validation.
gridDecisionTree = GridSearchCV(estimator=modelo, param_grid=valores_grid, cv=3, n_jobs=-1)
gridDecisionTree.fit(x, y)

# Report the best parameter combination and its mean cross-validated accuracy.
melhores = gridDecisionTree.best_params_
print ("Mínimo split: ", melhores['min_samples_split'])
print ("Máxima produndidade: ", melhores['max_depth'])
print ("Mínimo leaf: ", melhores['min_samples_leaf'])
print ("Acurácia: ", gridDecisionTree.best_score_)

Mínimo split:  8
Máxima produndidade:  8
Mínimo leaf:  5
Acurácia:  0.5856481481481483


In [21]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Bagging ensemble of 50 estimators (scikit-learn's default base estimator),
# each fitted on a random sample of 50% of the training rows.
# random_state fixes the random sampling so the score is reproducible.
modelo = BaggingClassifier(n_estimators=50, max_samples=0.5, n_jobs=-1, random_state=42)

# 3-fold stratified cross-validation (class proportions preserved per fold).
skfold = StratifiedKFold(n_splits=3)
resultado = cross_val_score(modelo, x, y, cv=skfold, n_jobs=-1)

# Mean accuracy over the three folds.
print(resultado.mean())

0.6045054200542005


In [25]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Bagging ensemble of 10 logistic-regression models, each fitted on a random
# sample of 50% of the training rows.
# The base model is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on both.
# random_state fixes the random sampling so the score is reproducible.
modelo = BaggingClassifier(
    LogisticRegression(),
    n_estimators=10,
    max_samples=0.5,
    n_jobs=-1,
    random_state=42,
)

# 3-fold stratified cross-validation (class proportions preserved per fold).
skfold = StratifiedKFold(n_splits=3)
resultado = cross_val_score(modelo, x, y, cv=skfold, n_jobs=-1)

# Mean accuracy over the three folds.
print(resultado.mean())

0.44585591689250226


In [26]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Bagging ensemble of 10 Gaussian naive-Bayes models, each fitted on a random
# sample of 50% of the training rows.
# The base model is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on both.
# random_state fixes the random sampling so the score is reproducible.
modelo = BaggingClassifier(
    GaussianNB(),
    n_estimators=10,
    max_samples=0.5,
    n_jobs=-1,
    random_state=42,
)

# 3-fold stratified cross-validation (class proportions preserved per fold).
skfold = StratifiedKFold(n_splits=3)
resultado = cross_val_score(modelo, x, y, cv=skfold, n_jobs=-1)

# Mean accuracy over the three folds.
print(resultado.mean())

0.4250508130081301
