# Optimization hyperparameter

In [1]:
# Import dependencies 
import pandas as pd
import os
import matplotlib.pyplot as plt

In [10]:
# import dependencies
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Import encoded datafram
encoded_df3 = pd.read_csv('encoded_df3.csv')
encoded_df3

Unnamed: 0,state_beer,style,availability,abv,types,taste
0,36,German Hefeweizen,2,5.0,4,4.50
1,36,German Hefeweizen,6,5.6,2,4.50
2,43,German Hefeweizen,6,5.4,1,4.00
3,37,German Hefeweizen,4,5.0,10,4.25
4,46,German Hefeweizen,2,4.5,3,4.00
...,...,...,...,...,...,...
443995,4,Stout,6,5.8,3,4.50
443996,10,Stout,2,8.0,2,3.50
443997,4,Stout,2,11.0,10,5.00
443998,22,Stout,2,13.1,5,4.75


In [3]:
#encode string values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded_df4 = encoded_df3.copy()
encoded_df4['style'] = le.fit_transform(encoded_df4['style'])
encoded_df4

Unnamed: 0,state_beer,style,availability,abv,types,taste
0,36,18,2,5.0,4,4.50
1,36,18,6,5.6,2,4.50
2,43,18,6,5.4,1,4.00
3,37,18,4,5.0,10,4.25
4,46,18,2,4.5,3,4.00
...,...,...,...,...,...,...
443995,4,36,6,5.8,3,4.50
443996,10,36,2,8.0,2,3.50
443997,4,36,2,11.0,10,5.00
443998,22,36,2,13.1,5,4.75


In [40]:
# Define the target set
y = encoded_df3["style"].values
y

array(['German Hefeweizen', 'German Hefeweizen', 'German Hefeweizen', ...,
       'Stout', 'Stout', 'Stout'], dtype=object)

In [18]:
#define the features set
X = encoded_df3.copy()
X = X.drop(columns =["style"])
X

Unnamed: 0,state_beer,availability,abv,types,taste
0,36,2,5.0,4,4.50
1,36,6,5.6,2,4.50
2,43,6,5.4,1,4.00
3,37,4,5.0,10,4.25
4,46,2,4.5,3,4.00
...,...,...,...,...,...
443995,4,6,5.8,3,4.50
443996,10,2,8.0,2,3.50
443997,4,2,11.0,10,5.00
443998,22,2,13.1,5,4.75


In [41]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [42]:
# Create a StandardScaler instance
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [43]:
# Create a random forest classifier
xgb_model = XGBClassifier()
# fit the model
xgb_model = xgb_model.fit(X_train_scaled, y_train)





In [44]:
# make predictions for test data
predictions = xgb_model.predict(X_test_scaled)

In [45]:
#calculate matrix and accuracy score
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm)
acc_score = accuracy_score(y_test, predictions)


In [46]:
display(cm_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,2675,1,19,3,3,19,31,74,21,7,...,19,6,22,29,2,60,7,1,38,1
1,2,2907,10,207,74,0,0,0,3,41,...,14,5,1,1,24,5,123,62,22,6
2,4,1,2947,12,41,3,6,2,2,33,...,28,11,15,55,40,5,10,20,73,19
3,5,89,2,3157,47,3,0,0,2,5,...,6,1,0,1,16,5,76,37,5,1
4,12,50,34,59,2996,4,1,1,8,37,...,28,8,9,11,26,2,28,51,17,4
5,55,4,13,2,23,2319,47,66,79,5,...,18,4,18,36,2,8,2,0,50,9
6,34,0,4,0,0,14,2950,37,9,3,...,6,0,3,12,1,18,0,0,101,0
7,75,0,30,0,4,113,99,2145,50,32,...,21,35,27,29,6,27,3,3,65,4
8,77,15,66,17,17,27,38,62,2087,28,...,80,20,29,34,22,26,61,44,71,42
9,11,66,106,74,175,37,64,36,33,1985,...,56,35,15,52,78,19,55,71,93,66


In [47]:
# Display results
print("Confusion Matrix")
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix
Accuracy Score: 0.7102306852306852
Classification Report
                                             precision    recall  f1-score   support

                         American Cream Ale       0.68      0.73      0.71      3651
                                 Barleywine       0.76      0.79      0.77      3689
                             Belgian Dubbel       0.67      0.80      0.73      3668
                   Belgian Quadrupel (Quad)       0.81      0.87      0.84      3638
                             Belgian Tripel       0.67      0.84      0.74      3584
                            Belgian Witbier       0.70      0.63      0.66      3683
                            Berliner Weisse       0.65      0.80      0.72      3681
                                 Blonde Ale       0.60      0.58      0.59      3685
                                  Brown Ale       0.67      0.57      0.61      3671
                                   Dark Ale       0.67      0.53      0.59 

In [48]:
# calculate feature importance in the Random Forest model
importances = xgb_model.feature_importances_
importances

array([0.189438  , 0.2811871 , 0.2896496 , 0.18110017, 0.05862512],
      dtype=float32)

In [49]:
# Sort the features by importances
sorted(zip(xgb_model.feature_importances_, encoded_df3.columns), reverse = True)

[(0.2896496, 'availability'),
 (0.2811871, 'style'),
 (0.189438, 'state_beer'),
 (0.18110017, 'abv'),
 (0.058625124, 'types')]

# Pruner for XGBoost