In [10]:
conda install -c conda-forge xgboost
pip install xgboost

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\ProgramData\anaconda3

  added / updated specs:
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _py-xgboost-mutex-2.0      |            cpu_0          11 KB  conda-forge
    ca-certificates-2023.5.7   |       h56e8100_0         145 KB  conda-forge
    certifi-2023.5.7           |     pyhd8ed1ab_0         149 KB  conda-forge
    libxgboost-1.7.3           |       hd77b12b_0         1.5 MB
    py-xgboost-1.7.3           |  py310haa95532_0         200 KB
    xgboost-1.7.3              |  py310haa95532_0          12 KB
    ------------------------------------------------------------
                                           Total:         2.0 MB

The following NEW packages will be INSTALLED:

  _py-xgboost-mutex  cond


EnvironmentNotWritableError: The current user does not have write permissions to the target environment.
  environment location: C:\ProgramData\anaconda3




In [None]:
# Restart the kernel after the install above, then run the notebook starting from the cell below.

In [331]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the raw comma-separated file; it has no header row, so columns are
# numbered 0..N-1 until they are named explicitly.
df = pd.read_csv(
    './data/car.data',
    header=None,
    sep=',',
)

In [5]:
# Assign readable names to the seven positional columns.
df.columns = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'class',
]

In [7]:
# (rows, columns) of the loaded frame.
df.shape

(1728, 7)

In [16]:
# Row count per distinct 'buying' value.
df['buying'].value_counts()

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64

In [27]:
# Distinct values of 'buying', in order of first appearance.
df['buying'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [20]:
# Distinct values of 'maint', in order of first appearance.
df['maint'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [21]:
# Distinct values of 'doors', in order of first appearance.
df['doors'].unique()

array(['2', '3', '4', '5more'], dtype=object)

In [22]:
# Distinct values of 'persons', in order of first appearance.
df['persons'].unique()

array(['2', '4', 'more'], dtype=object)

In [23]:
# Distinct values of 'lug_boot', in order of first appearance.
df['lug_boot'].unique()

array(['small', 'med', 'big'], dtype=object)

In [24]:
# Distinct values of 'safety', in order of first appearance.
df['safety'].unique()

array(['low', 'med', 'high'], dtype=object)

In [25]:
# Distinct values of 'class', in order of first appearance.
df['class'].unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [26]:
# Row count per distinct 'class' value.
df['class'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [28]:
# Number of missing (null) entries in each column.
df.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [31]:
# Per-column dtypes of the frame as loaded.
df.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

In [32]:
# Drop the 'persons' column.
# Rebinding `df` to the returned frame (instead of inplace=True) together with
# errors='ignore' makes this cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because the column was already gone.
df = df.drop(columns='persons', errors='ignore')

In [42]:
# One-hot encode the categorical columns listed in `features`.
# Every column in this dataset is categorical; 'buying' is not in the list
# and is therefore left unencoded.
features = [
    'maint',
    'doors',
    'lug_boot',
    'safety',
    'class',
]

df = pd.get_dummies(data=df, columns=features)

In [340]:
# Per-column dtypes of the frame after the encoding step.
df.dtypes # Check the dtypes of the new (encoded) df

buying            object
maint_high         uint8
maint_low          uint8
maint_med          uint8
maint_vhigh        uint8
doors_2            uint8
doors_3            uint8
doors_4            uint8
doors_5more        uint8
lug_boot_big       uint8
lug_boot_med       uint8
lug_boot_small     uint8
safety_high        uint8
safety_low         uint8
safety_med         uint8
class_acc          uint8
class_good         uint8
class_unacc        uint8
class_vgood        uint8
dtype: object

In [373]:
# Split dataset for train test
# Target is 'buying'; every other column is a feature.
y = df['buying']
X = df.drop(columns='buying')

# Integer-encode the string target (fit and transform in one step).
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(y)

# Split 1: string labels, 20% held out, seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Split 2: integer-encoded labels, 30% held out, seed 7.
# NOTE(review): the two splits use different test sizes and seeds, so the two
# held-out sets contain different rows.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, label_encoded_y, test_size=0.3, random_state=7
)

In [374]:
# Try Decision Tree Classifier: sweep max_depth from 1 to 9.
# The reported score is computed on (X_test, y_test), so it is a *test*
# accuracy; the message previously mislabelled it as training accuracy.
# NOTE(review): choosing max_depth from this score tunes the model on the
# test split.
for max_d in range(1, 10):
    dtree = DecisionTreeClassifier(max_depth=max_d, random_state=42)
    dtree.fit(X_train, y_train)

    print('Test Accuracy for max_depth {} is: '.format(max_d), dtree.score(X_test, y_test))

# Recorded result: max_depth=4 gave the best score, 0.3208092485549133

Training Accuracy for max_depth 1 is:  0.27167630057803466
Training Accuracy for max_depth 2 is:  0.2861271676300578
Training Accuracy for max_depth 3 is:  0.30346820809248554
Training Accuracy for max_depth 4 is:  0.3208092485549133
Training Accuracy for max_depth 5 is:  0.28901734104046245
Training Accuracy for max_depth 6 is:  0.27167630057803466
Training Accuracy for max_depth 7 is:  0.24855491329479767
Training Accuracy for max_depth 8 is:  0.18497109826589594
Training Accuracy for max_depth 9 is:  0.15028901734104047


In [375]:
# Sweep max_leaf_nodes from 2 to 9 with max_depth fixed at 4.
# The reported score is computed on (X_test, y_test), so it is a *test*
# accuracy; the message previously mislabelled it as training accuracy.
# NOTE(review): choosing max_leaf_nodes from this score tunes the model on
# the test split.
for max_leaf in range(2, 10):
    dtree = DecisionTreeClassifier(max_leaf_nodes=max_leaf, max_depth=4, random_state=42)
    dtree.fit(X_train, y_train)

    print('Test Accuracy for max_leaf {} is: '.format(max_leaf), dtree.score(X_test, y_test))

# Recorded result: max_leaf_nodes=7 gave the best score, 0.3352601156069364

Training Accuracy for max_leaf 2 is:  0.27167630057803466
Training Accuracy for max_leaf 3 is:  0.28901734104046245
Training Accuracy for max_leaf 4 is:  0.2861271676300578
Training Accuracy for max_leaf 5 is:  0.2861271676300578
Training Accuracy for max_leaf 6 is:  0.3063583815028902
Training Accuracy for max_leaf 7 is:  0.3352601156069364
Training Accuracy for max_leaf 8 is:  0.32947976878612717
Training Accuracy for max_leaf 9 is:  0.3265895953757225


In [384]:
# Try XGBoost
# Hyper-parameters are gathered in one mapping and unpacked into the estimator.
xgb_params = dict(
    booster='gbtree',
    objective='multi:softmax',
    n_estimators=100,
    learning_rate=0.2,
    max_depth=1,
    min_child_weight=2,
    max_delta_step=0,
    gamma=0,
    subsample=1,
    reg_alpha=0,
    reg_lambda=1,
    seed=0,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# Train on the integer-encoded split, then predict its held-out rows.
xgb_model.fit(X_train2, y_train2)
xgb_y_predict = xgb_model.predict(X_test2)

In [385]:
# Fraction of held-out rows whose predicted label equals the true label.
xgb_accuracy = accuracy_score(y_test2, xgb_y_predict)
print("Accuracy of XGBoost Model::", xgb_accuracy)

Accuracy of XGBoost Model:: 0.3179190751445087


In [389]:
# One hand-built one-hot row, written directly as a 2-D (1, 18) array so it can
# be passed to predict() without a separate reshape.
# Values are grouped 4+4+3+3+4 — presumably maint, doors, lug_boot, safety,
# class in X's column order; verify against X.columns.
val_input = np.array([[
    1, 0, 0, 0,
    0, 0, 1, 0,
    1, 0, 0,
    1, 0, 0,
    0, 1, 0, 0,
]])

val = xgb_model.predict(val_input)

In [391]:
print(val) 
# Prediction: buying = low (rows whose 'buying' is 'low' are encoded as 1 in label_encoded_y).
# NOTE: the ~0.318 figure is the model's accuracy on its test split, not a
# confidence for this single prediction (and 100% - 31.8% is 68.2%, not 69.3%).
# Per-sample class probabilities would come from xgb_model.predict_proba(val_input).

[1]


In [394]:
# Show the integer-encoded target to compare against the string labels in `y`.
print(label_encoded_y)

array([3, 3, 3, ..., 1, 1, 1])

In [395]:
# Display the original string-valued target series ('buying').
y

0       vhigh
1       vhigh
2       vhigh
3       vhigh
4       vhigh
        ...  
1723      low
1724      low
1725      low
1726      low
1727      low
Name: buying, Length: 1728, dtype: object