In [31]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder


In [2]:
df = pd.read_csv('./data/car.data', sep=',', header=None)

In [3]:
df.columns = ['buying','maint','doors','persons','lug_boot','safety','class']

In [4]:
df.shape

(1728, 7)

In [5]:
df['buying'].value_counts()

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64

In [6]:
df['buying'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [7]:
df['maint'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [8]:
df['doors'].unique()

array(['2', '3', '4', '5more'], dtype=object)

In [9]:
df['persons'].unique()

array(['2', '4', 'more'], dtype=object)

In [10]:
df['lug_boot'].unique()

array(['small', 'med', 'big'], dtype=object)

In [11]:
df['safety'].unique()

array(['low', 'med', 'high'], dtype=object)

In [12]:
df['class'].unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [13]:
df['class'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [14]:
df.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [15]:
df.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

In [16]:
X, y = df.drop('buying', axis=1), df['buying']

In [25]:
categories = X.select_dtypes('object').columns.tolist()

for col in categories:
    X[col] = X[col].astype('category')
    
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(y)
label_encoded_y = label_encoder.transform(y)


In [26]:
X.dtypes

maint       category
doors       category
persons     category
lug_boot    category
safety      category
class       category
dtype: object

In [27]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, label_encoded_y, random_state=1)

In [28]:
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [37]:
# Define hyperparameters
params = {"objective": "multi:softprob", "tree_method": "gpu_hist", "num_class":"1"}

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

XGBoostError: [23:01:51] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\objective\multiclass_obj.cu:123: SoftmaxMultiClassObj: label must be in [0, num_class).

In [32]:
predict = model.predict(dtest_reg)

In [34]:
y_test

array([2, 0, 0, 1, 0, 3, 1, 2, 2, 2, 3, 1, 1, 2, 3, 3, 1, 2, 1, 3, 2, 1,
       2, 2, 0, 3, 1, 2, 1, 2, 1, 1, 3, 0, 2, 3, 3, 1, 2, 0, 0, 0, 2, 1,
       1, 1, 1, 0, 1, 3, 2, 3, 0, 3, 2, 2, 0, 3, 2, 0, 0, 2, 2, 2, 0, 2,
       3, 3, 3, 1, 0, 2, 1, 0, 2, 0, 3, 3, 2, 0, 3, 2, 1, 1, 0, 2, 0, 1,
       0, 3, 0, 3, 0, 0, 2, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1,
       2, 3, 2, 1, 2, 3, 3, 2, 0, 0, 2, 1, 3, 3, 1, 1, 3, 1, 3, 2, 3, 0,
       1, 3, 1, 3, 3, 1, 1, 3, 2, 3, 0, 3, 3, 3, 1, 1, 1, 0, 3, 1, 2, 3,
       1, 1, 2, 1, 2, 0, 3, 3, 3, 1, 1, 2, 1, 0, 2, 2, 0, 0, 1, 3, 1, 1,
       0, 3, 3, 1, 1, 2, 3, 3, 0, 2, 0, 2, 1, 2, 0, 3, 1, 2, 2, 1, 2, 3,
       2, 1, 0, 2, 1, 0, 1, 3, 0, 2, 0, 1, 0, 0, 3, 1, 1, 2, 3, 0, 3, 2,
       1, 3, 3, 3, 3, 3, 1, 2, 2, 2, 0, 3, 2, 0, 2, 2, 2, 0, 2, 2, 1, 2,
       2, 1, 2, 1, 1, 1, 1, 2, 0, 2, 2, 3, 3, 1, 1, 0, 3, 0, 3, 0, 3, 0,
       2, 0, 1, 2, 1, 1, 2, 1, 1, 2, 3, 1, 2, 3, 3, 1, 3, 2, 3, 1, 3, 2,
       3, 3, 0, 3, 1, 3, 1, 1, 0, 2, 2, 1, 3, 2, 0,