# Installing and importing dataset

In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 19 in the UCI repository.
car_evaluation = fetch_ucirepo(id=19)

# Features and targets as pandas DataFrames.
# Explicit copies are taken because later cells assign into columns of `x`;
# without the copy pandas treats `x` as a slice of another frame and emits
# SettingWithCopyWarning on every such assignment.
x = car_evaluation.data.features.copy()
y = car_evaluation.data.targets.copy()


In [None]:
# Preview the first rows of the feature frame.
x.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med


In [None]:
# Summary of the feature frame: row count, column names, non-null counts and dtypes.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
dtypes: object(6)
memory usage: 81.1+ KB


In [None]:
# Preview the first rows of the target frame.
y.head()

Unnamed: 0,class
0,unacc
1,unacc
2,unacc
3,unacc
4,unacc


In [None]:
# Summary of the target frame: row count, column name, non-null count and dtype.
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   1728 non-null   object
dtypes: object(1)
memory usage: 13.6+ KB


# Data Preprocessing

In [None]:
# List the distinct values that occur in each feature column.
for col, values in x.items():
    print(col, values.unique())

buying ['vhigh' 'high' 'med' 'low']
maint ['vhigh' 'high' 'med' 'low']
doors ['2' '3' '4' '5more']
persons ['2' '4' 'more']
lug_boot ['small' 'med' 'big']
safety ['low' 'med' 'high']


In [None]:
# List the distinct values that occur in each target column.
for col, values in y.items():
    print(col, values.unique())

class ['unacc' 'acc' 'vgood' 'good']


In [None]:
# Rename the ambiguous door category '5more' to 'more'.
# `assign` builds a new DataFrame instead of writing into a column of the
# existing one, which avoids pandas' SettingWithCopyWarning for this update.
x = x.assign(doors=x['doors'].replace('5more', 'more'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['doors'] = x['doors'].replace('5more', 'more')


In [None]:
# Re-list the distinct values per feature column to confirm the 'doors' rename.
for col, values in x.items():
    print(col, values.unique())

buying ['vhigh' 'high' 'med' 'low']
maint ['vhigh' 'high' 'med' 'low']
doors ['2' '3' '4' 'more']
persons ['2' '4' 'more']
lug_boot ['small' 'med' 'big']
safety ['low' 'med' 'high']


In [None]:
# Count missing values per feature column.
x.isna().sum()

Unnamed: 0,0
buying,0
maint,0
doors,0
persons,0
lug_boot,0
safety,0


In [None]:
# Count missing values in the target column.
y.isna().sum()

Unnamed: 0,0
class,0


In [None]:
# Count feature rows that exactly repeat an earlier row.
x.duplicated().sum()

0

# Feature Engineering

In [None]:
# Encode every categorical feature as an ordinal integer (larger = "more").
lug_boot_mapping = {'big': 3, 'med': 2, 'small': 1}
safety_mapping = {'high': 3, 'med': 2, 'low': 1}
buying_maint_mapping = {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}
# Both spellings of the top door category are accepted, so this cell works
# whether or not '5more' was renamed to 'more' beforehand.
doors_mapping = {'2': 1, '3': 2, '4': 3, 'more': 4, '5more': 4}
persons_mapping = {'2': 1, '4': 2, 'more': 3}

# Column name -> mapping applied to that column.
ordinal_mappings = {
    'lug_boot': lug_boot_mapping,
    'safety': safety_mapping,
    'buying': buying_maint_mapping,
    'maint': buying_maint_mapping,
    'doors': doors_mapping,
    'persons': persons_mapping,
}

# Build a new frame rather than assigning column-by-column into `x`; this avoids
# SettingWithCopyWarning and leaves `x` untouched if validation below fails.
x_mapped = x.assign(
    **{name: x[name].map(mapping) for name, mapping in ordinal_mappings.items()}
)

# `Series.map` yields NaN for any value missing from its mapping. That happens for
# unexpected categories, and also when this cell is re-run on already-encoded data.
unmapped = x_mapped[list(ordinal_mappings)].isna().sum()
if unmapped.any():
    raise ValueError(
        "Ordinal encoding produced missing values (unexpected category, or the "
        f"cell was run on already-encoded data): {unmapped[unmapped > 0].to_dict()}"
    )

x = x_mapped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['lug_boot'] = x['lug_boot'].map(lug_boot_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['safety'] = x['safety'].map(safety_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['buying'] = x['buying'].map(buying_maint_mapping)
A value is trying to be set on a copy of a slice from a

In [None]:
import pandas as pd

# Disabled alternative: one-hot encode columns that don't have an ordinal meaning.
# NOTE(review): 'doors' and 'persons' are already integer-encoded by the ordinal
# mapping in the previous cell, so re-enabling this would one-hot those integer codes.
# x = pd.get_dummies(x, columns=['doors', 'persons'])
# x = x.astype(int)

In [None]:
# Preview the feature frame after encoding.
x.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,4,4,1,1,1,1
1,4,4,1,1,1,2
2,4,4,1,1,1,3
3,4,4,1,1,2,1
4,4,4,1,1,2,2


# Training & Testing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the (rows, columns) shape of each partition.
print("Train Data:", f"{x_train.shape} {y_train.shape}", sep="\n")
print("Test Data:", f"{x_test.shape} {y_test.shape}", sep="\n")

Train Data:
(1209, 6) (1209, 1)
Test Data:
(519, 6) (519, 1)


In [None]:
# Train a random forest classifier and predict on the held-out rows.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model_rf = RandomForestClassifier(random_state=42)
# `y_train` is a single-column DataFrame here; scikit-learn expects a 1-D target
# and warns when given a column vector, so flatten it before fitting.
model_rf.fit(x_train, y_train.to_numpy().ravel())
y_pred = model_rf.predict(x_test)

  return fit_method(estimator, *args, **kwargs)


In [None]:
# Per-class precision / recall / F1 of `y_pred` against `y_test`.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix of `y_test` against `y_pred`.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

         acc       0.95      0.92      0.94       118
        good       0.74      0.89      0.81        19
       unacc       1.00      0.99      1.00       358
       vgood       0.83      0.83      0.83        24

    accuracy                           0.97       519
   macro avg       0.88      0.91      0.89       519
weighted avg       0.97      0.97      0.97       519

Confusion Matrix:
[[109   5   1   3]
 [  1  17   0   1]
 [  2   0 356   0]
 [  3   1   0  20]]


In [None]:
# using xgboost model
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Encode the target labels as integers for xgboost.
# The position in this list is the integer code: unacc=0, acc=1, good=2, vgood=3.
y_mapping = {
    label: code
    for code, label in enumerate(['unacc', 'acc', 'good', 'vgood'])
}
y_encoded = y['class'].map(y_mapping)

In [None]:
# Re-split with the integer-encoded target, using the same 70/30 ratio and seed as
# the earlier split. NOTE: this rebinds x_train / x_test / y_train / y_test / y_pred,
# so the random forest cells must be re-run before re-using those names for it.
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.3, random_state=42)

# Initialize the xgboost model; the seed is fixed for consistency with the
# random forest, which also uses random_state=42.
model = xgb.XGBClassifier(random_state=42)

# Train the model
model.fit(x_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(x_test)

# Integer codes and their original labels, in matching order, so the report
# shows class names instead of bare codes.
class_names = list(y_mapping)
class_codes = list(y_mapping.values())

# Evaluate the model
print("Accuracy on Test Data:", accuracy_score(y_test, y_pred))
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, labels=class_codes, target_names=class_names),
)

# Confusion matrix, with rows/columns in the order of `class_codes`
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_codes))

Accuracy on Test Data: 0.9788053949903661
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       358
           1       0.97      0.96      0.97       118
           2       0.74      0.89      0.81        19
           3       0.91      0.88      0.89        24

    accuracy                           0.98       519
   macro avg       0.91      0.93      0.92       519
weighted avg       0.98      0.98      0.98       519

Confusion Matrix:
[[357   1   0   0]
 [  0 113   4   1]
 [  0   1  17   1]
 [  0   1   2  21]]
