In [279]:
import pandas as pd


In [280]:
# Dataset reference notes, kept as a bare string so the cell echoes them back:
# the class labels, the allowed values of each attribute, the concept
# hierarchy, and the class distribution.
# NOTE(review): the top class is spelled 'vgood' in the label list but
# 'v-good' in the distribution table below — confirm which spelling the data uses.
'''
| class values

unacc, acc, good, vgood

| attributes

buying:   vhigh, high, med, low.
maint:    vhigh, high, med, low.
doors:    2, 3, 4, 5more.
persons:  2, 4, more.
lug_boot: small, med, big.
safety:   low, med, high.

CAR                      car acceptability
. PRICE                  overall price
. . buying               buying price
. . maint                price of the maintenance
. TECH                   technical characteristics
. . COMFORT              comfort
. . . doors              number of doors
. . . persons            capacity in terms of persons to carry
. . . lug_boot           the size of luggage boot
. . safety               estimated safety of the car

9. Class Distribution (number of instances per class)

   class      N          N[%]
   -----------------------------
   unacc     1210     (70.023 %) 
   acc        384     (22.222 %) 
   good        69     ( 3.993 %) 
   v-good      65     ( 3.762 %) 

'''

'\n| class values\n\nunacc, acc, good, vgood\n\n| attributes\n\nbuying:   vhigh, high, med, low.\nmaint:    vhigh, high, med, low.\ndoors:    2, 3, 4, 5more.\npersons:  2, 4, more.\nlug_boot: small, med, big.\nsafety:   low, med, high.\n\nCAR                      car acceptability\n. PRICE                  overall price\n. . buying               buying price\n. . maint                price of the maintenance\n. TECH                   technical characteristics\n. . COMFORT              comfort\n. . . doors              number of doors\n. . . persons            capacity in terms of persons to carry\n. . . lug_boot           the size of luggage boot\n. . safety               estimated safety of the car\n\n9. Class Distribution (number of instances per class)\n\n   class      N          N[%]\n   -----------------------------\n   unacc     1210     (70.023 %) \n   acc        384     (22.222 %) \n   good        69     ( 3.993 %) \n   v-good      65     ( 3.762 %) \n\n'

In [281]:
# Load the car data set. Column names are passed explicitly through `names`:
# six feature columns followed by the 'target' label column.
df = pd.read_csv(
    'car_dataset/car.data',
    names=[
        'buying', 'maint', 'doors', 'persons',
        'lug_boot', 'safety', 'target',
    ],
)

df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [282]:
# Print the frame's column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   target    1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [283]:
# Summary statistics per column (count / unique / top / freq for these
# object-dtype columns).
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [284]:
# TODO: exploratory data analysis (EDA)


In [285]:
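# TODO: Feature Engineering - door to passenger ratio? (doors/persons are categorical strings such as '5more' and 'more', so they would need a numeric mapping first)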



In [286]:
# Feature columns: every column of df except the last one ('target').
cat_cols = df.columns[:-1]

# Feature frame restricted to those columns.
x = df.loc[:, cat_cols]

x.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med


In [287]:
# Label column as a Series (still the raw string class names at this point).
y = df['target']

y.head()

0    unacc
1    unacc
2    unacc
3    unacc
4    unacc
Name: target, dtype: object

In [288]:
# Modeling
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Single preprocessing step: one-hot encode every column listed in cat_cols.
preprocess = ColumnTransformer(
    transformers=[('categorical', OneHotEncoder(), cat_cols)],
)

preprocess

ColumnTransformer(transformers=[('categorical', OneHotEncoder(),
                                 Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'], dtype='object'))])

In [289]:
# Encode from the source frame rather than from x / y themselves, so that
# re-running this cell never feeds already-encoded data back into the encoders
# (the ColumnTransformer selects columns by name and needs a DataFrame).
x = preprocess.fit_transform(df[cat_cols])

# Keep the fitted encoder around: label_encoder.classes_ and
# label_encoder.inverse_transform map the integer labels back to class names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['target'])

x

<1728x21 sparse matrix of type '<class 'numpy.float64'>'
	with 10368 stored elements in Compressed Sparse Row format>

In [296]:
# Train test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# class proportions, and fix the seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=88,
)

x_train

<1382x21 sparse matrix of type '<class 'numpy.float64'>'
	with 8292 stored elements in Compressed Sparse Row format>

In [291]:
# SVM
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

# Import SVC directly so the name `svm` only ever refers to the estimator and
# does not also rebind the sklearn.svm module.
svm = SVC()

svm = svm.fit(x_train, y_train)

train_predictions = svm.predict(x_train)
test_predictions = svm.predict(x_test)

# accuracy_score expects (y_true, y_pred) in that order.
train_score = accuracy_score(y_train, train_predictions)
test_score = accuracy_score(y_test, test_predictions)

print('Train Accuracy: {}'.format(train_score))
print('Test Accuracy: {}'.format(test_score))

Train Accuracy: 0.9876989869753979
Test Accuracy: 0.9624277456647399


In [292]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the library's default settings.
knn = KNeighborsClassifier()

knn = knn.fit(x_train, y_train)

train_predictions = knn.predict(x_train)
test_predictions = knn.predict(x_test)

# accuracy_score expects (y_true, y_pred) in that order.
train_score = accuracy_score(y_train, train_predictions)
test_score = accuracy_score(y_test, test_predictions)

print('Train Accuracy: {}'.format(train_score))
print('Test Accuracy: {}'.format(test_score))

Train Accuracy: 0.9392185238784371
Test Accuracy: 0.8959537572254336


In [293]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Fix the seed (same value as the train/test split) so the fitted tree and
# the scores below are reproducible from run to run.
dt = DecisionTreeClassifier(random_state=88)

dt = dt.fit(x_train, y_train)

train_predictions = dt.predict(x_train)
test_predictions = dt.predict(x_test)

# accuracy_score expects (y_true, y_pred) in that order.
train_score = accuracy_score(y_train, train_predictions)
test_score = accuracy_score(y_test, test_predictions)

print('Train Accuracy: {}'.format(train_score))
print('Test Accuracy: {}'.format(test_score))

Train Accuracy: 1.0
Test Accuracy: 0.9739884393063584
