In [1]:
import pandas as pd

col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

df = pd.read_csv('car_dataset/car.data', names=col_names)

df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


# EDA

Let's look into our data!

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [3]:
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


# EDA Conclusion

No missing or Nan values, great! 

## Insights:
1. All columns are categorical
2. Our target variable (buying) has 4 classes, which means we are tackling a multiclass classification problem

In [4]:
# Separate input and target columns
cat_cols = df.columns[1:len(df.columns)]

x = df[cat_cols]
y = df['buying']

x.head()

Unnamed: 0,maint,doors,persons,lug_boot,safety,class
0,vhigh,2,2,small,low,unacc
1,vhigh,2,2,small,med,unacc
2,vhigh,2,2,small,high,unacc
3,vhigh,2,2,med,low,unacc
4,vhigh,2,2,med,med,unacc


In [5]:
# Preprocess data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

preprocess = ColumnTransformer(
    transformers = [
        ('categorical', OrdinalEncoder(), cat_cols)
    ]
)

encoder = LabelEncoder()

x = preprocess.fit_transform(x)
y = encoder.fit_transform(y)

x.shape

(1728, 6)

In [6]:
# Perform train-test split on dataset, stratify by target label to ensure same distribution in train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=88)

x_train

array([[2., 1., 2., 1., 2., 0.],
       [2., 0., 2., 1., 0., 0.],
       [0., 2., 1., 2., 2., 0.],
       ...,
       [0., 0., 0., 0., 0., 2.],
       [1., 1., 1., 1., 0., 0.],
       [1., 3., 1., 0., 0., 0.]])

In [7]:
# Function to run evaluation on all models
from sklearn.metrics import classification_report

def evaluate(train_predictions, y_train, test_predictions, y_test):
    train_report = classification_report(train_predictions, y_train)
    test_report = classification_report(test_predictions, y_test)
    
    print('Train:')
    print(train_report)
    
    print('\nTest')
    print(test_report)


In [8]:
# Use gridsearch to find best hyperparameters
from sklearn.model_selection import GridSearchCV

def perform_gridsearch(estimator, params, x, y):
    search = GridSearchCV(estimator=estimator, param_grid=params, cv=5, scoring='accuracy')
    best = search.fit(x, y)
    print('Best Model Params:')
    print(best.best_estimator_)
    print()
    
    return best.best_estimator_

In [9]:
# SVM
from sklearn import svm

svm = svm.SVC()

params = {
    'C' : [0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
}

svm = perform_gridsearch(estimator=svm, params=params, x=x_train, y=y_train)

train_predictions = svm.predict(x_train)
test_predictions = svm.predict(x_test)

evaluate(train_predictions, y_train, test_predictions, y_test)

Best Model Params:
SVC(C=0.3)

Train:
              precision    recall  f1-score   support

           0       0.31      0.35      0.33       304
           1       0.19      0.43      0.26       150
           2       0.20      0.44      0.27       158
           3       0.67      0.30      0.41       770

    accuracy                           0.34      1382
   macro avg       0.34      0.38      0.32      1382
weighted avg       0.48      0.34      0.36      1382


Test
              precision    recall  f1-score   support

           0       0.20      0.24      0.22        70
           1       0.09      0.20      0.13        40
           2       0.13      0.31      0.18        36
           3       0.60      0.26      0.36       200

    accuracy                           0.25       346
   macro avg       0.25      0.25      0.22       346
weighted avg       0.41      0.25      0.29       346



In [10]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

params = {
    'n_neighbors' : [x for x in range(4, 20, 2)],
    'weights' : ['uniform', 'distance']
}

knn = KNeighborsClassifier()

knn = perform_gridsearch(estimator=knn, params=params, x=x_train, y=y_train)

train_predictions = knn.predict(x_train)
test_predictions = knn.predict(x_test)

evaluate(train_predictions, y_train, test_predictions, y_test)

Best Model Params:
KNeighborsClassifier(n_neighbors=18)

Train:
              precision    recall  f1-score   support

           0       0.55      0.36      0.44       529
           1       0.38      0.43      0.41       307
           2       0.29      0.39      0.33       257
           3       0.33      0.39      0.36       289

    accuracy                           0.39      1382
   macro avg       0.39      0.39      0.38      1382
weighted avg       0.42      0.39      0.39      1382


Test
              precision    recall  f1-score   support

           0       0.30      0.18      0.23       142
           1       0.20      0.22      0.21        76
           2       0.08      0.12      0.10        58
           3       0.18      0.23      0.20        70

    accuracy                           0.19       346
   macro avg       0.19      0.19      0.18       346
weighted avg       0.22      0.19      0.20       346



In [11]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=88)

params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [x for x in range(2, 20, 2)],
    'min_samples_leaf' : [x for x in range(2, 20, 2)]
}

dt = perform_gridsearch(estimator=dt, params=params, x=x_train, y=y_train)

train_predictions = dt.predict(x_train)
test_predictions = dt.predict(x_test)

evaluate(train_predictions, y_train, test_predictions, y_test)

Best Model Params:
DecisionTreeClassifier(max_depth=4, min_samples_leaf=18, random_state=88)

Train:
              precision    recall  f1-score   support

           0       0.27      0.36      0.31       253
           1       0.23      0.46      0.30       171
           2       0.42      0.32      0.36       454
           3       0.49      0.34      0.40       504

    accuracy                           0.35      1382
   macro avg       0.35      0.37      0.34      1382
weighted avg       0.40      0.35      0.36      1382


Test
              precision    recall  f1-score   support

           0       0.18      0.27      0.22        59
           1       0.08      0.20      0.12        35
           2       0.40      0.28      0.33       122
           3       0.53      0.35      0.42       130

    accuracy                           0.30       346
   macro avg       0.30      0.28      0.27       346
weighted avg       0.38      0.30      0.32       346



In [12]:
# Random Forest - warning, this cell takes really long to run =D
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=88)

params = {
    'n_estimators' : [x for x in range(100, 200, 10)],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [x for x in range(2, 10, 2)],
    'min_samples_leaf' : [x for x in range(2, 20, 2)]
}

rf = perform_gridsearch(estimator=rf, params=params, x=x_train, y=y_train)

train_predictions = rf.predict(x_train)
test_predictions = rf.predict(x_test)

evaluate(train_predictions, y_train, test_predictions, y_test)

Best Model Params:
RandomForestClassifier(max_depth=2, min_samples_leaf=12, n_estimators=120,
                       random_state=88)

Train:
              precision    recall  f1-score   support

           0       0.28      0.35      0.31       273
           1       0.17      0.54      0.26       108
           2       0.19      0.38      0.25       169
           3       0.72      0.30      0.42       832

    accuracy                           0.34      1382
   macro avg       0.34      0.39      0.31      1382
weighted avg       0.52      0.34      0.37      1382


Test
              precision    recall  f1-score   support

           0       0.13      0.17      0.14        66
           1       0.15      0.39      0.22        33
           2       0.10      0.24      0.15        38
           3       0.61      0.25      0.36       209

    accuracy                           0.25       346
   macro avg       0.25      0.26      0.22       346
weighted avg       0.42      0.25    

# Modeling Conclusion

Most models achieved between 30-40% on train accuracy and between 19-30% on test accuracy

I would pick to deploy the Decision Tree model as it achieved the best test results, showing signs that it is not overfitted on the train set

# Prediction on custom input values
1. Maintenance = High
2. Number of doors = 4
3. Lug Boot Size = Big
4. Safety = High
5. Class Value = Good

In [13]:
# Create custom input, noted that persons column value is not given, we fill Nan value with the mode=2
x = pd.DataFrame(
    [['high', '4', '2', 'big', 'high', 'good']], 
    columns=col_names[1:]
)

x

Unnamed: 0,maint,doors,persons,lug_boot,safety,class
0,high,4,2,big,high,good


In [14]:
# Let's predict using our best model!
x = preprocess.transform(x)
result = dt.predict(x)
result = encoder.inverse_transform(result)

result

array(['med'], dtype=object)

# Prediction results

The buy price is med based on the prediction from the decision tree model!