### Import Packages

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

### Data Preprocessing

In [2]:
colnames=['buying', 'maint', 'doors', 'persons', 'lugboot', 'safety', 'class']
df = pd.read_csv('car.data', names=colnames, header= None)
df = df.drop('persons', axis=1) # dropped 'person' column as it is not used as input
df

Unnamed: 0,buying,maint,doors,lugboot,safety,class
0,vhigh,vhigh,2,small,low,unacc
1,vhigh,vhigh,2,small,med,unacc
2,vhigh,vhigh,2,small,high,unacc
3,vhigh,vhigh,2,med,low,unacc
4,vhigh,vhigh,2,med,med,unacc
...,...,...,...,...,...,...
1723,low,low,5more,med,med,good
1724,low,low,5more,med,high,vgood
1725,low,low,5more,big,low,unacc
1726,low,low,5more,big,med,good


In [3]:
df.nunique() # no missing attributes, number of categories correspond with given information

buying     4
maint      4
doors      4
lugboot    3
safety     3
class      4
dtype: int64

In [4]:
# ordinal encoding for maint, safety and class as there is ranking involved
maint_ordinal = preprocessing.OrdinalEncoder(categories=[['vhigh', 'high', 'med', 'low']]) # low maintenance is better
df['maint'] = maint_ordinal.fit_transform(df[['maint']])
safety_ordinal = preprocessing.OrdinalEncoder(categories=[['low', 'med', 'high']]) # high safety is better
df['safety'] = safety_ordinal.fit_transform(df[['safety']])
class_ordinal = preprocessing.OrdinalEncoder(categories=[['unacc', 'acc', 'good', 'vgood']]) # vgood class value is better
df['class'] = class_ordinal.fit_transform(df[['class']])

In [5]:
# more/less doors and greater/smaller boot size does not mean that it is better (based on personal preference)
# hence we do one-hot encoding on it instead
processed_data = pd.get_dummies(df, columns = ['doors'])
processed_data = pd.get_dummies(processed_data, columns = ['lugboot'])
print(processed_data)

     buying  maint  safety  class  doors_2  doors_3  doors_4  doors_5more  \
0     vhigh    0.0     0.0    0.0        1        0        0            0   
1     vhigh    0.0     1.0    0.0        1        0        0            0   
2     vhigh    0.0     2.0    0.0        1        0        0            0   
3     vhigh    0.0     0.0    0.0        1        0        0            0   
4     vhigh    0.0     1.0    0.0        1        0        0            0   
...     ...    ...     ...    ...      ...      ...      ...          ...   
1723    low    3.0     1.0    2.0        0        0        0            1   
1724    low    3.0     2.0    3.0        0        0        0            1   
1725    low    3.0     0.0    0.0        0        0        0            1   
1726    low    3.0     1.0    2.0        0        0        0            1   
1727    low    3.0     2.0    3.0        0        0        0            1   

      lugboot_big  lugboot_med  lugboot_small  
0               0          

In [7]:
# splitting of data into train, validation set
# test set not needed as we are going to predict using the given input
x_data = processed_data.drop(columns=['buying'])
y_data = processed_data['buying']
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

In [8]:
df['buying'].value_counts(ascending=True)
#since target variable is not imbalance, no further action is required

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64

### Testing different models before choosing the model with the highest accuracy

In [9]:
# KNN classifier
knn = KNeighborsClassifier().fit(x_train, y_train)
knn_predictions = knn.predict(x_val) 
print(metrics.classification_report(y_val,knn_predictions))

              precision    recall  f1-score   support

        high       0.21      0.26      0.23        46
         low       0.32      0.40      0.35        43
         med       0.10      0.07      0.08        43
       vhigh       0.12      0.10      0.11        41

    accuracy                           0.21       173
   macro avg       0.19      0.21      0.19       173
weighted avg       0.19      0.21      0.20       173



In [10]:
# SVC
svm_model_linear = SVC().fit(x_train, y_train)
svm_predictions = svm_model_linear.predict(x_val)
print(metrics.classification_report(y_val,svm_predictions))

              precision    recall  f1-score   support

        high       0.12      0.07      0.08        46
         low       0.45      0.30      0.36        43
         med       0.21      0.14      0.17        43
       vhigh       0.25      0.56      0.35        41

    accuracy                           0.26       173
   macro avg       0.26      0.27      0.24       173
weighted avg       0.26      0.26      0.24       173



In [11]:
# random forest
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42).fit(x_train, y_train)
rf_predictions = svm_model_linear.predict(x_val)
print(metrics.classification_report(y_val,rf_predictions))


              precision    recall  f1-score   support

        high       0.12      0.07      0.08        46
         low       0.45      0.30      0.36        43
         med       0.21      0.14      0.17        43
       vhigh       0.25      0.56      0.35        41

    accuracy                           0.26       173
   macro avg       0.26      0.27      0.24       173
weighted avg       0.26      0.26      0.24       173



### Prediction

SVC and random forest has the highest accuracy. We will go ahead and use SVC to predict the given input.

In [13]:
# maintenance = high --> 1
# safety = high --> 2
# class = good --> 2
# doors = 4 --> doors_2=0, doors_3=0, doors_4=1, doors_5more=0
# lugboot = big --> lugboot_big=1, lugboot_med=0, lugboot_small=0
svm_model_linear.predict([[1, 2, 2, 0, 0, 1, 0, 1, 0, 0]]) 
#predicted buying price: low

array(['low'], dtype=object)

##### predicted buying price for given input: low