In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import copy
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition, tree

### Importing the DataFrame and assigning headers

In [2]:
df =  pd.read_csv('car.data', sep=",", header = None)
df.columns = ['price', 'maintainence', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.drop('persons', axis =1, inplace = True)
df.head()

Unnamed: 0,price,maintainence,doors,lug_boot,safety,class
0,vhigh,vhigh,2,small,low,unacc
1,vhigh,vhigh,2,small,med,unacc
2,vhigh,vhigh,2,small,high,unacc
3,vhigh,vhigh,2,med,low,unacc
4,vhigh,vhigh,2,med,med,unacc


## Exploratory Data Analysis

### We first check for any null values, and the data distribution within the respective columns

In [3]:
for col in df.columns:
    
    print(df[col].value_counts())

med      432
low      432
high     432
vhigh    432
Name: price, dtype: int64
med      432
low      432
high     432
vhigh    432
Name: maintainence, dtype: int64
5more    432
3        432
4        432
2        432
Name: doors, dtype: int64
big      576
med      576
small    576
Name: lug_boot, dtype: int64
med     576
low     576
high    576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64


In [4]:
df.isnull().values.any()

False

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   price         1728 non-null   object
 1   maintainence  1728 non-null   object
 2   doors         1728 non-null   object
 3   lug_boot      1728 non-null   object
 4   safety        1728 non-null   object
 5   class         1728 non-null   object
dtypes: object(6)
memory usage: 81.1+ KB


### Since several columns contain categorical data, we conduct one hot encoding to encode the data. 

In [6]:
def process_data(df):
    # One-hot encode data
    one_hot_main = pd.get_dummies(df['maintainence'], prefix="maintainence")
    df = df.drop('maintainence',axis = 1)
    df = df.join(one_hot_main)

    one_hot_doors = pd.get_dummies(df['doors'], prefix="doors")
    df = df.drop('doors',axis = 1)
    df = df.join(one_hot_doors)
    
    one_hot_lug = pd.get_dummies(df['lug_boot'], prefix="lug_boot")
    df = df.drop('lug_boot',axis = 1)
    df = df.join(one_hot_lug)
    
    one_hot_safety = pd.get_dummies(df['safety'], prefix="safety")
    df = df.drop('safety',axis = 1)
    df = df.join(one_hot_safety)
    
    one_hot_class = pd.get_dummies(df['class'], prefix="class")
    df = df.drop('class',axis = 1)
    df = df.join(one_hot_class)
    
    
    return df

### We also create an additional dataset that is label encoded, to see if better results may be obtained if the variables maintain an ordinal relationship. For label encoding, we normalise the data after encoding it to ensure that there will not be biases in the dataset.

In [7]:
def label_encode(df_label):
    df_label['maintainence'] = df_label['maintainence'].replace(['vhigh','high', 'med', 'low'], [4,3,2,1])
    df_label['doors'] = df_label['doors'].replace(['2','4', '5more'], [1,2,3])
    df_label['lug_boot'] = df_label['lug_boot'].replace(['small','med', 'big'], [1,2,3])
    df_label['safety'] = df_label['safety'].replace(['low','med', 'high'], [1,2,3])
    df_label['class'] = df_label['class'].replace(['unacc','acc', 'good', 'vgood'], [1,2,3,4])
    X = df_label.drop('price', axis = 1)
    y = df_label['price']
    x = X.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    X = pd.DataFrame(x_scaled)

    return X, y

In [8]:
df_one_hot = process_data(df)

In [9]:
df_one_hot

Unnamed: 0,price,maintainence_high,maintainence_low,maintainence_med,maintainence_vhigh,doors_2,doors_3,doors_4,doors_5more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med,class_acc,class_good,class_unacc,class_vgood
0,vhigh,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0
1,vhigh,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0
2,vhigh,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,0
3,vhigh,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0
4,vhigh,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,low,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0
1724,low,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1
1725,low,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0
1726,low,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0


### Create train test split for one hot encoded data

In [10]:
X = df_one_hot.drop('price', axis = 1)
y = df_one_hot['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)


## Model Building

### To obtain the best balance of speed and accuracy, I will be experimenting with 2 main models, Decision trees and Random Forest

### We first get the first results of our first iteration of the model

In [11]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=0)
tree.fit(X_train, y_train)
print(accuracy_score(y_test, tree.predict(X_test)))
print(classification_report(y_test, tree.predict(X_test)))

0.302504816955684
              precision    recall  f1-score   support

        high       0.27      1.00      0.43       129
         low       0.58      0.23      0.33       124
         med       0.00      0.00      0.00       135
       vhigh       0.00      0.00      0.00       131

    accuracy                           0.30       519
   macro avg       0.21      0.31      0.19       519
weighted avg       0.21      0.30      0.18       519



  _warn_prf(average, modifier, msg_start, len(result))


### We can see that the results seem lacklustre, so we will conduct some hyperparameter tuning using GridSearch

In [12]:
std_slc = StandardScaler()
pca = decomposition.PCA()
tree = DecisionTreeClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc), ('pca', pca), ('dec_tree', tree)])
n_components = list(range(1,X_train.shape[1]+1,1))

criterion = ['gini', 'entropy']
max_depth = [2,4,6,8,10,12]
parameters = dict(pca__n_components=n_components, dec_tree__criterion=criterion, dec_tree__max_depth=max_depth)
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X_train, y_train)
    
print('Best Criterion:', clf_GS.best_estimator_.get_params()['dec_tree__criterion'])
print('Best max_depth:', clf_GS.best_estimator_.get_params()['dec_tree__max_depth'])
print('Best Number Of Components:', clf_GS.best_estimator_.get_params()['pca__n_components'])
print(clf_GS.best_estimator_.get_params()['dec_tree'])



Best Criterion: entropy
Best max_depth: 2
Best Number Of Components: 13
DecisionTreeClassifier(criterion='entropy', max_depth=2)


In [13]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print(classification_report(y_test, tree.predict(X_test)))


              precision    recall  f1-score   support

        high       0.27      0.53      0.36       129
         low       0.54      0.16      0.25       124
         med       0.39      0.08      0.13       135
       vhigh       0.34      0.51      0.41       131

    accuracy                           0.32       519
   macro avg       0.39      0.32      0.29       519
weighted avg       0.38      0.32      0.29       519



### We now attempt to do the same analysis for Random Forest

In [167]:
X, y = label_encode(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)


In [168]:
randf = RandomForestClassifier(n_estimators=200)
randf.fit(X_train, y_train)
print(classification_report(y_test, randf.predict(X_test)))

              precision    recall  f1-score   support

        high       0.07      0.08      0.07       118
         low       0.14      0.11      0.12       138
         med       0.07      0.07      0.07       130
       vhigh       0.10      0.11      0.10       133

    accuracy                           0.09       519
   macro avg       0.10      0.09      0.09       519
weighted avg       0.10      0.09      0.09       519



In [174]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 373 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.6min finished


{'bootstrap': True,
 'max_depth': 110,
 'max_features': 2,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}

In [177]:
rf = RandomForestClassifier( bootstrap = True, max_depth = 110, max_features = 2, min_samples_leaf = 5, 
                            min_samples_split = 12, n_estimators = 100)
rf.fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

        high       0.09      0.13      0.10       118
         low       0.27      0.20      0.23       138
         med       0.12      0.10      0.11       130
       vhigh       0.17      0.17      0.17       133

    accuracy                           0.15       519
   macro avg       0.16      0.15      0.15       519
weighted avg       0.16      0.15      0.16       519



### Since the random forest model has a worsee performance, we will go ahead with the decision tree model. For this case, since the size of the data is not large enough, we will not be attempting to use deep learning models to try and attain a better result. Additionaly, we want to follow Occam's razor and not make our model too complicated to the point it overfits the data. 

In [33]:
res = pd.DataFrame(['low', 'high', '4', 'big', 'high', 'good']).T 
res.columns = ['price', 'maintainence', 'doors', 'lug_boot', 'safety', 'class']
res = pd.concat([df, res],  ignore_index=True)
res

Unnamed: 0,price,maintainence,doors,lug_boot,safety,class
0,vhigh,vhigh,2,small,low,unacc
1,vhigh,vhigh,2,small,med,unacc
2,vhigh,vhigh,2,small,high,unacc
3,vhigh,vhigh,2,med,low,unacc
4,vhigh,vhigh,2,med,med,unacc
...,...,...,...,...,...,...
1724,low,low,5more,med,high,vgood
1725,low,low,5more,big,low,unacc
1726,low,low,5more,big,med,good
1727,low,low,5more,big,high,vgood


In [35]:
print(res.shape)
print(df.shape)


(1729, 6)
(1728, 6)


In [38]:
process_data(res)

Unnamed: 0,price,maintainence_high,maintainence_low,maintainence_med,maintainence_vhigh,doors_2,doors_3,doors_4,doors_5more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med,class_acc,class_good,class_unacc,class_vgood
0,vhigh,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0
1,vhigh,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0
2,vhigh,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,0
3,vhigh,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0
4,vhigh,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,low,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1
1725,low,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0
1726,low,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0
1727,low,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1


### We manually add the required prediction to the last row of the required dataframe, and obtain the model results once more

In [48]:
q = process_data(res)
q = q.drop('price', axis =1)
q

Unnamed: 0,maintainence_high,maintainence_low,maintainence_med,maintainence_vhigh,doors_2,doors_3,doors_4,doors_5more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med,class_acc,class_good,class_unacc,class_vgood
0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0
1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0
2,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,0
3,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0
4,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1
1725,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0
1726,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0
1727,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1


In [50]:

tree.predict(q)

array(['high', 'vhigh', 'vhigh', ..., 'low', 'low', 'med'], dtype=object)

## Final prediction = medium price