In [1]:
#Importing the basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.offline as py
from plotly import tools
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Reading the dataset 
cars = pd.read_csv('car_data.csv')
cars.shape


(1727, 7)

In [3]:
#Since our dataset doesn't contain the name of columns, the column names were assigned 
cars.columns = ['Buying', 'Maint', 'Doors','Persons','LugBoot','Safety','Evaluation']

In [4]:
#Taking an overview of data
cars.sample(10)

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety,Evaluation
973,med,high,2,2,small,high,unacc
1016,med,high,3,more,small,low,unacc
583,high,high,3,4,big,high,acc
1232,med,low,3,more,small,low,unacc
545,high,high,2,2,big,low,unacc
1719,low,low,5more,more,small,med,acc
342,vhigh,low,2,more,small,med,unacc
679,high,med,3,2,med,high,unacc
1721,low,low,5more,more,med,low,unacc
1168,med,med,5more,2,big,high,unacc


In [5]:
#Let's check if there are any missing values in our dataset 
cars.isnull().sum()

Buying        0
Maint         0
Doors         0
Persons       0
LugBoot       0
Safety        0
Evaluation    0
dtype: int64

In [6]:
#We see that there are no missing values in our dataset 
#Let's take a more analytical look at our dataset 
cars.describe()

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety,Evaluation
count,1727,1727,1727,1727,1727,1727,1727
unique,4,4,4,3,3,3,4
top,med,med,3,more,med,med,unacc
freq,432,432,432,576,576,576,1209


In [7]:
#We realize that our data has categorical values 
cars.columns

Index(['Buying', 'Maint', 'Doors', 'Persons', 'LugBoot', 'Safety',
       'Evaluation'],
      dtype='object')

In [8]:
#Lets find out the number of cars in each evaluation category
cars['Evaluation'].value_counts().sort_index()


acc       384
good       69
unacc    1209
vgood      65
Name: Evaluation, dtype: int64

In [9]:
fig = {
  "data": [
    {
      "values": [1210,384,69,65],
      "labels": [
        "Unacceptable",
        "Acceptable",
        "Good",
        "Very Good"
      ],
      "domain": {"column": 0},
      "name": "Car Evaluation",
      "hoverinfo":"label+percent+name",
      "hole": .6,
      "type": "pie"
    }],
  "layout": {
        "title":"Distribution of Evaluated Cars",
        "grid": {"rows": 1, "columns": 1},
        "annotations": [
            {
                "font": {
                    "size": 36
                },
                "showarrow": False,
                "text": "",
                "x": 0.5,
                "y": 0.5
            }
        ]
    }
}
py.iplot(fig, filename='cars_donut')

In [10]:
#cars.Evaluation.replace(('unacc', 'acc', 'good', 'vgood'), (0, 1, 2, 3), inplace = True)
#cars.Buying.replace(('vhigh', 'high', 'med', 'low'), (3, 2, 1, 0), inplace = True)
#cars.Maint.replace(('vhigh', 'high', 'med', 'low'), (3, 2, 1, 0), inplace = True)
#cars.Doors.replace(('5more'),(5),inplace=True)
#cars.Persons.replace(('more'),(5),inplace=True)
#cars.LugBoot.replace(('small','med','big'),(0,1,2),inplace=True)
#cars.Safety.replace(('low','med','high'),(0,1,2),inplace=True)

In [11]:
cars.Doors.replace(('5more'),('5'),inplace=True)
cars.Persons.replace(('more'),('5'),inplace=True)


In [12]:
features = cars.iloc[:,:-1]
features[:5]
a=[]
for i in features:
    a.append(features[i].value_counts())

In [13]:
buy = pd.crosstab(cars['Buying'], cars['Evaluation'])
mc = pd.crosstab(cars['Maint'], cars['Evaluation'])
drs = pd.crosstab(cars['Doors'], cars['Evaluation'])
prsn = pd.crosstab(cars['Persons'], cars['Evaluation'])
lb = pd.crosstab(cars['LugBoot'], cars['Evaluation'])
sfty = pd.crosstab(cars['Safety'], cars['Evaluation'])


In [14]:
buy

Evaluation,acc,good,unacc,vgood
Buying,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high,108,0,324,0
low,89,46,258,39
med,115,23,268,26
vhigh,72,0,359,0


In [15]:
data = [
    go.Bar(
        x=a[0].index, # assign x as the dataframe column 'x'
        y=buy['unacc'],
        name='Unacceptable'
    ),
    go.Bar(
        x=a[0].index,
        y=buy['acc'],
        name='Acceptable'
    ),
    go.Bar(
        x=a[0].index,
        y=buy['good'],
        name='Good'
    ),
    go.Bar(
        x=a[0].index,
        y=buy['vgood'],
        name='Very Good'
    )

]

layout = go.Layout(
    barmode='stack',
    title='Selling Price vs Evaluation'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='distri')

In [16]:
data = [
    go.Bar(
        x=a[0].index, # assign x as the dataframe column 'x'
        y=mc['unacc'],
        name='Unacceptable'
    ),
    go.Bar(
        x=a[0].index,
        y=mc['acc'],
        name='Acceptable'
    ),
    go.Bar(
        x=a[0].index,
        y=mc['good'],
        name='Good'
    ),
    go.Bar(
        x=a[0].index,
        y=mc['vgood'],
        name='Very Good'
    )

]

layout = go.Layout(
    barmode='stack',
    title='Maintainance cost vs Evaluation'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cars_donut')

In [17]:
data = [
    go.Bar(
        x=a[2].index, # assign x as the dataframe column 'x'
        y=drs['unacc'],
        name='Unacceptable'
    ),
    go.Bar(
        x=a[2].index,
        y=drs['acc'],
        name='Acceptable'
    ),
    go.Bar(
        x=a[2].index,
        y=drs['good'],
        name='Good'
    ),
    go.Bar(
        x=a[2].index,
        y=drs['vgood'],
        name='Very Good'
    )

]

layout = go.Layout(
    barmode='stack',
    title='Doors vs Evaluation'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cars_donut')

In [18]:
data = [
    go.Bar(
        x=a[3].index, # assign x as the dataframe column 'x'
        y=prsn['unacc'],
        name='Unacceptable'
    ),
    go.Bar(
        x=a[3].index,
        y=prsn['acc'],
        name='Acceptable'
    ),
    go.Bar(
        x=a[3].index,
        y=prsn['good'],
        name='Good'
    ),
    go.Bar(
        x=a[3].index,
        y=prsn['vgood'],
        name='Very Good'
    )

]

layout = go.Layout(
    barmode='stack',
    title='Number of Passengers vs Evaluation'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cars_donut')

In [19]:
data = [
    go.Bar(
        x=a[4].index, # assign x as the dataframe column 'x'
        y=lb['unacc'],
        name='Unacceptable'
    ),
    go.Bar(
        x=a[4].index,
        y=lb['acc'],
        name='Acceptable'
    ),
    go.Bar(
        x=a[4].index,
        y=lb['good'],
        name='Good'
    ),
    go.Bar(
        x=a[4].index,
        y=lb['vgood'],
        name='Very Good'
    )

]

layout = go.Layout(
    barmode='stack',
    title='Luggage Boot vs Evaluation'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cars_donut')

In [20]:
data = [
    go.Bar(
        x=a[5].index, # assign x as the dataframe column 'x'
        y=sfty['unacc'],
        name='Unacceptable'
    ),
    go.Bar(
        x=a[5].index,
        y=sfty['acc'],
        name='Acceptable'
    ),
    go.Bar(
        x=a[5].index,
        y=sfty['good'],
        name='Good'
    ),
    go.Bar(
        x=a[5].index,
        y=sfty['vgood'],
        name='Very Good'
    )

]

layout = go.Layout(
    barmode='stack',
    title='Safety vs Evaluation'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cars_donut')

In [21]:
#We need to encode the categorical data 
#We have two options, either we use label encoder or one hot encoder 
#We use label encoder when our target variable changes with increase or decrease in that feature variable 
#We use One hot encoder when a target variable depends upon the feature variable 

In [93]:
#Dividing the dataframe into x features and y target variable
x = cars.iloc[:, :-1]
y = cars.iloc[:, 6]

In [94]:
x.columns = ['Buying', 'Maint', 'Doors','Persons','LugBoot','Safety']
y.columns=['Evaluation']

In [95]:
x.head()

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety
0,vhigh,vhigh,2,2,small,med
1,vhigh,vhigh,2,2,small,high
2,vhigh,vhigh,2,2,med,low
3,vhigh,vhigh,2,2,med,med
4,vhigh,vhigh,2,2,med,high


In [96]:
#Using pandas dummies function to encode the data into categorical data
x = pd.get_dummies(x, prefix_sep='_', drop_first=True)

In [97]:
x.sample(5)

Unnamed: 0,Buying_low,Buying_med,Buying_vhigh,Maint_low,Maint_med,Maint_vhigh,Doors_3,Doors_4,Doors_5,Persons_4,Persons_5,LugBoot_med,LugBoot_small,Safety_low,Safety_med
889,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
44,0,0,1,0,0,1,1,0,0,0,1,0,1,1,0
1537,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1185,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1
1431,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1


In [98]:
y.describe()

count      1727
unique        4
top       unacc
freq       1209
Name: Evaluation, dtype: object

In [99]:
x=x.values
y=y.values

In [100]:
#And the rest of them to be categorically encoded: ['Buying', 'Maint', 'Doors', 'Persons','Safety','Evaluation']

In [101]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)


In [102]:
"""from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)"""

'from sklearn.preprocessing import StandardScaler\n\nsc = StandardScaler()\n\nx_train = sc.fit_transform(x_train)\nx_test = sc.transform(x_test)'

In [103]:
x_train[:5]

array([[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]], dtype=uint8)

In [104]:
y_train[:5]

array(['unacc', 'unacc', 'unacc', 'unacc', 'unacc'], dtype=object)

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix


In [106]:
#Using ogistic regression
clf = LogisticRegression(random_state = 0)
clf.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [108]:
y_pred = clf.predict(x_test)
f1_LR=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf.score(x_train, y_train))
print("Testing Accuracy: ", clf.score(x_test, y_test))
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test,y_pred)) 

Training Accuracy:  0.8749034749034749
Testing Accuracy:  0.8333333333333334
[[ 74   3  19   0]
 [ 13   3   0   2]
 [ 16   0 282   0]
 [ 19   0   0   1]]
              precision    recall  f1-score   support

         acc       0.61      0.77      0.68        96
        good       0.50      0.17      0.25        18
       unacc       0.94      0.95      0.94       298
       vgood       0.33      0.05      0.09        20

    accuracy                           0.83       432
   macro avg       0.59      0.48      0.49       432
weighted avg       0.82      0.83      0.81       432



In [125]:
#Using KNN classifier
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
clf.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [126]:
y_pred = clf.predict(x_test)
f1_KNN=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf.score(x_train, y_train))
print("Testing Accuracy: ", clf.score(x_test, y_test))
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test,y_pred))

Training Accuracy:  0.9196911196911197
Testing Accuracy:  0.8472222222222222
[[ 66   5  25   0]
 [ 11   1   5   1]
 [  4   0 294   0]
 [  8   4   3   5]]
              precision    recall  f1-score   support

         acc       0.74      0.69      0.71        96
        good       0.10      0.06      0.07        18
       unacc       0.90      0.99      0.94       298
       vgood       0.83      0.25      0.38        20

    accuracy                           0.85       432
   macro avg       0.64      0.49      0.53       432
weighted avg       0.83      0.85      0.83       432



In [127]:
#Using Linear SVC
from sklearn.svm import SVC
clf = SVC(kernel = 'linear', random_state = 0)
clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

In [128]:
y_pred = clf.predict(x_test)
f1_SVC_Linear=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf.score(x_train, y_train))
print("Testing Accuracy: ", clf.score(x_test, y_test))
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test,y_pred))

Training Accuracy:  0.9405405405405406
Testing Accuracy:  0.9282407407407407
[[ 86   4   6   0]
 [  1  11   0   6]
 [ 14   0 284   0]
 [  0   0   0  20]]
              precision    recall  f1-score   support

         acc       0.85      0.90      0.87        96
        good       0.73      0.61      0.67        18
       unacc       0.98      0.95      0.97       298
       vgood       0.77      1.00      0.87        20

    accuracy                           0.93       432
   macro avg       0.83      0.86      0.84       432
weighted avg       0.93      0.93      0.93       432



In [129]:
#Using rbf SVC
from sklearn.svm import SVC
clf = SVC(kernel = 'rbf', random_state = 0)
clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

In [130]:
y_pred = clf.predict(x_test)
f1_SVC_rbf=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf.score(x_train, y_train))
print("Testing Accuracy: ", clf.score(x_test, y_test))
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test,y_pred))

Training Accuracy:  0.8934362934362934
Testing Accuracy:  0.8773148148148148
[[ 91   0   5   0]
 [ 17   0   0   1]
 [ 17   0 281   0]
 [ 13   0   0   7]]
              precision    recall  f1-score   support

         acc       0.66      0.95      0.78        96
        good       0.00      0.00      0.00        18
       unacc       0.98      0.94      0.96       298
       vgood       0.88      0.35      0.50        20

    accuracy                           0.88       432
   macro avg       0.63      0.56      0.56       432
weighted avg       0.86      0.88      0.86       432



In [131]:
#Using NB classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [89]:
#GaussianNB?

In [132]:
y_pred = clf.predict(x_test)
print("Training Accuracy: ",clf.score(x_train, y_train))
print("Testing Accuracy: ", clf.score(x_test, y_test))
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test,y_pred))

Training Accuracy:  0.48494208494208496
Testing Accuracy:  0.5
[[ 37  47   0  12]
 [  0  12   0   6]
 [ 96  36 147  19]
 [  0   0   0  20]]
              precision    recall  f1-score   support

         acc       0.28      0.39      0.32        96
        good       0.13      0.67      0.21        18
       unacc       1.00      0.49      0.66       298
       vgood       0.35      1.00      0.52        20

    accuracy                           0.50       432
   macro avg       0.44      0.64      0.43       432
weighted avg       0.77      0.50      0.56       432



Note that this is WRONG implementation of Naive Bayes classifier. Since the Independence assumption of NB classifier states that the features shoud not be correlated to each other; so when creating the dummy variables, we make family of dependent features and hence we get such a terrible accuracy. So after trying out a couple more algorithms, I've done this one properly :)

In [133]:
#Trying decision tree classifier
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
clf.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [134]:
y_pred = clf.predict(x_test)
f1_DT=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf.score(x_train, y_train))
print("Testing Accuracy: ", clf.score(x_test, y_test))
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test,y_pred))

Training Accuracy:  1.0
Testing Accuracy:  0.8981481481481481
[[ 73   5  18   0]
 [  9   8   0   1]
 [  4   0 294   0]
 [  5   2   0  13]]
              precision    recall  f1-score   support

         acc       0.80      0.76      0.78        96
        good       0.53      0.44      0.48        18
       unacc       0.94      0.99      0.96       298
       vgood       0.93      0.65      0.76        20

    accuracy                           0.90       432
   macro avg       0.80      0.71      0.75       432
weighted avg       0.89      0.90      0.89       432



In [135]:
#Trying Random forest classifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 25, criterion = 'entropy', random_state = 0)
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=25,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [136]:
y_pred = clf.predict(x_test)
f1_RF=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf.score(x_train, y_train))
print("Testing Accuracy: ", clf.score(x_test, y_test))
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test,y_pred))

Training Accuracy:  1.0
Testing Accuracy:  0.875
[[ 71   6  19   0]
 [ 10   4   1   3]
 [  7   0 291   0]
 [  8   0   0  12]]
              precision    recall  f1-score   support

         acc       0.74      0.74      0.74        96
        good       0.40      0.22      0.29        18
       unacc       0.94      0.98      0.96       298
       vgood       0.80      0.60      0.69        20

    accuracy                           0.88       432
   macro avg       0.72      0.63      0.67       432
weighted avg       0.86      0.88      0.87       432



In [137]:
#Now trying the NB classifier again, this time without dummy variables 
x_new = cars.iloc[:,:-1]

In [138]:
from sklearn.preprocessing import LabelEncoder

In [139]:
lae = LabelEncoder()
x_new=x_new.apply(lambda col: lae.fit_transform(col))
x_new.head()

Unnamed: 0,Buying,Maint,Doors,Persons,LugBoot,Safety
0,3,3,0,0,2,2
1,3,3,0,0,2,0
2,3,3,0,0,1,1
3,3,3,0,0,1,2
4,3,3,0,0,1,0


In [140]:
x_new=x_new.values

In [141]:
x_train_new, x_test_new= train_test_split(x_new, test_size = 0.25, random_state = 0)


In [142]:
clf_new = GaussianNB(priors=None)
clf_new.fit(x_train_new, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [143]:
y_train[:10]

array(['unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'acc',
       'unacc', 'unacc'], dtype=object)

In [144]:
y_pred = clf_new.predict(x_test_new)
f1_NB=f1_score(y_test,y_pred, average='macro')
print("Training Accuracy: ",clf_new.score(x_train_new, y_train))
print("Testing Accuracy: ", clf_new.score(x_test_new, y_test))
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test,y_pred))

Training Accuracy:  0.6447876447876448
Testing Accuracy:  0.625
[[  9   0  40  47]
 [  2   0   4  12]
 [  6   0 241  51]
 [  0   0   0  20]]
              precision    recall  f1-score   support

         acc       0.53      0.09      0.16        96
        good       0.00      0.00      0.00        18
       unacc       0.85      0.81      0.83       298
       vgood       0.15      1.00      0.27        20

    accuracy                           0.62       432
   macro avg       0.38      0.48      0.31       432
weighted avg       0.71      0.62      0.62       432



In [149]:
models=['Linear SVC', 'Kernel SVC','Logistic Regression','Decision Tree Classifier','Random Forest Classifier','Naive Bayes Classifier' ]
fig = go.Figure(data=[
    go.Bar(name='f1_score', x=models, y=[f1_SVC_Linear,f1_SVC_rbf,f1_LR,f1_DT,f1_RF,f1_NB])])
fig.show()