In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Attribute values as listed in the dataset description:
#   buying:   vhigh, high, med, low
#   maint:    vhigh, high, med, low
#   doors:    2, 3, 4, 5more
#   persons:  2, 4, more
#   lug_boot: small, med, big
#   safety:   low, med, high
#
# The dataset file (originally car.data) was renamed and is read here as
# 'car.data.txt'. Column names are supplied explicitly through `names=`.
#
# NOTE(review): the names passed below appear to be shifted by one position
# relative to the attribute list above. In the outputs further down, the column
# labelled 'doors' holds 2/4/more (the persons values), 'maint' holds
# 2/3/4/5more (the doors values) and 'safety' holds the unacc-style labels.
# Every later cell addresses columns by these labels, so any renaming has to be
# applied consistently across the whole notebook -- confirm the intended order
# against the dataset documentation before changing it.
data=pd.read_csv('car.data.txt',names=['class','buying','maint','doors','persons','lug_boot','safety'])

In [22]:
# Summarise row count, per-column non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
class       1728 non-null object
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [23]:
# Preview the first five rows to check which values landed under which label.
data.head()

Unnamed: 0,class,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [24]:
# List the distinct raw (string) values in 'doors' before recoding them.
data['doors'].unique()

array(['2', '4', 'more'], dtype=object)

In [25]:
# Recode the string categories in 'doors' to integers in a single pass.
# `replace` (rather than `map`) leaves already-recoded values untouched, so the
# cell can be re-run safely. The explicit cast avoids relying on pandas'
# implicit object -> int downcast after `replace`, which newer pandas
# versions deprecate.
data['doors'] = data['doors'].replace({'more': 5, '2': 2, '4': 4}).astype('int64')

In [26]:
# Confirm that 'doors' now contains only integer codes.
data['doors'].unique()

array([2, 4, 5])

In [27]:
# List the distinct raw (string) values in 'maint' before recoding them.
data['maint'].unique()

array(['2', '3', '4', '5more'], dtype=object)

In [28]:
# Recode the string categories in 'maint' to integers in a single pass.
# `replace` (rather than `map`) leaves already-recoded values untouched, so the
# cell can be re-run safely. The explicit cast avoids relying on pandas'
# implicit object -> int downcast after `replace`, which newer pandas
# versions deprecate.
data['maint'] = data['maint'].replace({'5more': 5, '4': 4, '3': 3, '2': 2}).astype('int64')

In [29]:
# Standardise the two integer-coded columns (MinMaxScaler would be an
# alternative scaler).
# NOTE(review): the scaler is fitted on the full frame here, before the
# train/test split performed in a later cell.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(data[['doors', 'maint']])
numeric_variables = scaler.transform(data[['doors', 'maint']])

In [30]:
# One-hot encode the string-valued columns 'persons', 'lug_boot', 'buying'
# and 'class' with get_dummies.
categorical_variables = pd.get_dummies(
    data.loc[:, ['persons', 'lug_boot', 'buying', 'class']]
)

In [31]:
# Preview only the first rows instead of rendering the entire one-hot frame.
categorical_variables.head(10)

Unnamed: 0,persons_big,persons_med,persons_small,lug_boot_high,lug_boot_low,lug_boot_med,buying_high,buying_low,buying_med,buying_vhigh,class_high,class_low,class_med,class_vhigh
0,0,0,1,0,1,0,0,0,0,1,0,0,0,1
1,0,0,1,0,0,1,0,0,0,1,0,0,0,1
2,0,0,1,1,0,0,0,0,0,1,0,0,0,1
3,0,1,0,0,1,0,0,0,0,1,0,0,0,1
4,0,1,0,0,0,1,0,0,0,1,0,0,0,1
5,0,1,0,1,0,0,0,0,0,1,0,0,0,1
6,1,0,0,0,1,0,0,0,0,1,0,0,0,1
7,1,0,0,0,0,1,0,0,0,1,0,0,0,1
8,1,0,0,1,0,0,0,0,0,1,0,0,0,1
9,0,0,1,0,1,0,0,0,0,1,0,0,0,1


In [32]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels in 'safety' into integer codes, overwriting the column.
# `le` keeps the fitted label -> code mapping.
le = LabelEncoder()
le.fit(data['safety'])
data['safety'] = le.transform(data['safety'])

# Show which integer codes are present after encoding.
data['safety'].unique()

array([2, 0, 3, 1])

In [33]:
from sklearn.model_selection import train_test_split

# Feature matrix: the scaled numeric columns followed by the one-hot columns.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and then
# removed in pandas 1.0; `.values` returns the same ndarray on every release.
X = np.concatenate((numeric_variables, categorical_variables.values), axis=1)
# Target: the integer-encoded 'safety' column as a plain Python list.
y = data['safety'].tolist()

# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [34]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with a fixed seed so repeated runs give the same model.
model_ab = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
)
# Kept as the last expression so the fitted estimator's repr is displayed.
model_ab.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

In [35]:
# Predict labels for the held-out rows with the fitted gradient-boosting model.
y_pred_dt = model_ab.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

#average = None argument present for multiclass classification
def performance_metrics(y_true,y_pred):
    """Score predictions with overall accuracy and per-class recall, precision and F1.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to the sklearn metric functions.
    y_pred
        Predicted labels, forwarded unchanged to the sklearn metric functions.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, f1)``. Accuracy is a single score; the
        other three are computed with ``average=None``, i.e. one value per class
        rather than a single averaged number.
    """
    overall_accuracy = accuracy_score(y_true, y_pred)
    # Same call shape for the three per-class metrics, evaluated in the order
    # recall -> precision -> f1 to match the returned tuple.
    per_class_scores = tuple(
        scorer(y_true, y_pred, average=None)
        for scorer in (recall_score, precision_score, f1_score)
    )
    return (overall_accuracy,) + per_class_scores

In [37]:
# Evaluate the test-set predictions; unpacks (accuracy, recall, precision, f1).
acc,rec,pre,f1 = performance_metrics(y_test,y_pred_dt)

In [38]:
# Pair each report template with its value and print them in a fixed order.
report_lines = [
    ("Accuracy score :{}", acc),
    ("Recallscore :{}", rec),
    ("Precision score :{}", pre),
    ("F1 score :{}", f1),
]
for template, value in report_lines:
    print(template.format(value))

Accuracy score :0.9737302977232924
Recallscore :[0.91472868 0.95       1.         0.88      ]
Precision score :[0.99159664 0.65517241 1.         0.84615385]
F1 score :[0.9516129 0.7755102 1.        0.8627451]
