In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot',
             'safety', 'label']

In [3]:
car_dataset = pd.read_csv('../data/car.data.csv', names=col_names, 
                         header=None, )

In [4]:
car_dataset.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
quality_dict = {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}
boot_dict = {'small': 1, 'med': 2, 'big': 3}
label_dict = {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4}

In [6]:
car_dataset['buying'] = car_dataset['buying'].map(lambda x: quality_dict[x])

In [7]:
car_dataset['maint'] = car_dataset['maint'].map(lambda x: quality_dict[x])

In [8]:
car_dataset['safety'] = car_dataset['safety'].map(lambda x: quality_dict[x])

In [9]:
car_dataset['lug_boot'] = car_dataset['lug_boot'].map(lambda x: boot_dict[x])

In [10]:
car_dataset['label'] = car_dataset['label'].map(lambda x: label_dict[x])

In [11]:
car_dataset['doors'] = car_dataset['doors'].map(lambda x: x if 'more' not in str(x) else 5)

In [12]:
car_dataset['persons'] = car_dataset['persons'].map(lambda x: x if 'more' not in str(x) else 6)

In [13]:
car_dataset.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,4,4,2,2,1,1,1
1,4,4,2,2,1,2,1
2,4,4,2,2,1,3,1
3,4,4,2,2,2,1,1
4,4,4,2,2,2,2,1


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    car_dataset[['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']], 
    car_dataset['label'])

In [15]:
car_dataset.to_csv('../data/car_cleaned.csv', index=False)

In [16]:
X_train

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1546,1,2,3,2,3,2
1490,1,3,5,2,2,3
967,2,4,5,6,2,2
495,3,4,4,4,1,1
1566,1,2,4,2,1,1
58,4,4,4,2,2,2
1266,2,1,4,6,3,1
1421,1,3,2,4,3,3
1568,1,2,4,2,1,3
172,4,3,4,4,1,2


In [17]:
classifier_tree = DecisionTreeClassifier()

In [18]:
classifier_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
preds = classifier_tree.predict(X_test)

In [20]:
sum(preds == y_test) / len(y_test)

0.9814814814814815

In [21]:
from sklearn.externals import joblib

In [22]:
joblib.dump(classifier_tree, '../data/car_tree.pkl')

['../data/car_tree.pkl']