# Decision Tree Exercise with Wine Dataset

In [None]:
from pandas import read_csv, DataFrame
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from IPython.display import Image  
from sklearn.tree import export_graphviz
from subprocess import call
import pydotplus
from sklearn.preprocessing import LabelEncoder

## Step 1 - Load data

In [None]:
# Load the wine dataset from 'wine.csv' (resolved relative to the working
# directory) into a DataFrame.
raw_data = read_csv(filepath_or_buffer='wine.csv')
# Show the first five rows as the cell output for a quick sanity check.
raw_data.head(n=5)

In [None]:
# Collect the distinct values of the 'class' column, in order of first
# appearance, and display them as the cell output.
class_labels = raw_data.loc[:, 'class'].unique()
class_labels

In [None]:
# The target lives in the 'class' column; every other column is a feature.
label_columns = ['class']
feature_columns = raw_data.columns.drop('class')

# Echo both column sets so the split between features and label is visible.
print(f'feature columns:{feature_columns!s}\n')
print(f' label columns:{label_columns!s}')

## Step 2 - Split data into train and test sets

In [None]:
# Build the feature matrix: each feature column is passed through a fresh
# LabelEncoder fit (apply() calls fit_transform once per column), which maps
# the column's values to integer codes.
# NOTE(review): the encoding replaces the raw measurements with codes, so the
# split thresholds shown in the tree plots refer to codes, not original units.
#
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
x = raw_data.filter(feature_columns).apply(LabelEncoder().fit_transform).values

# Flatten the single-column label frame to a 1-D array; scikit-learn
# estimators expect a 1-D target and warn when handed an (n, 1) column.
y = raw_data.filter(label_columns).values.ravel()

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.70, random_state=100)

## Step 3 - Train decision tree

In [None]:
# Fit a decision tree with default hyperparameters on the training split.
# The seed is fixed (matching the one used for the train/test split) because
# the tree builder permutes features at each split; without it the fitted
# tree, its accuracy and its feature importances can change between runs.
wine_decision_tree = DecisionTreeClassifier(random_state=100).fit(x_train, y_train)

## Step 4 - Test decision tree

In [None]:
# Predict the class of every held-out sample, then report the fraction of
# predictions that match the true labels as the cell output.
y_pred = wine_decision_tree.predict(x_test)
f'Decision tree accuracy is {accuracy_score(y_test, y_pred)!s}'

## Step 5 - Plot the decision tree

In [None]:
# Export the fitted tree as Graphviz DOT source (out_file=None returns the
# source as a string instead of writing a file).
# feature_names is passed as a plain list: some scikit-learn releases reject
# a pandas Index during parameter validation.
dot_data = export_graphviz(
    wine_decision_tree,
    out_file=None,
    feature_names=list(feature_columns),
    class_names=['type-1', 'type-2', 'type-3'],  # display names for classes [1, 2, 3]
    filled=True,
    rounded=True,
    special_characters=False,
)

# Render the DOT source to PNG (requires the Graphviz binaries to be
# installed) and show it inline as the cell output.
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

## Attribute importance

In [None]:
# Pair each feature name with the importance score the fitted tree assigned
# to it, then display the table with the most important features first.
attr_importance = DataFrame({
    'feature': feature_columns,
    'importance': wine_decision_tree.feature_importances_,
})
attr_importance.sort_values(by='importance', ascending=False)

## Simplified decision tree

In [None]:
# Fit a deliberately shallow tree: at most two levels of splits, and a node
# must hold at least five samples before it may be split further.
# The seed is fixed so the fitted tree and the accuracy reported below are
# reproducible between runs.
simplified_decision_tree = DecisionTreeClassifier(
    max_depth=2,
    min_samples_split=5,
    random_state=100,
).fit(x_train, y_train)

# Score the simplified tree on the same held-out split and show the result
# as the cell output.
y_pred_2 = simplified_decision_tree.predict(x_test)
'Simplified decision tree accuracy is ' + str(accuracy_score(y_test, y_pred_2))

In [None]:
# Export the simplified tree as Graphviz DOT source (out_file=None returns
# the source as a string instead of writing a file).
# feature_names is passed as a plain list: some scikit-learn releases reject
# a pandas Index during parameter validation.
dot_data = export_graphviz(
    simplified_decision_tree,
    out_file=None,
    feature_names=list(feature_columns),
    class_names=['type-1', 'type-2', 'type-3'],  # display names for classes [1, 2, 3]
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render the DOT source to PNG (requires the Graphviz binaries to be
# installed) and show it inline as the cell output.
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())