In [74]:
import numpy as np   
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import export_graphviz 
import time
import pandas as pd

In [75]:
def read_csv(f):
    """Load a semicolon-delimited CSV into a pandas DataFrame.

    Parameters
    ----------
    f : str, path or file-like object
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with ``;`` used as the field separator.
    """
    return pd.read_csv(f, sep=";")

In [76]:
def cross_validation_split(df, n_splits=4, test_size=.25, random_state=None):
    """Label-encode the feature columns of ``df`` and prepare shuffled splits.

    Every column except the last is treated as a feature; the last column is
    the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the output to predict.
    n_splits : int, default 4
        Number of shuffled train/test splits to generate.
    test_size : float, default 0.25
        Fraction of the rows placed in the test part of each split.
    random_state : int or None, default None
        Seed forwarded to ``ShuffleSplit``. ``None`` keeps the previous
        (unseeded, non-reproducible) behaviour; pass an int to get the same
        splits on every run.

    Returns
    -------
    X : numpy.ndarray
        Float array with the same shape as the feature part of ``df``; each
        column holds the integer codes produced by ``LabelEncoder``.
    Y : numpy.ndarray
        The last column of ``df.values``, unchanged.
    rs_list : generator
        Yields ``(train_index, test_index)`` pairs. It can be consumed only
        once; wrap it in ``list()`` to iterate more than one time.
    """
    # get the data and output
    values = df.values
    X_ori = values[:, :-1]
    Y = values[:, -1]
    X = np.zeros(X_ori.shape)

    # fit() does not accept strings, so every feature column is re-fitted and
    # replaced by incremental integer codes. Note this is applied to ALL
    # feature columns, including ones that were already numeric.
    le = preprocessing.LabelEncoder()
    for i in range(X_ori.shape[1]):
        X[:, i] = le.fit_transform(X_ori[:, i])

    # ShuffleSplit draws each split independently (random permutation per
    # split), so test sets of different splits may overlap -- this is repeated
    # random sub-sampling rather than a k-fold partition.
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                      random_state=random_state)
    rs_list = rs.split(X)

    return X, Y, rs_list

In [77]:
def decisiontree(depth, csv, *data):
    """Train entropy decision trees of depth 1..``depth`` and print their scores.

    For each depth a ``DecisionTreeClassifier`` is fitted and scored on every
    train/test split, the accuracies (in percent) are averaged, and the tree
    is exported through ``draw_tree``. The exported tree and the reported root
    feature both come from the fit on the LAST split, because the same
    classifier object is re-fitted for each split.

    Parameters
    ----------
    depth : int
        Largest ``max_depth`` to try; depths ``1..depth`` are evaluated.
    csv : pandas.DataFrame
        Original table; its ``columns`` are used to name features.
    *data : (X, Y, data_index)
        Encoded feature array, target array and an iterable of
        ``(train_index, test_index)`` pairs.

    Returns
    -------
    None
        Results are printed to stdout.
    """
    X, Y, data_index = data

    # data_index may be a one-shot generator; materialise it so the same
    # splits are reused for every depth.
    data_index_list = list(data_index)

    depth_list = range(1, depth + 1)
    test_score = []
    rootnode_list = []

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended clock for measuring elapsed intervals.
    tic = time.perf_counter()
    # criterion : entropy
    for i in depth_list:
        clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=i)
        cross_vali_score = []

        for train_index, test_index in data_index_list:
            # train the tree on the training part of this split
            clf_entropy.fit(X[train_index], Y[train_index])

            # predict the output for the held-out part
            y_pred_en = clf_entropy.predict(X[test_index])

            # score the prediction as a percentage
            cross_vali_score.append(accuracy_score(Y[test_index], y_pred_en) * 100)

        # mean of the per-split scores for this depth
        test_score.append(np.mean(cross_vali_score))

        # feature index tested at the root (node 0) of the last fitted tree
        rootnode_list.append(clf_entropy.tree_.feature[0])
        draw_tree(clf_entropy, i, csv)

    # NOTE: the elapsed time also covers the draw_tree export calls above.
    cost_time = time.perf_counter() - tic

    print("The full score is 100")
    for i, score, j in zip(depth_list, test_score, rootnode_list):
        # scikit-learn marks leaf nodes with a negative feature index; using
        # it to index columns would silently name an unrelated column.
        root_name = csv.columns[j] if j >= 0 else "<none: tree is a single leaf>"
        print("The prediction accuracy score in depth {} is {}".format(i, score))
        print("The tree's root node feature is {}".format(root_name))
        print()

    print()
    print('Time for training spent {} secs' .format(cost_time))

In [78]:
def draw_tree(entropy_t, index_num, csv, out_dir="D:/test"):
    """Export a fitted decision tree in Graphviz DOT format.

    Parameters
    ----------
    entropy_t : fitted tree estimator
        Passed to ``export_graphviz`` as the tree to export.
    index_num : int
        Suffix for the output file name (the caller passes the tree depth).
    csv : pandas.DataFrame
        Original table; every column except the last is used as a feature
        name, matching how the feature matrix is sliced from the same table.
    out_dir : str, default "D:/test"
        Directory the file is written to. It is not created here, so it must
        already exist.

    Returns
    -------
    None
        Writes ``<out_dir>/entropy<index_num>`` (no file extension is added).
    """
    # Open the exported file in a text editor,
    # copy its contents and paste them @ webgraphviz.com
    out_file = "{}/entropy{}".format(out_dir, index_num)

    # All columns but the target; a fixed count (previously 20) fails as soon
    # as the table has a different number of feature columns.
    feature_names = list(csv.columns[:-1])

    export_graphviz(entropy_t, out_file, feature_names=feature_names)

In [79]:
def main(csv_file="C:/Users/user/Desktop/bank-additional/bank-additional-full.csv", depth=10):
    """Run the full experiment: load the CSV, build splits, evaluate trees.

    Parameters
    ----------
    csv_file : str
        Path of the semicolon-delimited CSV to load. The default is the
        original author's local path; pass your own location to run the
        notebook on another machine.
    depth : int, default 10
        Largest tree depth to evaluate (depths ``1..depth`` are tried).

    Returns
    -------
    None
        Results are printed by ``decisiontree``.
    """
    csv_df = read_csv(csv_file)
    X, Y, rs_list = cross_validation_split(csv_df)

    decisiontree(depth, csv_df, X, Y, rs_list)

In [80]:
# Entry point: run the experiment when this cell/script is executed directly
# (in a notebook kernel __name__ is "__main__", so main() runs here).
if __name__=="__main__":
    main()

The full score is 100
The prediction accuracy score in depth 1 is 88.63018354860638
The tree's root node feature is nr.employed

The prediction accuracy score in depth 2 is 90.4389628047004
The tree's root node feature is nr.employed

The prediction accuracy score in depth 3 is 90.6259104593571
The tree's root node feature is nr.employed

The prediction accuracy score in depth 4 is 91.24745071380013
The tree's root node feature is nr.employed

The prediction accuracy score in depth 5 is 91.48052830921628
The tree's root node feature is nr.employed

The prediction accuracy score in depth 6 is 91.5630766242595
The tree's root node feature is nr.employed

The prediction accuracy score in depth 7 is 91.50480722540547
The tree's root node feature is nr.employed

The prediction accuracy score in depth 8 is 91.35670583665146
The tree's root node feature is nr.employed

The prediction accuracy score in depth 9 is 91.1600466155191
The tree's root node feature is nr.employed

The prediction accu