In [273]:
import numpy as np   
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import export_graphviz 
import time
import pandas as pd

In [274]:
def read_csv(f, sep=";"):
    """Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    f : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.
    sep : str, default ";"
        Field delimiter. The default keeps the original behaviour of this
        helper, which always split on semicolons.

    Returns
    -------
    pandas.DataFrame
        The parsed table (first row is used as the header, per the pandas
        default).
    """
    return pd.read_csv(f, sep=sep)

In [275]:
def cross_validation_split(df, n_splits=4, test_size=.25, random_state=None):
    """Encode the feature columns of ``df`` and build shuffled train/test splits.

    Every column except the last is treated as a feature; the last column is
    returned untouched as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the labels.
    n_splits : int, default 4
        Number of shuffled splits to generate.
    test_size : float, default .25
        Fraction of the rows held out in each split.
    random_state : int or None, default None
        Seed forwarded to ``ShuffleSplit``. ``None`` keeps the original
        unseeded behaviour; pass an int for reproducible splits.

    Returns
    -------
    X : numpy.ndarray
        Float array with the same shape as the feature columns, holding the
        integer codes produced by ``LabelEncoder`` for each column.
    Y : numpy.ndarray
        The raw values of the last column.
    rs_list : list of (train_index, test_index)
        The materialised splits. A list (rather than the one-shot generator
        returned by ``ShuffleSplit.split``) so the splits can be iterated
        more than once.
    """
    # Imported here because the notebook's import cell never brings
    # sklearn.preprocessing into scope; referring to the bare name
    # `preprocessing` raised NameError on a fresh kernel.
    from sklearn.preprocessing import LabelEncoder

    # Separate the features (all but the last column) from the target.
    values = df.values
    X_ori = values[:, :-1]
    Y = values[:, -1]
    X = np.zeros(X_ori.shape)

    # fit() does not accept strings, so each column is replaced by
    # incremental integer codes. The encoder is re-fitted per column.
    le = LabelEncoder()
    for i in range(X_ori.shape[1]):
        X[:, i] = le.fit_transform(X_ori[:, i])

    # Shuffled splits used for cross validation by the caller.
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                      random_state=random_state)
    rs_list = list(rs.split(X))

    return X, Y, rs_list

In [276]:
def decisiontree(depth, *data):
    """Evaluate entropy-based decision trees for every depth from 1 to ``depth``.

    For each depth a ``DecisionTreeClassifier`` is fitted on every train
    split and scored on the matching test split; the mean accuracy (in
    percent) per depth is printed, followed by the elapsed time.

    Parameters
    ----------
    depth : int
        Largest ``max_depth`` to try; depths ``1..depth`` are evaluated.
    *data : X, Y, data_index
        Exactly three positional values: the feature array, the label array
        and an iterable of ``(train_index, test_index)`` pairs.

    Raises
    ------
    ValueError
        If ``data_index`` yields no splits (for example an already consumed
        generator), since no tree could be trained or scored.
    """
    X, Y, data_index = data

    # data_index may be a one-shot generator; materialise it so that every
    # depth is evaluated on the same splits.
    data_index_list = list(data_index)
    if not data_index_list:
        raise ValueError("data_index produced no train/test splits")

    depth_list = range(1, depth + 1)
    test_score = []

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring an elapsed interval.
    tic = time.perf_counter()
    # criterion : entropy
    for i in depth_list:
        clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                             random_state=100, max_depth=i)
        cross_vali_score = []

        for train_index, test_index in data_index_list:
            # train the tree on this split's training rows
            clf_entropy.fit(X[train_index], Y[train_index])

            # predict the held-out rows and score the prediction in percent
            y_pred_en = clf_entropy.predict(X[test_index])
            cross_vali_score.append(accuracy_score(Y[test_index], y_pred_en) * 100)

        # average the per-split scores for this depth
        test_score.append(np.mean(cross_vali_score))

        # The exported tree is the one fitted on the last split. Note that
        # the export happens inside the timed region.
        draw_tree(clf_entropy, i)

    cost_time = time.perf_counter() - tic
    print("The full score is 100")
    for i, score in zip(depth_list, test_score):
        print("The prediction accuracy score in depth {} is {}".format(i, score))

    print()
    print('Time for training spent {} secs' .format(cost_time))

In [277]:
def draw_tree(entropy_t, index_num, out_dir="D:/test"):
    """Export a fitted tree in Graphviz format to ``<out_dir>/entropy<index_num>``.

    The written file has no extension. To view it, open the file in a text
    editor, copy its contents and paste them at webgraphviz.com.

    Parameters
    ----------
    entropy_t : fitted decision tree estimator
        Passed straight to ``export_graphviz``.
    index_num : int or str
        Suffix appended to the ``entropy`` file name.
    out_dir : str, default "D:/test"
        Directory that receives the file. The default keeps the original
        hard-coded location; the directory is created if it does not exist.
    """
    # Local import: `os` is not part of the notebook's import cell.
    import os

    # Avoid failing the whole run just because the target folder is missing.
    os.makedirs(out_dir, exist_ok=True)

    export_graphviz(entropy_t, out_file="{}/entropy{}".format(out_dir, index_num))

In [278]:
def main(csv_file="C:/Users/user/Desktop/bank-additional/bank-additional-full.csv",
         depth=10):
    """Run the full experiment: load the CSV, encode/split it, evaluate trees.

    Parameters
    ----------
    csv_file : str
        Path of the semicolon-separated data file. The default is the
        original hard-coded location; pass another path to run elsewhere.
    depth : int, default 10
        Largest tree depth handed to ``decisiontree``.
    """
    csv_df = read_csv(csv_file)
    X, Y, rs_list = cross_validation_split(csv_df)

    decisiontree(depth, X, Y, rs_list)

In [279]:
# Entry point: run the experiment when this cell/script is executed directly,
# but not when the code is imported as a module.
if __name__ == "__main__":
    main()

The full score is 100
The prediction accuracy score in depth 1 is 88.60104884917938
The prediction accuracy score in depth 2 is 90.08934641157619
The prediction accuracy score in depth 3 is 90.16218316014374
The prediction accuracy score in depth 4 is 91.04836360104885
The prediction accuracy score in depth 5 is 91.13819559094881
The prediction accuracy score in depth 6 is 91.21346023113529
The prediction accuracy score in depth 7 is 91.09934932504613
The prediction accuracy score in depth 8 is 91.14547926580556
The prediction accuracy score in depth 9 is 90.84684859667865
The prediction accuracy score in depth 10 is 90.77158395649218

Time for training spent 5.644827222635001 secs
