In [114]:
import numpy as np   
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt 
import pandas as pd

In [115]:
def read_csv(f):
    """Load a semicolon-delimited CSV into a pandas DataFrame.

    Parameters
    ----------
    f : str, path object or file-like
        Source handed directly to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # Fields in this file are separated by ";" rather than ",".
    return pd.read_csv(f, sep=";")

In [116]:
def get_train_and_test_data(df, test_size=0.3, random_state=None):
    """Encode the feature columns of ``df`` and split into train and test sets.

    The last column of ``df`` is used as the target; every other column is a
    feature. Non-numeric feature columns are converted to integer codes with
    ``LabelEncoder`` because the estimator's ``fit`` does not accept strings.
    Numeric feature columns are copied through unchanged so that they keep
    their original values (and any tree thresholds stay in the original units).

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column is the label.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.3.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. Pass a fixed value to make the
        split reproducible; the default ``None`` gives a different split on
        every call.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    features = df.iloc[:, :-1]
    Y = df.iloc[:, -1].to_numpy()
    X = np.zeros(features.shape)

    for i in range(features.shape[1]):
        column = features.iloc[:, i]
        if pd.api.types.is_numeric_dtype(column):
            # Already numeric: keep the real values instead of rank codes.
            X[:, i] = column.to_numpy()
        else:
            # LabelEncoder maps each distinct string to an incremental integer.
            # A fresh encoder per column avoids carrying fitted state around.
            X[:, i] = preprocessing.LabelEncoder().fit_transform(column.to_numpy())

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    return x_train, x_test, y_train, y_test

In [117]:
def decisiontree(depth, *data):
    """Fit entropy-criterion decision trees of increasing depth and report accuracy.

    For every ``max_depth`` from 1 up to and including ``depth`` a
    ``DecisionTreeClassifier`` is fitted on the training data, scored on the
    test data with ``accuracy_score`` (scaled to a percentage) and handed to
    ``draw_tree`` together with its depth. After all trees are fitted, one
    line per depth is printed with the score.

    Parameters
    ----------
    depth : int
        Largest ``max_depth`` to try.
    *data
        Exactly four positional values, in this order:
        ``x_train, x_test, y_train, y_test``.

    Returns
    -------
    None
    """
    train_X, test_X, train_y, test_y = data

    accuracies = []
    for max_depth in range(1, depth + 1):
        # Fixed random_state keeps the fitted tree deterministic for given data.
        tree = DecisionTreeClassifier(
            criterion="entropy", random_state=100, max_depth=max_depth
        )
        tree.fit(train_X, train_y)

        # Accuracy on the held-out data, as a percentage.
        accuracies.append(accuracy_score(test_y, tree.predict(test_X)) * 100)

        draw_tree(tree, max_depth)

    print("The full score is 100")
    for level, pct in enumerate(accuracies, start=1):
        print("The prediction accuracy score in depth {} is {}".format(level, pct))
              

In [118]:
def draw_tree(entropy_t, index_num, out_dir="D:/test"):
    """Export a fitted decision tree in Graphviz DOT format.

    The tree is written to ``<out_dir>/entropy<index_num>`` (no file
    extension). Open that file in a text editor and paste its contents into a
    Graphviz viewer such as webgraphviz.com to render it.

    Parameters
    ----------
    entropy_t : fitted decision tree
        Passed to ``export_graphviz`` as the tree to export.
    index_num : int or str
        Suffix appended to the ``entropy`` file name.
    out_dir : str, optional
        Directory that receives the file. It is created if it does not exist.
        Defaults to ``"D:/test"``.
    """
    import os  # local import: only this function needs it

    if out_dir:
        # Without this, a missing directory makes export_graphviz raise.
        os.makedirs(out_dir, exist_ok=True)

    export_graphviz(entropy_t, os.path.join(out_dir, "entropy{}".format(index_num)))
    

In [119]:
def main(csv_file="C:/Users/user/Desktop/bank-additional/bank-additional-full.csv", depth=10):
    """Run the whole experiment: load the CSV, split it, and fit the trees.

    Parameters
    ----------
    csv_file : str, optional
        Path of the semicolon-separated CSV to load. Defaults to the original
        hard-coded location; pass another path to run on a different machine.
    depth : int, optional
        Largest tree depth to evaluate; depths 1 through ``depth`` are tried.
        Defaults to 10.
    """
    csv_df = read_csv(csv_file)
    x_train, x_test, y_train, y_test = get_train_and_test_data(csv_df)

    decisiontree(depth, x_train, x_test, y_train, y_test)

In [120]:
# Run the experiment only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

The full score is 100
The prediction accuracy score in depth 1 is 88.76750020231448
The prediction accuracy score in depth 2 is 90.53977502630089
The prediction accuracy score in depth 3 is 90.53977502630089
The prediction accuracy score in depth 4 is 91.45423646516144
The prediction accuracy score in depth 5 is 91.62418062636563
The prediction accuracy score in depth 6 is 91.93979121145908
The prediction accuracy score in depth 7 is 91.45423646516144
The prediction accuracy score in depth 8 is 91.6484583636805
The prediction accuracy score in depth 9 is 91.84268026219956
The prediction accuracy score in depth 10 is 91.72129157562516
