In [16]:
import numpy as np   
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt 
import pandas as pd

In [17]:
def read_csv(f, sep=";"):
    """Load a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    f : str, path object or file-like object
        Anything ``pandas.read_csv`` accepts as its first argument.
    sep : str, default ";"
        Field delimiter. The default is a semicolon (not pandas' usual comma),
        which is what the existing caller in this file relies on.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the first row used as the header.
    """
    return pd.read_csv(f, sep=sep)

In [18]:
def get_train_and_test_data(df, test_size=0.3, random_state=None):
    """Turn a DataFrame into numeric train/test feature and label arrays.

    The last column of ``df`` is treated as the target; every other column is
    a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table whose final column holds the class label.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an integer to make the split
        reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    x_train, x_test, y_train, y_test
        The four arrays returned by ``train_test_split``. The feature arrays
        are float; the label arrays keep the values of the last column as-is.
    """
    features = df.iloc[:, :-1]
    Y = df.iloc[:, -1].to_numpy()
    X = np.zeros(features.shape)

    # fit() does not accept strings, so non-numeric columns are mapped to
    # integer codes. Numeric columns are copied unchanged: label-encoding them
    # would replace real magnitudes with rank indices and make the split
    # thresholds of the resulting tree meaningless to a reader.
    for i in range(features.shape[1]):
        column = features.iloc[:, i]
        if pd.api.types.is_numeric_dtype(column):
            X[:, i] = column.to_numpy(dtype=float)
        else:
            # A fresh encoder per column keeps each column's mapping independent.
            X[:, i] = preprocessing.LabelEncoder().fit_transform(column)

    # split into training data and test data
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    return x_train, x_test, y_train, y_test

In [19]:
def decisiontree(depth, *data):
    """Fit a gini tree and an entropy tree, then print their test accuracy.

    Parameters
    ----------
    depth : int or None
        Passed as ``max_depth`` to both classifiers.
    *data
        Exactly four positional values, in this order:
        ``x_train, x_test, y_train, y_test``.

    Returns
    -------
    tuple
        ``(clf_gini, clf_entropy)`` - the two fitted ``DecisionTreeClassifier``
        instances.

    Raises
    ------
    ValueError
        If ``data`` does not contain exactly four items (tuple unpacking).
    """
    x_train, x_test, y_train, y_test = data

    def _fit(criterion):
        # Build and train one tree; only the split criterion differs between
        # the two models.
        # max_features="sqrt" is what the former "auto" alias meant for
        # classifiers; "auto" was deprecated in scikit-learn 1.1 and removed
        # in 1.3, so the explicit value keeps this working on all versions.
        clf = DecisionTreeClassifier(
            criterion=criterion,
            max_features="sqrt",
            random_state=100,
            max_depth=depth,
        )
        clf.fit(x_train, y_train)
        return clf

    clf_gini = _fit("gini")
    clf_entropy = _fit("entropy")

    # predict the output
    y_pred = clf_gini.predict(x_test)
    y_pred_en = clf_entropy.predict(x_test)

    # score (reported as a percentage)
    print("The full score is 100")
    print("The prediction accuracy score with criterion gini index is ", accuracy_score(y_test, y_pred) * 100)
    print("The prediction accuracy score with criterion information gain is ", accuracy_score(y_test, y_pred_en) * 100)

    return clf_gini, clf_entropy

In [20]:
def draw_tree(gini_t, entropy_t, out_dir="D:/test"):
    """Export both fitted trees as Graphviz DOT source files.

    Two files named ``gini`` and ``entropy`` (no extension) are written into
    ``out_dir``. This function does not create the directory.

    Parameters
    ----------
    gini_t : fitted decision tree
        Tree written to ``<out_dir>/gini``.
    entropy_t : fitted decision tree
        Tree written to ``<out_dir>/entropy``.
    out_dir : str, default "D:/test"
        Directory that receives the two files. The default preserves the
        location used before this parameter existed.
    """
    import os
    from sklearn.tree import export_graphviz

    # The files contain DOT text: open one in a text editor, copy its
    # contents and paste them at webgraphviz.com to render the tree.
    export_graphviz(gini_t, out_file=os.path.join(out_dir, "gini"))
    export_graphviz(entropy_t, out_file=os.path.join(out_dir, "entropy"))

In [21]:
def main(csv_file="C:/Users/user/Desktop/bank-additional/bank-additional-full.csv",
         depth=5):
    """Run the whole pipeline: load the CSV, split, train both trees, export.

    Parameters
    ----------
    csv_file : str
        Path of the semicolon-separated input file. The default is the
        machine-specific location this notebook was originally run against;
        pass another path to run elsewhere.
    depth : int, default 5
        Maximum depth handed to ``decisiontree`` for both classifiers.
    """
    csv_df = read_csv(csv_file)
    x_train, x_test, y_train, y_test = get_train_and_test_data(csv_df)

    clf_gini, clf_entropy = decisiontree(depth, x_train, x_test, y_train, y_test)
    draw_tree(clf_gini, clf_entropy)

In [22]:
# Entry point: run the pipeline when this code is executed directly (script
# or notebook cell), but not when it is imported as a module.
if __name__=="__main__":
    main()

The full score is 100
The prediction accuracy score with criterion gini index is  90.5883305009
The prediction accuracy score with criterion information gain is  90.7178117666
