# Investigating Information Gain
## Lillian Mueller & Regina Hong

Using the Iris dataset, build a decision tree for the data using sklearn and graphviz. 

In [3]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, average_precision_score, r2_score

import pandas as pd 
import numpy as np

# Lift the row-display cap so printed DataFrames are never truncated
pd.options.display.max_rows = None

In [4]:
# Fetch the Iris dataset that ships with scikit-learn
iris_data = load_iris()

# Wrap the feature matrix in a DataFrame, one named column per measurement,
# so the values are easier to read
df_iris = pd.DataFrame(iris_data['data'], columns=iris_data['feature_names'])

# Preview the first rows
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Add the target as a 'class' column, translating the numeric codes to class names
# 0 = setosa, 1 = versicolor, 2 = virginica
#
# The mapped Series is assigned back to the column instead of calling
# df_iris['class'].replace(..., inplace=True). That form modifies a Series
# selected out of the frame (chained assignment): pandas 2.x warns about it,
# and with Copy-on-Write enabled df_iris itself is left unchanged.
df_iris['class'] = pd.Series(iris_data.target, index=df_iris.index).map(
    {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
)

df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
# Create a new integer-encoded column for the target (to be ready for processing)
label_obj = preprocessing.LabelEncoder()
df_iris['class_level'] = label_obj.fit_transform(df_iris['class'])

# Create the decision tree model object and fit it on the full dataset
# (no criterion is passed here; the next cell switches from Gini impurity to entropy)
dt_model = tree.DecisionTreeClassifier()
predictors, target = iris_data.data, iris_data.target
dt_model.fit(predictors, target)

# Create the tree diagram as a Graphviz DOT file.
# export_graphviz writes straight to the open handle; its return value is not
# assigned back to `f`, which would only replace the file handle with None.
with open("iris.dot", 'w') as f:
    tree.export_graphviz(dt_model, out_file=f, feature_names=iris_data.feature_names, class_names=iris_data.target_names)

In [7]:
# Changing the purity measure from Gini impurity to entropy
dt_model2 = tree.DecisionTreeClassifier(criterion='entropy')
dt_model2.fit(predictors, target)

# Create the new tree diagram as a Graphviz DOT file.
# export_graphviz writes straight to the open handle; its return value is not
# assigned back to `f`, which would only replace the file handle with None.
with open("iris2.dot", 'w') as f:
    tree.export_graphviz(dt_model2, out_file=f, feature_names=iris_data.feature_names, class_names=iris_data.target_names)

In [8]:
# Hold out roughly one third of the rows for testing; the rest trains the tree

# Random train/test partition (no random_state is given, so it differs per run)
x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size=0.33)

# fit() returns the estimator itself, so construction and fitting are chained
dt_modeltr = DecisionTreeClassifier().fit(x_train, y_train)

# Score the fitted tree on both partitions
y_pred = dt_modeltr.predict(x_test)
print("Train data accuracy:", accuracy_score(y_train, dt_modeltr.predict(x_train)))
print("Test data accuracy:", accuracy_score(y_test, y_pred))
# NOTE(review): r2_score is computed on the integer class codes
print("Test data r2:", r2_score(y_test, y_pred))

Train data accuracy: 1.0
Test data accuracy: 0.96
Test data r2: 0.9351070733290071


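The accuracy changes every time the cell is run, most likely because `train_test_split` is called without a fixed `random_state`, so each run trains and tests on a different random partition of the data.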

Test accuracy is typically 0.90 or higher.


In [9]:
# Alternative split: 80% of the rows for training, 20% for testing

# Random train/test partition (no random_state is given, so it differs per run)
x_train2, x_test2, y_train2, y_test2 = train_test_split(predictors, target, test_size=0.2)

# fit() returns the estimator itself, so construction and fitting are chained
dt_modeltr2 = DecisionTreeClassifier().fit(x_train2, y_train2)

# Score the fitted tree on both partitions
y_pred2 = dt_modeltr2.predict(x_test2)
print("Train data accuracy:", accuracy_score(y_train2, dt_modeltr2.predict(x_train2)))
print("Test data accuracy:", accuracy_score(y_test2, y_pred2))
# NOTE(review): r2_score is computed on the integer class codes
print("Test data r2:", r2_score(y_test2, y_pred2))

Train data accuracy: 1.0
Test data accuracy: 0.8333333333333334
Test data r2: 0.6932515337423313


Accuracy results are around 0.93+ and sometimes hit 1.0, a little more accurate than when 2/3 of the data was used for training.

Accuracy did not consistently drop below 0.90 until the test size was increased to 85%, which may be due to the small size of the dataset overall.

In [19]:
# alternate method: keeping original test/train split but constraining max_depth parameter to 3 or 4 levels
# (this cell uses max_depth=4 with the criterion left unset)

# Creating Train and Test datasets
x_train3, x_test3, y_train3, y_test3 = train_test_split(predictors, target, test_size = 0.33)

dt_modeltr3 = tree.DecisionTreeClassifier(max_depth=4)
dt_modeltr3.fit(x_train3,y_train3)

# Create the new tree diagram as a Graphviz DOT file.
# export_graphviz writes straight to the open handle; its return value is not
# assigned back to `f`, which would only replace the file handle with None.
with open("iris-gini-4level.dot", 'w') as f:
    tree.export_graphviz(dt_modeltr3, out_file=f, feature_names=iris_data.feature_names, class_names=iris_data.target_names)

# Predict Accuracy Score
y_pred3 = dt_modeltr3.predict(x_test3)
print("Train data accuracy:",accuracy_score(y_true = y_train3, y_pred=dt_modeltr3.predict(x_train3)))
print("Test data accuracy:",accuracy_score(y_true = y_test3, y_pred=y_pred3))
print("Test data r2:", r2_score(y_true = y_test3, y_pred=y_pred3))

Train data accuracy: 0.98
Test data accuracy: 0.98
Test data r2: 0.9712808730614589


In [17]:
# alternate method: same test/train split and max_depth=4 constraint as the
# previous depth-limited tree, but using entropy as the split criterion

# Creating Train and Test datasets
x_train4, x_test4, y_train4, y_test4 = train_test_split(predictors, target, test_size = 0.33)

dt_modeltr4 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
dt_modeltr4.fit(x_train4,y_train4)

# Create the new tree diagram as a Graphviz DOT file.
# export_graphviz writes straight to the open handle; its return value is not
# assigned back to `f`, which would only replace the file handle with None.
with open("iris-entropy-4level.dot", 'w') as f:
    tree.export_graphviz(dt_modeltr4, out_file=f, feature_names=iris_data.feature_names, class_names=iris_data.target_names)

# Predict Accuracy Score
y_pred4 = dt_modeltr4.predict(x_test4)
print("Train data accuracy:",accuracy_score(y_true = y_train4, y_pred=dt_modeltr4.predict(x_train4)))
print("Test data accuracy:",accuracy_score(y_true = y_test4, y_pred=y_pred4))
print("Test data r2:", r2_score(y_true = y_test4, y_pred=y_pred4))

Train data accuracy: 1.0
Test data accuracy: 0.96
Test data r2: 0.9459167117360735


In [20]:
def run_model(criterion, maxDepth, test_size=0.33, random_state=None):
    """Fit and score one decision tree on a random train/test split.

    Reads the notebook-level ``predictors`` and ``target`` arrays, splits
    them, fits a ``DecisionTreeClassifier`` on the training part and scores
    it on both parts.

    Parameters
    ----------
    criterion : str
        Split-quality measure passed to ``DecisionTreeClassifier``
        (this notebook uses 'gini' and 'entropy').
    maxDepth : int or None
        Passed through as ``max_depth``; the notebook passes ``None`` when
        the depth is not constrained.
    test_size : float, default 0.33
        Passed to ``train_test_split``. The default keeps the split this
        function originally hard-coded.
    random_state : int or None, default None
        Forwarded to both ``train_test_split`` and the classifier. ``None``
        (the default) leaves every call randomised, as before. Passing the
        same value on every call makes repeated runs reuse the same seed,
        so vary it per run when collecting a distribution of scores.

    Returns
    -------
    list
        ``[train accuracy, test accuracy, test r2]``, in that order.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        predictors, target, test_size=test_size, random_state=random_state
    )

    dt_modeltr = tree.DecisionTreeClassifier(
        criterion=criterion, max_depth=maxDepth, random_state=random_state
    )
    dt_modeltr.fit(x_train, y_train)

    # Predict once on the held-out data and reuse it for both test metrics
    y_pred = dt_modeltr.predict(x_test)

    return [
        accuracy_score(y_true=y_train, y_pred=dt_modeltr.predict(x_train)),
        accuracy_score(y_true=y_test, y_pred=y_pred),
        r2_score(y_true=y_test, y_pred=y_pred),
    ]

In [44]:
# Repeat the random split / fit / score cycle and summarise the spread of the scores.
# The run count is stated once and interpolated into the message so the
# printed text always matches the number of iterations actually executed.
n_runs = 1000
print(f'Run model with gini impurity {n_runs} times')
scores = [run_model('gini', None) for _ in range(n_runs)]

score_df = pd.DataFrame(scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(score_df.describe())

Run model with gini inpurity 10 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count               1000.0         1000.000000  1000.000000
mean                   1.0            0.945100     0.915793
std                    0.0            0.028636     0.044914
min                    1.0            0.840000     0.750000
25%                    1.0            0.920000     0.887892
50%                    1.0            0.940000     0.916574
75%                    1.0            0.960000     0.942824
max                    1.0            1.000000     1.000000


In [45]:
# Repeat the random split / fit / score cycle with the entropy criterion.
# The run count is stated once and interpolated into the message so the
# printed text always matches the number of iterations actually executed.
n_runs = 1000
print(f'Run model with entropy {n_runs} times')
scores = [run_model('entropy', None) for _ in range(n_runs)]

score_df = pd.DataFrame(scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(score_df.describe())

Run model with entropy 10 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count               1000.0         1000.000000  1000.000000
mean                   1.0            0.943680     0.913660
std                    0.0            0.027291     0.043502
min                    1.0            0.780000     0.633333
25%                    1.0            0.920000     0.886492
50%                    1.0            0.940000     0.914237
75%                    1.0            0.960000     0.942562
max                    1.0            1.000000     1.000000


In [46]:
# Repeat the random split / fit / score cycle with Gini impurity and max_depth=4.
# The run count is stated once and interpolated into the message so the
# printed text always matches the number of iterations actually executed.
n_runs = 1000
print(f'Run model with gini impurity with depth of 4, {n_runs} times')
scores = [run_model('gini', 4) for _ in range(n_runs)]

score_df = pd.DataFrame(scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(score_df.describe())

Run model with gini inpurity with depth of 4, 10 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count          1000.000000         1000.000000  1000.000000
mean              0.992880            0.945540     0.916688
std               0.007494            0.027685     0.043817
min               0.970000            0.840000     0.678457
25%               0.990000            0.920000     0.891068
50%               0.990000            0.940000     0.916713
75%               1.000000            0.960000     0.942562
max               1.000000            1.000000     1.000000


In [47]:
# Repeat the random split / fit / score cycle with entropy and max_depth=4.
# The run count is stated once and interpolated into the message so the
# printed text always matches the number of iterations actually executed.
n_runs = 1000
print(f'Run model with entropy with depth of 4, {n_runs} times')
scores = [run_model('entropy', 4) for _ in range(n_runs)]

score_df = pd.DataFrame(scores, columns=['Train Data Accuracy', 'Test Data Accuracy', 'r2 Score'])
print(score_df.describe())

Run model with entropy with depth of 4, 10 times
       Train Data Accuracy  Test Data Accuracy     r2 Score
count          1000.000000         1000.000000  1000.000000
mean              0.992370            0.943140     0.912387
std               0.008953            0.027439     0.043462
min               0.960000            0.840000     0.740428
25%               0.990000            0.920000     0.885649
50%               0.990000            0.940000     0.913043
75%               1.000000            0.960000     0.941038
max               1.000000            1.000000     1.000000
