First, we import all relevant packages

scikit-learn's train_test_split() helps us by splitting the data into a training set and a test set. This is an easy first step before we do further processing:
We should preprocess the data by partitioning it with the same percentages for the training, cross-validation and test sets.

In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`.  Fall back to
# the old location only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



Loading dataset

In [2]:
# Read the pre-processed training table; the CSV's first column becomes the index.
input_data = pd.read_csv(
    filepath_or_buffer='processed_train.csv',
    index_col=0,
)

In [3]:
# Report the size of the loaded table, then preview its first five rows.
print("Dataset Length:: ", input_data.shape[0])
print("Dataset Shape: ", input_data.shape)
input_data.head()

Dataset Length::  2973
Dataset Shape:  (2973, 217)


Unnamed: 0,Id,idhogar,Target,hacdor,hacapo,v14a,refrig,paredblolad,paredzocalo,paredpreb,...,escolari-min,escolari-max,escolari-sum,escolari-std,escolari-range_,age-min,age-max,age-sum,age-std,age-range_
0,ID_279628684,21eb7fcc1,4,0,0,1,1,1,0,0,...,10,10,10,0.0,0,43,43,43,0.0,0
1,ID_f29eb3ddd,0e5d7a658,4,0,0,1,1,0,0,0,...,12,12,12,0.0,0,67,67,67,0.0,0
2,ID_68de51c94,2c7317ea8,4,0,0,1,1,0,0,0,...,11,11,11,0.0,0,92,92,92,0.0,0
3,ID_ec05b1a7b,2b58d945f,4,0,0,1,1,1,0,0,...,2,11,33,4.272002,9,8,38,100,14.899664,30
4,ID_1284f8aad,d6dae86b7,4,1,0,1,1,1,0,0,...,0,11,23,5.123475,11,7,30,76,11.690452,23


As we are not doing any feature selection yet, no features are dropped in this section; the columns are only grouped by data type.

In [4]:
# Group the column names of input_data by pandas dtype. The notebook treats
# bool columns as boolean flags, float64 as continuous, int64 as categorical
# and object columns as identifiers.
id_var = input_data.select_dtypes(include=['object']).columns.tolist()
cat_var = input_data.select_dtypes(include=['int64']).columns.tolist()
cont_var = input_data.select_dtypes(include=['float64']).columns.tolist()
bool_var = input_data.select_dtypes(include=['bool']).columns.tolist()

# Categorical view: the int64 columns followed by the bool columns.
cat_data = input_data.loc[:, cat_var + bool_var]

# Continuous view: the float64 columns only.
cont_data = input_data.loc[:, cont_var]

# Every non-identifier column: int64, then bool, then float64.
final_input_data = input_data.loc[:, cat_var + bool_var + cont_var]

In [5]:
# Preview the first five rows of the categorical view.
cat_data.head()

Unnamed: 0,Target,hacdor,hacapo,v14a,refrig,paredblolad,paredzocalo,paredpreb,pisocemento,pareddes,...,mobilephone-range_,escolari-min,escolari-max,escolari-sum,escolari-range_,age-min,age-max,age-sum,age-range_,v2a1-missing
0,4,0,0,1,1,1,0,0,0,0,...,0,10,10,10,0,43,43,43,0,False
1,4,0,0,1,1,0,0,0,0,0,...,0,12,12,12,0,67,67,67,0,False
2,4,0,0,1,1,0,0,0,0,0,...,0,11,11,11,0,92,92,92,0,False
3,4,0,0,1,1,1,0,0,0,0,...,0,2,11,33,9,8,38,100,30,False
4,4,1,0,1,1,1,0,0,0,0,...,0,0,11,23,11,7,30,76,23,False


Creating X and Y variables. 
As shown above, the target feature (`Target`) is the first column of `cat_data` (position 0), and the remaining columns are the predictor variables. 

In [6]:
# Column 0 of cat_data holds the labels; cast them to integers for the classifier.
Y = cat_data.values[:, 0].astype('int')
# Every remaining column is a predictor.
X = cat_data.values[:, 1:]

In [7]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
)

### Creating basic Decision Tree using Gini Index as Criterion

In [11]:
# Shallow decision tree (depth at most 3, at least 5 samples per leaf) that
# chooses splits by Gini impurity; the seed fixes tie-breaking between runs.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_gini.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best')

### Creating basic Decision Tree using Information Gain as Criterion

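Using information gain as the splitting criterion corresponds to setting `criterion="entropy"` in scikit-learn: each split is chosen to maximise the reduction in entropy.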

In [12]:
# Same tree shape as the Gini model (depth at most 3, at least 5 samples per
# leaf), but splits are scored by entropy, i.e. information gain.
clf_entropy = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_entropy.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best')