In [2]:
import pandas as pd
from sklearn import preprocessing, decomposition, tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from id3 import Id3Estimator, export_graphviz
%matplotlib inline



In [3]:
df_train = pd.read_csv("/Users/inderjot/Downloads/train.csv")
df_test = pd.read_csv("/Users/inderjot/Downloads/test.csv")

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
##Changing Datatype
for i in df_train.columns:
    if df_train[i].dtype == 'object':
        df_train[i] = df_train[i].astype('category')   

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null category
Sex            891 non-null category
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null category
Fare           891 non-null float64
Cabin          204 non-null category
Embarked       889 non-null category
dtypes: category(5), float64(2), int64(5)
memory usage: 134.4 KB


In [7]:
## Dropping Duplicates
dup = df_train[df_train.duplicated()]
df_train.drop_duplicates()
dup.shape

(0, 12)

In [8]:
x = df_train.drop(['Survived', 'Name', 'PassengerId', 'Pclass', 'Ticket', 'Cabin'], axis=1)
y = df_train['Survived']

In [9]:
x.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked
0,male,22.0,1,0,7.25,S
1,female,38.0,1,0,71.2833,C
2,female,26.0,0,0,7.925,S
3,female,35.0,1,0,53.1,S
4,male,35.0,0,0,8.05,S


In [10]:
## Checking Null Values 
pd.isna(x).sum()

Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [11]:
## filling age with mode
m_age = x['Age'].mode()
m_age.values.reshape(1,1)
x['Age'].fillna(m_age[0], inplace=True)
pd.isna(x['Age']).value_counts()

False    891
Name: Age, dtype: int64

In [12]:
## Filling Embarked with mode
m_Embarked = x['Embarked'].mode()
x['Embarked'].fillna(m_Embarked[0], inplace=True)
pd.isna(x['Embarked']).value_counts()

False    891
Name: Embarked, dtype: int64

In [13]:
x['Sex'] = pd.get_dummies(x['Sex'])
x['Embarked'] = pd.get_dummies(x['Embarked'])

In [14]:
## Scaling
x_s = preprocessing.scale(x)
y_s = preprocessing.scale(y)

In [15]:
le = preprocessing.LabelEncoder()
y_s = le.fit_transform(y_s)

In [16]:
## Splitting 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=4)

In [17]:
## ID3 algorithm
estimator = Id3Estimator()
estimator.fit(x_train, y_train)

Id3Estimator(gain_ratio=False, is_repeating=False, max_depth=None,
             min_entropy_decrease=0.0, min_samples_split=2, prune=False)

In [18]:
export_graphviz(estimator.tree_, 'tree.dot', list(x.head(0)))

<_io.TextIOWrapper name='tree.dot' mode='w' encoding='utf8'>

In [19]:
## Predicting on test features
y_pred = estimator.predict(x_test)

In [20]:
## Checking Accuracy
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [21]:
acc

0.852017937219731

In [22]:
cm

array([[137,  13],
       [ 20,  53]], dtype=int64)

In [23]:
## CART Algorithm 

In [24]:
## Creating a CART model
cart = tree.DecisionTreeClassifier(criterion='gini')
cart.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [25]:
## Predicting result on test data
y_pred_c = cart.predict(x_test)

In [26]:
# accuracy
acc_cart = accuracy_score(y_test, y_pred_c)
cm = confusion_matrix(y_test, y_pred_c)

In [27]:
# accuracy of CART
print(acc_cart)

0.820627802690583


In [28]:
cm

array([[132,  18],
       [ 22,  51]], dtype=int64)

In [None]:
## ID3 has more accuracy than CART