In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_mldata
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Load the Titanic training and test CSV files.
train = pd.read_csv("titanic/train.csv")
test = pd.read_csv("titanic/test.csv")

# Impute missing values. Each frame uses its own column median; missing
# embarkation ports in the training frame are filled with "S".
train_age_median = train["Age"].median()
train["Age"] = train["Age"].fillna(train_age_median)
train["Embarked"] = train["Embarked"].fillna("S")

for column in ("Age", "Fare"):
    test[column] = test[column].fillna(test[column].median())


def _sex_to_int(value):
    """Return 1 for 'male' and 0 for any other value."""
    return 1 if value == 'male' else 0


# Encode the categorical columns as integers, identically for both frames.
for frame in (train, test):
    frame['Sex'] = frame['Sex'].apply(_sex_to_int)
    frame['Embarked'] = frame['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [2]:
# Separate the label from the features; the label itself and the
# PassengerId / Name / Ticket / Cabin columns are not used as features.
train_y = train["Survived"]
train_X = train.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'])

# Hold out 30% of the labelled rows for evaluation, with a fixed seed so the
# split is the same on every run.
train_X, test_X, train_y, test_y = train_test_split(
    train_X,
    train_y,
    test_size=0.3,
    random_state=666,
)

## Decision Tree Classifier

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a single decision tree with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) so that
# re-running the notebook builds the same tree and reports the same scores;
# without it the fitted tree can differ from run to run.
dec_tree = DecisionTreeClassifier(random_state=666)
dec_tree = dec_tree.fit(train_X, train_y)

In [6]:
# Score the decision tree on the held-out split.
y_pred = dec_tree.predict(test_X)

tree_accuracy = accuracy_score(test_y, y_pred)
tree_f1 = f1_score(test_y, y_pred, average='weighted')
print('Accuracy:', tree_accuracy, 'F1 score:', tree_f1)

Accuracy: 0.7910447761194029 F1 score: 0.7910447761194029


## Random Forest

In [3]:
# Random forest with default hyper-parameters. RandomForestClassifier is
# already imported in the first cell, so it is not re-imported here.
# random_state is pinned (same seed as the train/test split) because the
# forest is built from random bootstrap samples and feature subsets; without
# a seed the reported scores change on every run.
rnd_forest = RandomForestClassifier(random_state=666)
rnd_forest = rnd_forest.fit(train_X, train_y)



In [4]:
# Score the random forest on the held-out split.
y_pred = rnd_forest.predict(test_X)

forest_accuracy = accuracy_score(test_y, y_pred)
forest_f1 = f1_score(test_y, y_pred, average='weighted')
print('Accuracy:', forest_accuracy, 'F1 score:', forest_f1)

Accuracy: 0.8208955223880597 F1 score: 0.8185260797359765


## Deep Forest

In [7]:
from deep_forest.deep_forest import MGCForest
import random
import uuid


def _estimator_spec(estimator_class, **estimator_params):
    """Build one estimator entry for the MGCForest configuration.

    Every entry gets ``n_jobs=-1`` appended to its parameters; a fresh
    parameter dict is created on each call, so entries never share state.
    """
    estimator_params['n_jobs'] = -1
    return {
        'estimator_class': estimator_class,
        'estimator_params': estimator_params,
    }


# Multi-grained scanning stage: one extra-trees and one random forest,
# both with 30 trees.
mgs_estimators = [
    _estimator_spec(forest_class, n_estimators=30, min_samples_split=21)
    for forest_class in (ExtraTreesClassifier, RandomForestClassifier)
]

# Cascade stage: for each forest type, one variant with max_features=1 and
# one with max_features='sqrt' (order: ET/1, ET/sqrt, RF/1, RF/sqrt).
cascade_estimators = [
    _estimator_spec(
        forest_class,
        n_estimators=1000,
        min_samples_split=11,
        max_features=max_features,
    )
    for forest_class in (ExtraTreesClassifier, RandomForestClassifier)
    for max_features in (1, 'sqrt')
]

# Stride ratios 1/4, 1/9 and 1/16. NOTE(review): in the recorded run on
# 7 features, the scanner log shows all three ratios resolving to a window
# shape of [1] - confirm that three scanners are actually wanted here.
mgc_forest = MGCForest(
    estimators_config={
        'mgs': mgs_estimators,
        'cascade': cascade_estimators,
    },
    stride_ratios=[1.0 / side ** 2 for side in (2, 3, 4)],
)

In [8]:
# Convert the pandas splits to plain NumPy arrays before handing them to
# MGCForest. `.values` replaces `.as_matrix()`, which is deprecated (pandas
# 0.23 emits a FutureWarning recommending `.values`) and was removed in
# pandas 1.0; with no arguments both return the same array.
X_train = train_X.values
y_train = train_y.values

X_test = test_X.values
y_test = test_y.values

  """Entry point for launching an IPython kernel.
  
  after removing the cwd from sys.path.
  """


In [9]:
# Fit the deep forest on the training arrays. Per the log printed below, the
# three multi-grained scanners run first and the cascade is then fitted on
# their concatenated outputs.
mgc_forest.fit(X_train , y_train)

<MultiGrainedScanner stride_ratio=0.25> - Scanning and fitting for X ((623, 7)) and y ((623,)) started
<MultiGrainedScanner stride_ratio=0.25> - Window shape: [1] Total windows: 7
<MultiGrainedScanner stride_ratio=0.25> - Finished scan X ((623, 7)) and got predictions with shape (623, 28)
<MultiGrainedScanner stride_ratio=0.1111111111111111> - Scanning and fitting for X ((623, 7)) and y ((623,)) started
<MultiGrainedScanner stride_ratio=0.1111111111111111> - Window shape: [1] Total windows: 7
<MultiGrainedScanner stride_ratio=0.1111111111111111> - Finished scan X ((623, 7)) and got predictions with shape (623, 28)
<MultiGrainedScanner stride_ratio=0.0625> - Scanning and fitting for X ((623, 7)) and y ((623,)) started
<MultiGrainedScanner stride_ratio=0.0625> - Window shape: [1] Total windows: 7
<MultiGrainedScanner stride_ratio=0.0625> - Finished scan X ((623, 7)) and got predictions with shape (623, 28)
<CascadeForest forests=4> - Cascade fitting for X ((623, 84)) and y ((623,)) start

In [10]:
# Score the deep forest on the held-out arrays.
y_pred = mgc_forest.predict(X_test)
print('Prediction shape:', y_pred.shape)

deep_accuracy = accuracy_score(y_test, y_pred)
deep_f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy:', deep_accuracy, 'F1 score:', deep_f1)

<MultiGrainedScanner stride_ratio=0.25> - Scanning and fitting for X ((268, 7)) and y (None) started
<MultiGrainedScanner stride_ratio=0.25> - Window shape: [1] Total windows: 7
<MultiGrainedScanner stride_ratio=0.25> - Finished scan X ((268, 7)) and got predictions with shape (268, 28)
<MultiGrainedScanner stride_ratio=0.1111111111111111> - Scanning and fitting for X ((268, 7)) and y (None) started
<MultiGrainedScanner stride_ratio=0.1111111111111111> - Window shape: [1] Total windows: 7
<MultiGrainedScanner stride_ratio=0.1111111111111111> - Finished scan X ((268, 7)) and got predictions with shape (268, 28)
<MultiGrainedScanner stride_ratio=0.0625> - Scanning and fitting for X ((268, 7)) and y (None) started
<MultiGrainedScanner stride_ratio=0.0625> - Window shape: [1] Total windows: 7
<MultiGrainedScanner stride_ratio=0.0625> - Finished scan X ((268, 7)) and got predictions with shape (268, 28)
<CascadeForest forests=4> - Shape of predictions: (4, 268, 2) shape of X: (268, 84)


Prediction shape: (268,)
Accuracy: 0.8171641791044776 F1 score: 0.8094520467654797
