In [1]:
import os
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


data_name = "data"
train_bn = "train.csv"
test_bn = "test.csv"

proj_dir = os.path.abspath(
        os.path.join(os.path.abspath(__name__), os.pardir, os.pardir))
data_dir = os.path.join(proj_dir, data_name)
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)

if not os.path.exists(data_dir):
    raise OSError("Data directory not properly setup.")

In [2]:
try: 
    df_train = pd.read_csv(train_fn)
except OSError as e:
    print("Training file missing.")
try:
    df_test = pd.read_csv(test_fn)
except OSError as e:
    print("Test file missing.")

In [3]:
name = "Name"
sex = "Sex"
emb = "Embarked"
cabin = "Cabin"
age = "Age"
fare = "Fare"
ticket = "Ticket"

dummy_cols = ["Pclass", name, sex, "SibSp", "Parch", "Cabin", "Embarked"]
dep_vars = ["Survived"]
indices = ["PassengerId"]
ind_vars = [x for x in df_train.columns if x not in (dep_vars+indices+[ticket])]

In [4]:
import sklearn.preprocessing as preprocessing

class LabelEncoderExt(preprocessing.LabelEncoder):
    
    UNK = "UNK"
    
    def __init__(self):
        
        super().__init__()
        
    def fit(self, y):
        
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        assert (len(y.shape) == 1), "Require 1D array"
        y = np.concatenate((y, np.array([self.UNK])))
        super().fit(y)
        
    def transform(self, y):
        
        y[~np.isin(y, self.classes_, assume_unique=True)] = self.UNK
        return super().transform(y)
    
    def fit_transform(self, y):
        
        self.fit(y)
        return self.transform(y)

In [5]:
reg_ex = "\w+\s?\w*(\.)"
reg = re.compile(reg_ex)
f = lambda x: x.split(',')[1].strip()
g = lambda x: reg.match(x).group()
h = lambda x: x[0] if len(x) > 0 else ''

In [6]:
df_train[[emb]] = df_train[emb].fillna('')
df_train[[cabin]] = df_train[cabin].fillna('')

In [8]:
enc_name = LabelEncoderUnk()
enc_sex = preprocessing.LabelEncoder()
enc_emb = preprocessing.LabelEncoder()
enc_cabin = preprocessing.LabelEncoder()

scl = preprocessing.StandardScaler()

df_train[[age, fare]] = scl.fit_transform(df_train[[age, fare]])

df_train[name] = enc_name.fit_transform(df_train[name].apply(f).apply(g))
df_train[sex] = enc_sex.fit_transform(df_train[sex])
df_train[emb] = enc_emb.fit_transform(df_train[emb])
df_train[cabin] = enc_cabin.fit_transform(df_train[cabin].apply(h))

In [9]:
df_train.loc[df_train[age].isnull(), age] = 0

In [11]:
is_NaN = df_train.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df_train[row_has_NaN]
print(rows_with_NaN)

Empty DataFrame
Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []


In [12]:
X_train = pd.get_dummies(df_train, columns=dummy_cols).drop(dep_vars+indices+["Ticket"], axis=1)

In [13]:
#X_train = df_train[ind_vars]
Y_train = df_train[dep_vars]
X_test = df_test[ind_vars]
#Y_test = df_test[dep_vars]

In [16]:

rf_clf = RandomForestClassifier(n_estimators=100, max_depth=8)
rf_clf.fit(X_train, Y_train.values.ravel())

RandomForestClassifier(max_depth=8)

In [17]:
rf_clf.score(X_train, Y_train)

0.9001122334455668

In [18]:

parameters = {
        "n_estimators": [100],
        "max_depth": [3, 5, 8, 10, 12, 15]
}
rf_clf = RandomForestClassifier()
clf = GridSearchCV(rf_clf, parameters, cv=5)
clf.fit(X_train, Y_train.values.ravel())

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 5, 8, 10, 12, 15],
                         'n_estimators': [100]})

In [19]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.103761,0.001594,0.009026,0.000205,3,100,"{'max_depth': 3, 'n_estimators': 100}",0.826816,0.808989,0.792135,0.752809,0.803371,0.796824,0.024697,6
1,0.108356,0.000732,0.009318,0.000203,5,100,"{'max_depth': 5, 'n_estimators': 100}",0.821229,0.808989,0.825843,0.780899,0.837079,0.814808,0.019197,4
2,0.115636,0.000951,0.009556,0.000155,8,100,"{'max_depth': 8, 'n_estimators': 100}",0.826816,0.808989,0.842697,0.792135,0.848315,0.82379,0.020925,2
3,0.118905,0.001197,0.009815,0.000249,10,100,"{'max_depth': 10, 'n_estimators': 100}",0.821229,0.797753,0.865169,0.792135,0.848315,0.82492,0.028262,1
4,0.121485,0.001266,0.009819,0.000226,12,100,"{'max_depth': 12, 'n_estimators': 100}",0.810056,0.792135,0.853933,0.775281,0.848315,0.815944,0.030809,3
5,0.124732,0.001439,0.010011,0.000262,15,100,"{'max_depth': 15, 'n_estimators': 100}",0.810056,0.797753,0.842697,0.769663,0.848315,0.813697,0.02914,5


In [20]:

parameters = {
        "n_estimators": [100],
        "max_depth": [3, 5, 8, 10, 12, 15]
}
rf_clf = GradientBoostingClassifier()
clf = GridSearchCV(rf_clf, parameters, cv=5)
clf.fit(X_train, Y_train.values.ravel())

pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.092497,0.002892,0.002048,0.0001,3,100,"{'max_depth': 3, 'n_estimators': 100}",0.821229,0.786517,0.859551,0.803371,0.859551,0.826044,0.029479,2
1,0.164937,0.006548,0.002648,0.000339,5,100,"{'max_depth': 5, 'n_estimators': 100}",0.832402,0.814607,0.870787,0.803371,0.848315,0.833896,0.023979,1
2,0.355309,0.019557,0.002893,4.4e-05,8,100,"{'max_depth': 8, 'n_estimators': 100}",0.782123,0.814607,0.848315,0.764045,0.814607,0.804739,0.029193,4
3,0.521693,0.028796,0.00376,0.000961,10,100,"{'max_depth': 10, 'n_estimators': 100}",0.759777,0.820225,0.831461,0.769663,0.842697,0.804764,0.033605,3
4,0.654856,0.04163,0.003272,0.000126,12,100,"{'max_depth': 12, 'n_estimators': 100}",0.782123,0.792135,0.859551,0.758427,0.820225,0.802492,0.034739,5
5,0.65481,0.040738,0.003156,8.2e-05,15,100,"{'max_depth': 15, 'n_estimators': 100}",0.748603,0.780899,0.808989,0.747191,0.831461,0.783429,0.033144,6
