In [1]:
import os
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

data_name = "data"
train_bn = "train.csv"
test_bn = "test.csv"
results_bn = "results.csv"

full_bn = "full.csv"

proj_dir = os.path.abspath(
        os.path.join(os.path.abspath(__name__), os.pardir, os.pardir))
data_dir = os.path.join(proj_dir, data_name)
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)
results_fn = os.path.join(data_dir, results_bn)

full_fn = os.path.join(data_dir, full_bn)

if not os.path.exists(data_dir):
    raise OSError("Data directory not properly setup.")

In [2]:
try: 
    df_train = pd.read_csv(train_fn)
except OSError as e:
    print("Training file missing.")
try:
    df_test = pd.read_csv(test_fn)
except OSError as e:
    print("Test file missing.")
    
try:
    df_full = pd.read_csv(full_fn)
except OSError as e:
    print("Test file missing.")
print(df_full.shape)

(1309, 14)


In [3]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [5]:
t = df_test.join(df_full.set_index("name"), how="left", on="Name").drop_duplicates(subset=["Name"])

In [6]:
name = "Name"
sex = "Sex"
emb = "Embarked"
cabin = "Cabin"
age = "Age"
fare = "Fare"
ticket = "Ticket"
sib = "SibSp"
par = "Parch"
pclass = "Pclass"

dummy_cols = [pclass, name, sex, cabin, emb]
dep_vars = ["Survived"]
indices = ["PassengerId"]
ind_vars = [x for x in df_train.columns if x not in (dep_vars+indices+[ticket])]

In [7]:
import sklearn.preprocessing as preprocessing

class LabelEncoderExt(preprocessing.LabelEncoder):
    
    UNK = "UNK"
    
    def __init__(self):
        
        super().__init__()
        
    def fit(self, y):
        
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        assert (len(y.shape) == 1), "Require 1D array"
        y = np.concatenate((y, np.array([self.UNK])))
        super().fit(y)
        
    def transform(self, y):
        
        y[~np.isin(y, self.classes_, assume_unique=True)] = self.UNK
        return super().transform(y)
    
    def fit_transform(self, y):
        
        self.fit(y)
        return self.transform(y)

In [8]:
def pre_encoding(df):
    
    reg_ex = "\w+\s?\w*(\.)"
    reg = re.compile(reg_ex)
    f = lambda x: x.split(',')[1].strip()
    g = lambda x: reg.match(x).group()
    h = lambda x: x[0] if len(x) > 0 else ''
    thresh = 0.01
    unk = "UNK"
    
    df[[cabin]] = df[cabin].fillna('')    
    
    df[name] = df[name].apply(f).apply(g)
    freq = df[name].value_counts(normalize=True)
    k = lambda x: x if freq[x] >= thresh else unk
    df[name] = df[name].apply(k)
    
    df[cabin] = df[cabin].apply(h)
    df[sex] = df[sex].apply(h)
    
    df[par] = df[par] + df[sib] + 1
    df = df.drop(sib, axis=1)
    
    return df


def build_encoders(df):
   
    enc_name = LabelEncoderExt()
    enc_sex = preprocessing.LabelEncoder()
    enc_emb = preprocessing.LabelEncoder()
    enc_cabin = preprocessing.LabelEncoder()

    enc_name.fit(df[name])
    enc_sex.fit(df[sex].dropna())
    enc_emb.fit(df[emb].dropna())
    enc_cabin.fit(df[cabin])
    enc_dict = {name: enc_name, sex: enc_sex, emb: enc_emb, cabin: enc_cabin}
 
    scl = preprocessing.StandardScaler()
    scl.fit_transform(df[[age, fare]].dropna().values)
        
    return enc_dict, scl

def scale(df, scl):
    
    df.loc[df[[age, fare]].dropna().index, [age, fare]] = scl.transform(df[[age, fare]].dropna().values)
    return df

def naive_bayes_data_fill(df, enc_dict):

    df.loc[df[emb].dropna().index, emb] = enc_dict[emb].transform(df[emb].dropna().values)
    df.loc[:, sex] = enc_dict[sex].transform(df[sex].values)
    df.loc[:, cabin] = enc_dict[cabin].transform(df[cabin].values)
    df.loc[:, name] = enc_dict[name].transform(df[name].values)
    
    tmp = df[[pclass, sex, par, emb]].dropna()
    index = df.index.isin(tmp.index)
    clf = MultinomialNB()
    X = tmp[[pclass, sex, par]].values.astype(int)
    Y = tmp[emb].values.astype(int)
    clf.fit(X, Y)
    tmp2 = df.loc[~index, [pclass, sex, par]]
    if len(tmp2) > 0:
        df.loc[~index, emb] = clf.predict(tmp2)
    
    return df, clf


def mean_data_fill(df):
    
    group = [par, pclass, sex, cabin]
    tmp1 = df.groupby(group).mean()[[age, fare]]    
    nan_ages = df[age].isnull() 
    nan_fares = df[fare].isnull() 
     
    tmp2 = df.loc[nan_ages][group]
    ind = pd.MultiIndex.from_arrays(tmp2.values.T, names=tmp1.index.names)
    df.loc[nan_ages, age] = tmp1.loc[ind, age].fillna(0).values
    
    tmp3 = df.loc[nan_fares][group]
    ind = pd.MultiIndex.from_arrays(tmp3.values.T, names=tmp1.index.names)
    df.loc[nan_fares, fare] = tmp1.loc[ind, fare].fillna(0).values 
    
    return df

def preprocess(df, enc_dict=None, scl=None, clf=None):
    
    df = pre_encoding(df)
    if enc_dict is None:
        enc_dict, scl = build_encoders(df)
    df = scale(df, scl)
    df, clf = naive_bayes_data_fill(df, enc_dict)
    df = mean_data_fill(df)
    return df, enc_dict, scl, clf

df_train = pre_encoding(df_train)
enc_dict, scl = build_encoders(df_train)
df_train = naive_bayes_data_fill(df_train, enc_dict)
df_train = mean_data_fill(df_train)

In [9]:
df_train, enc_dict, scl, clf = preprocess(df_train)
df_test, _, _, _ = preprocess(df_test, enc_dict, scl, clf)

In [11]:
X_train = pd.get_dummies(df_train, columns=dummy_cols).drop(dep_vars+indices+["Ticket"], axis=1)
X_test = pd.get_dummies(df_test, columns=dummy_cols).drop(indices+["Ticket"], axis=1)

X_test = X_test.join(pd.DataFrame({x: 0 for x in X_train.columns if x not in X_test.columns}, index=X_test.index))
X_test = X_test[X_train.columns]

Y_train = df_train[dep_vars]

Y_test = t[[dep_vars[0].lower()]].fillna(0)

In [13]:
parameters = {
        "n_estimators": [100],
        "max_depth": [3, 4, 5, 6, 7, 8, 10, 12]
}
rf_clf = RandomForestClassifier()
clf = GridSearchCV(rf_clf, parameters, cv=5)
clf.fit(X_train, Y_train.values.ravel())

pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.108086,0.000971,0.00927,0.000299,3,100,"{'max_depth': 3, 'n_estimators': 100}",0.837989,0.820225,0.814607,0.775281,0.837079,0.817036,0.022805,6
1,0.107379,0.002196,0.009289,0.000352,4,100,"{'max_depth': 4, 'n_estimators': 100}",0.821229,0.825843,0.820225,0.803371,0.842697,0.822673,0.012583,5
2,0.109916,0.00287,0.009295,0.000346,5,100,"{'max_depth': 5, 'n_estimators': 100}",0.843575,0.825843,0.825843,0.803371,0.853933,0.830513,0.017317,1
3,0.119208,0.002504,0.010537,0.000949,6,100,"{'max_depth': 6, 'n_estimators': 100}",0.843575,0.814607,0.837079,0.803371,0.853933,0.830513,0.018719,1
4,0.119655,0.004883,0.010074,0.000495,7,100,"{'max_depth': 7, 'n_estimators': 100}",0.815642,0.808989,0.842697,0.803371,0.842697,0.822679,0.0168,4
5,0.127147,0.006806,0.010462,0.00106,8,100,"{'max_depth': 8, 'n_estimators': 100}",0.837989,0.797753,0.837079,0.808989,0.853933,0.827148,0.020627,3
6,0.118504,0.001085,0.009685,9.1e-05,10,100,"{'max_depth': 10, 'n_estimators': 100}",0.804469,0.786517,0.848315,0.792135,0.842697,0.814826,0.025775,7
7,0.126369,0.004059,0.010425,0.000392,12,100,"{'max_depth': 12, 'n_estimators': 100}",0.804469,0.792135,0.842697,0.792135,0.842697,0.814826,0.023197,8


In [14]:
pred = clf.predict(X_test)
results = pd.DataFrame({indices[0]: df_test[indices[0]].values, dep_vars[0]: pred})
results.to_csv(results_fn, index=False)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, pred))
pd.DataFrame(confusion_matrix(Y_test, pred))

0.7559808612440191


Unnamed: 0,0,1
0,217,56
1,46,99
