In [1]:
import os
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression

# Base names of the data directory and of the CSV files expected inside it.
data_name = "data"
train_bn = "train.csv"
test_bn = "test.csv"
results_bn = "results.csv"
full_bn = "full.csv"

# The project root is the parent of the current working directory.  The
# working directory is queried directly: ``__name__`` is a module name
# ("__main__" inside a notebook), not a filesystem path, so routing it
# through ``os.path.abspath`` only reached the same directory by accident.
proj_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(proj_dir, data_name)

# Full paths of the individual files.
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)
results_fn = os.path.join(data_dir, results_bn)
full_fn = os.path.join(data_dir, full_bn)

# Fail early when the data directory is absent (or is not a directory).
if not os.path.isdir(data_dir):
    raise OSError("Data directory not properly setup.")

In [2]:
import sklearn.preprocessing as preprocessing

class LabelEncoderExt(preprocessing.LabelEncoder):
    """Label encoder that maps labels unseen during fitting to a shared class.

    ``fit`` always registers the extra label ``UNK``; ``transform`` replaces
    every value that is not among the fitted classes with ``UNK`` before
    delegating to the parent encoder.
    """

    # Placeholder label standing in for every value not seen by ``fit``.
    UNK = "UNK"

    def __init__(self):

        super().__init__()

    def fit(self, y):
        """Fit the encoder on the 1-D labels ``y`` plus the ``UNK`` label.

        Returns ``self`` so that calls can be chained.
        """
        y = np.asarray(y)
        assert (len(y.shape) == 1), "Require 1D array"
        y = np.concatenate((y, np.array([self.UNK])))
        super().fit(y)
        return self

    def transform(self, y):
        """Encode ``y``, treating every label unknown to the encoder as ``UNK``.

        The input is not modified; lists and other array-likes are accepted.
        """
        y = np.asarray(y)
        # np.where builds a fresh array (wide enough to hold "UNK") instead of
        # overwriting the caller's data.  `assume_unique` is not passed to
        # np.isin because labels normally repeat.
        known = np.isin(y, self.classes_)
        return super().transform(np.where(known, y, self.UNK))

    def fit_transform(self, y):
        """Fit on ``y`` and return its encoding."""
        self.fit(y)
        return self.transform(y)

In [3]:
# Load the three input tables.  A missing file is only reported, so the
# corresponding variable is left unassigned by this cell in that case.
try:
    df_train = pd.read_csv(train_fn)
except OSError:
    print("Training file missing.")

try:
    df_test = pd.read_csv(test_fn)
except OSError:
    print("Test file missing.")

try:
    df_full = pd.read_csv(full_fn, quotechar='"')
except OSError:
    # This handler used to print "Test file missing.", which pointed at the
    # wrong file.
    print("Full file missing.")

In [4]:
# Lift pandas' row/column display limits so the frame below is shown in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Every row of both splits, ordered by the "Name" column.
pd.concat((df_train, df_test)).sort_values(by="Name")

In [5]:
# Attach the columns of the full table to each test row by matching the test
# "Name" against the full table's "name", then keep only the first row per
# "Name" in case the join produced several.
t = (
    df_test
    .join(df_full.set_index("name"), on="Name", how="left")
    .drop_duplicates(subset=["Name"])
)

df_train

In [6]:
# Names of the raw input columns.
pid = "PassengerId"
pclass = "Pclass"
name = "Name"
sex = "Sex"
age = "Age"
sib = "SibSp"
par = "Parch"
ticket = "Ticket"
fare = "Fare"
cabin = "Cabin"
emb = "Embarked"

# Name of a derived column.
family = "family"

# Columns that are one-hot encoded before modelling.
dummy_cols = [pclass, "fam_size", name, cabin, emb]

# Target column(s), identifier column(s), and every remaining training column
# except the ticket.
dep_vars = ["Survived"]
indices = [pid]
ind_vars = [col for col in df_train.columns
            if col not in dep_vars and col not in indices and col != ticket]

In [7]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index.
df_X = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [8]:
def pre_encoding(df):
    """Derive the string and count features used by the later encoding steps.

    ``df`` is modified in place and also returned.  Columns are addressed
    through the notebook's column-name constants (``sex``, ``name``,
    ``ticket``, ``pid``, ``cabin``, ``par``, ``sib``, ``family``).

    Changes made to ``df``:

    * ``sex``: reduced to its first character.
    * ``name``: reduced to the title that follows the comma (e.g. ``"Mr."``).
      Titles carried by less than 1% of the rows, and names in which no
      title can be found, become ``"UNK"``.
    * ``"ticket_count"``: number of rows sharing the row's ticket.
    * ``cabin``: missing values become ``''``; the value is then reduced to
      its first character and merged into coarser groups.
    * ``"num_cab"``: number of whitespace-separated entries in the cabin.
    * ``family``: ``par + sib + 1``.
    * ``"fam_per_tickets"``: ``family / ticket_count``.
    * ``"single"``: 1 when ``family == 1``, else 0.
    * ``"fam_size"``: family size bucket (1-3 -> 0, 4-5 -> 1, 6-7 -> 2,
      8 and above -> 3).

    Raises:
        IndexError: if a name contains no comma.
        KeyError: if a cabin starts with a letter outside the known groups.
    """
    unk = "UNK"
    thresh = 0.01  # minimum share of rows a title needs to keep its own label
    # Raw string: "\w", "\s" and "\." are regex escapes, not string escapes.
    title_re = re.compile(r"\w+\s?\w*(\.)")
    deck_groups = {'': '', 'A': 'A', 'B': 'B', 'C': 'B',
                   'D': 'D', 'E': 'D', 'F': 'F', 'G': 'F', 'T': 'A'}

    def first_char(text):
        return text[0] if len(text) > 0 else ''

    def title_of(full_name):
        found = title_re.match(full_name.split(',')[1].strip())
        # A name without a recognisable title joins the rare-title bucket
        # instead of failing on ``None.group()``.
        return found.group() if found else unk

    def size_bucket(members):
        # Same buckets as the former 1..11 lookup table, open-ended at the top
        # so that larger families do not raise KeyError.
        if members <= 3:
            return 0
        if members <= 5:
            return 1
        if members <= 7:
            return 2
        return 3

    df[sex] = df[sex].apply(first_char)

    df[name] = df[name].apply(title_of)
    freq = df[name].value_counts(normalize=True)
    df[name] = df[name].where(df[name].map(freq) >= thresh, unk)

    # Rows per ticket, broadcast back onto every row of that ticket.
    df['ticket_count'] = df.groupby(ticket)[pid].transform("count")

    # After the fill no cabin is null, so the whole column is processed.
    df[cabin] = df[cabin].fillna('')
    df["num_cab"] = df[cabin].apply(lambda entry: len(entry.split()))
    df[cabin] = df[cabin].apply(first_char).apply(lambda deck: deck_groups[deck])

    df[family] = df[par] + df[sib] + 1
    df["fam_per_tickets"] = df[family] / df["ticket_count"]
    df["single"] = (df[family] == 1).astype(int)
    df["fam_size"] = df[family].apply(size_bucket)

    return df


def build_encoders(df):
    """Fit the label encoders and standard scalers on ``df``.

    Returns a pair ``(enc_dict, scl)``: ``enc_dict`` maps the name, sex,
    embarkation and cabin columns to fitted label encoders (the name column
    gets a ``LabelEncoderExt``), and ``scl`` maps the age and fare columns to
    fitted ``StandardScaler`` objects.  Missing values are dropped before
    fitting everything except the name encoder.
    """
    enc_dict = {name: LabelEncoderExt()}
    enc_dict[name].fit(df[name])
    for column in (sex, emb, cabin):
        encoder = preprocessing.LabelEncoder()
        encoder.fit(df[column].dropna())
        enc_dict[column] = encoder

    scl = {}
    for column in (age, fare):
        scaler = preprocessing.StandardScaler()
        # Scalers expect a 2-D array, hence the single-column frame.
        scaler.fit(df[[column]].dropna().values)
        scl[column] = scaler

    return enc_dict, scl


def scale(df, scl):
    """Replace the non-missing age and fare values by their scaled versions.

    ``scl`` maps each of the two column names to an object whose
    ``transform`` is applied to the column's non-null values.  Missing values
    stay missing.  ``df`` is modified in place and returned.
    """
    for column in (age, fare):
        present = df[[column]].dropna()
        df.loc[present.index, column] = scl[column].transform(present.values)
    return df


def naive_bayes_data_fill(df, enc_dict):
    """Label-encode the categorical columns and impute missing embarkation.

    The embarkation, sex, cabin and name columns are replaced by the integer
    codes from the encoders in ``enc_dict`` (missing embarkation/cabin values
    are left missing).  Rows lacking a value in any of the class, sex, parch
    or embarkation columns then get their embarkation code predicted by a
    multinomial naive Bayes model trained on the complete rows.

    ``df`` is modified in place and returned.
    """
    df.loc[df[emb].dropna().index, emb] = (
            enc_dict[emb].transform(df[emb].dropna().values))
    # Whole columns are reassigned rather than written through ``.loc[:, col]``
    # so the codes get an integer dtype instead of being stored in the
    # existing object column.
    df[sex] = enc_dict[sex].transform(df[sex].values)
    df.loc[df[cabin].dropna().index, cabin] = (
            enc_dict[cabin].transform(df[cabin].dropna().values))
    df[name] = enc_dict[name].transform(df[name].values)

    features = [pclass, sex, par]
    complete = df[features + [emb]].dropna()
    is_complete = df.index.isin(complete.index)

    if not is_complete.all():
        clf = MultinomialNB()
        clf.fit(complete[features].values.astype(int),
                complete[emb].values.astype(int))
        # Predict from the same integer-array representation used for fitting.
        to_fill = df.loc[~is_complete, features].values.astype(int)
        df.loc[~is_complete, emb] = clf.predict(to_fill)

    return df


def mean_data_fill(df):
    """Fill missing fares and ages with the mean of a matching group.

    * A missing fare becomes the mean fare of the rows sharing its class and
      embarkation values.
    * A missing age becomes the mean age of the rows sharing its name code
      and ``"fam_size"`` value.

    When the group has no known value at all (or a grouping key is itself
    missing) the gap is filled with 0.  ``df`` is modified in place and
    returned.
    """

    def fill_from_group_mean(column, group):
        # Only `column` is averaged: taking the mean of the whole frame also
        # hits non-numeric columns, which raises TypeError on pandas >= 2.0.
        # ``transform`` returns the group mean aligned with every row.
        group_mean = df.groupby(group)[column].transform("mean")
        df[column] = df[column].fillna(group_mean).fillna(0)

    fill_from_group_mean(fare, [pclass, emb])
    fill_from_group_mean(age, [name, "fam_size"])

    return df

def preprocess(df, enc_dict=None, scl=None):
    """Run the whole feature pipeline on ``df``.

    Steps, in order: ``pre_encoding``, ``scale``, ``naive_bayes_data_fill``,
    ``mean_data_fill``, creation of the ``"child"`` flag, and removal of the
    ticket, parch, sibsp and family columns.

    Args:
        df: frame to process; the pipeline steps modify it in place.
        enc_dict: fitted encoders as produced by ``build_encoders``; built
            from ``df`` when omitted.
        scl: fitted scalers as produced by ``build_encoders``; built from
            ``df`` when omitted.

    Returns:
        ``(df, enc_dict, scl)`` - the processed frame plus the encoders and
        scalers that were used.
    """
    df = pre_encoding(df)
    # Build whichever of the two is missing; an object passed in is kept.
    if enc_dict is None or scl is None:
        fitted_enc, fitted_scl = build_encoders(df)
        enc_dict = fitted_enc if enc_dict is None else enc_dict
        scl = fitted_scl if scl is None else scl
    df = scale(df, scl)
    df = naive_bayes_data_fill(df, enc_dict)

    df = mean_data_fill(df)

    # The age column is already scaled here, so the cutoff is scaled as well.
    # It is computed once instead of once per row.
    min_age = 10
    child_cutoff = scl[age].transform([[min_age]])[0, 0]
    df["child"] = (df[age] <= child_cutoff).astype(int)

    df = df.drop([ticket, par, sib, family], axis=1)

    return df, enc_dict, scl

In [9]:
# Run the preprocessing pipeline on the combined frame; the encoders and
# scalers it built are returned alongside the processed data.
# NOTE(review): df_X is rebound to its processed form, so re-running this cell
# without first re-running the cell that builds df_X feeds processed data back in.
df_X, enc_dict, scl= preprocess(df_X)

In [10]:
# Split the processed frame back into its two parts: the first
# len(df_train) rows are the training rows, the remainder the test rows.
df_train, df_test = df_X.iloc[:len(df_train)], df_X.iloc[len(df_train):]

import matplotlib.pyplot as plt

tmp = df_train[[fare, cabin]]

# Scatter of the fare column against the cabin column for the training rows.
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(tmp[cabin], tmp[fare])

In [11]:
# One-hot encode the categorical columns; the training features drop the
# target and identifier columns.
X_train = pd.get_dummies(df_train, columns=dummy_cols).drop(columns=dep_vars + indices)

# Give the test features exactly the training columns, in the same order:
# columns missing from the test encoding are added as zeros and any extra
# column is discarded.
X_test = (
    pd.get_dummies(df_test, columns=dummy_cols)
    .drop(columns=indices)
    .reindex(columns=X_train.columns, fill_value=0)
)

Y_train = df_train[dep_vars]

# Test labels come from the joined full table, whose target column name is
# the lower-cased training target name.
Y_test = t[[dep_vars[0].lower()]]

In [12]:
# Inspect the final feature columns fed to the model.
X_train.columns

Index(['Sex', 'Age', 'Fare', 'ticket_count', 'num_cab', 'fam_per_tickets',
       'single', 'child', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'fam_size_0',
       'fam_size_1', 'fam_size_2', 'fam_size_3', 'Name_0', 'Name_1', 'Name_2',
       'Name_3', 'Name_4', 'Cabin_0', 'Cabin_1', 'Cabin_2', 'Cabin_3',
       'Cabin_4', 'Embarked_0', 'Embarked_1', 'Embarked_2'],
      dtype='object')

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Grid and base estimator for the optional depth search (commented out below).
parameters = {
        "n_estimators": [100],
        #"criterion": ["gini", "entropy"],
        "max_depth": [3, 4, 5, 6, 7, 8, 9]
}
rf_clf = RandomForestClassifier()

# Alternatives that were tried:
#clf = GradientBoostingClassifier(learning_rate=0.11)
#clf = GridSearchCV(rf_clf, parameters, cv=5)
clf = RandomForestClassifier(n_estimators=700, max_depth=5,
                            max_leaf_nodes=32,
                            #criterion="gini",
                            min_samples_split=4, min_samples_leaf=5,
                            # Fixed seed so the score and the exported
                            # predictions are the same on every run.
                            random_state=42)
clf.fit(X_train, Y_train.values.ravel())

#pd.DataFrame(clf.cv_results_)

# Cast to int so the exported labels are written as 0/1 even when the
# training target is stored as floats.
# NOTE(review): assumes the target values are integral - confirm.
pred = clf.predict(X_test).astype(int)
results = pd.DataFrame({indices[0]: df_test[indices[0]].values, dep_vars[0]: pred})
results.to_csv(results_fn, index=False)

y_true = Y_test.values.ravel()
print(accuracy_score(y_true, pred))

# Kept as the cell's final expression so that the table is actually
# displayed; only the last expression of a cell is rendered.
pd.DataFrame(confusion_matrix(y_true, pred))