In [1]:
import os
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVR

data_name = "data"
train_bn = "train.csv"
test_bn = "test.csv"
results_bn = "results.csv"

full_bn = "full.csv"

proj_dir = os.path.abspath(
        os.path.join(os.path.abspath(__name__), os.pardir, os.pardir))
data_dir = os.path.join(proj_dir, data_name)
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)
results_fn = os.path.join(data_dir, results_bn)

full_fn = os.path.join(data_dir, full_bn)

if not os.path.exists(data_dir):
    raise OSError("Data directory not properly setup.")

In [2]:
import sklearn.preprocessing as preprocessing

class LabelEncoderExt(preprocessing.LabelEncoder):
    
    UNK = "UNK"
    
    def __init__(self):
        
        super().__init__()
        
    def fit(self, y):
        
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        assert (len(y.shape) == 1), "Require 1D array"
        y = np.concatenate((y, np.array([self.UNK])))
        super().fit(y)
        
    def transform(self, y):
        
        y[~np.isin(y, self.classes_, assume_unique=True)] = self.UNK
        return super().transform(y)
    
    def fit_transform(self, y):
        
        self.fit(y)
        return self.transform(y)

In [3]:
try: 
    df_train = pd.read_csv(train_fn)
except OSError as e:
    print("Training file missing.")
try:
    df_test = pd.read_csv(test_fn)
except OSError as e:
    print("Test file missing.")
    
try:
    df_full = pd.read_csv(full_fn, quotechar='"')
except OSError as e:
    print("Test file missing.")

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

pd.concat([df_train, df_test]).sort_values("Name")

In [5]:
t = df_test.join(df_full.set_index("name"), how="left", on="Name").drop_duplicates(subset=["Name"])

df_train

In [6]:
name = "Name"
sex = "Sex"
emb = "Embarked"
cabin = "Cabin"
age = "Age"
fare = "Fare"
ticket = "Ticket"
sib = "SibSp"
par = "Parch"
pclass = "Pclass"

dummy_cols = [
        pclass, 
        #ticket,
        name, cabin, emb]
pid = "PassengerId"
dep_vars = ["Survived"]
indices = [pid]
ind_vars = [x for x in df_train.columns if x not in (dep_vars+indices+[ticket])]

In [7]:
df_X = pd.concat([df_train, df_test])

In [8]:
def pre_encoding(df):
    
    reg_ex = "\w+\s?\w*(\.)"
    reg = re.compile(reg_ex)
    f = lambda x: x.split(',')[1].strip()
    g = lambda x: reg.match(x).group()
    h = lambda x: x[0] if len(x) > 0 else ''
    thresh = 0.01
    unk = "UNK"
    
    tmp = df.groupby(ticket).count()
    df['ticket_count'] = df[ticket].apply(lambda x: tmp.at[x, pid])
    
    df[[cabin]] = df[cabin].fillna('')    
    
    numb_cab = lambda x: len(x.split())
    df["num_cab"] = df[cabin].apply(numb_cab)
    
        # extra
    f2 = lambda x: {'': '', "A": "A", "B": "B", "C": "C", "D": "D", "E": "D", "F": "D", "G": "D", "T": "D"}[x] 
    
    df[name] = df[name].apply(f).apply(g)
    freq = df[name].value_counts(normalize=True)
    k = lambda x: x if freq[x] >= thresh else unk
    df[name] = df[name].apply(k)
    
    df[cabin] = df[cabin].apply(h).apply(f2)
    
    df[sex] = df[sex].apply(h)
    
    #df[par] = df[par] + df[sib] + 1
    #df = df.drop(sib, axis=1)
    
    #df[fare] = df[fare]/df[par]
    #df[fare] = np.divide(df[fare].values, df["num_cab"].values, where=df["num_cab"]!=0)
    #df[fare] = np.divide(df[fare].values, df[par], where=df["num_cab"]==0)
    # extra

    ticket_prefix = lambda x: 1 if len(x.split()) > 1 else 0
    ticket_prefix = lambda x: len(x.split()) - 1
    df[ticket] = df[ticket].apply(ticket_prefix) 
    
    df["single"] = df[par].apply(lambda x: 1 if x==0 else 0)

    return df


def build_encoders(df):
   
    enc_name = LabelEncoderExt()
    enc_sex = preprocessing.LabelEncoder()
    enc_emb = preprocessing.LabelEncoder()
    enc_cabin = preprocessing.LabelEncoder()

    enc_name.fit(df[name])
    enc_sex.fit(df[sex].dropna())
    enc_emb.fit(df[emb].dropna())
    enc_cabin.fit(df[cabin])
    enc_dict = {name: enc_name, sex: enc_sex, emb: enc_emb, cabin: enc_cabin}
 
    scl_age = preprocessing.StandardScaler()
    scl_age.fit(df[[age]].dropna().values)
    scl_fare = preprocessing.StandardScaler()
    scl_fare.fit(df[[fare]].dropna().values)
    scl = {age: scl_age, fare: scl_fare}
        
    return enc_dict, scl


def scale(df, scl):
    
    df.loc[df[[age]].dropna().index, age] = scl[age].transform(df[age].dropna().values)
    df.loc[df[[fare]].dropna().index, fare] = scl.transform(df[fare].dropna().values)    
    return df


def naive_bayes_data_fill(df, enc_dict, clf=None):

    df.loc[df[emb].dropna().index, emb] = enc_dict[emb].transform(df[emb].dropna().values)
    df.loc[:, sex] = enc_dict[sex].transform(df[sex].values)
    df.loc[:, cabin] = enc_dict[cabin].transform(df[cabin].values)
    df.loc[:, name] = enc_dict[name].transform(df[name].values)
    
    tmp = df[[pclass, sex, par, emb]].dropna()
    index = df.index.isin(tmp.index)
    
    X = tmp[[pclass, sex, par]].values.astype(int)
    Y = tmp[emb].values.astype(int)
    tmp2 = df.loc[~index, [pclass, sex, par]]
    if len(tmp2) > 0:
        if clf is None:
            clf = MultinomialNB()
        clf.fit(X, Y)
        df.loc[~index, emb] = clf.predict(tmp2)
    
    return df, clf


def mean_data_fill(df):
    
    from sklearn.linear_model import LinearRegression
    
    group = [pclass, "num_cab", emb, par, "single"]
    nan_fares = df[fare].isnull()
    
    tmp1 = df[group+[fare]].dropna()
    clf = SVR()
    reg = clf.fit(tmp1[group].values, tmp1[fare].values)
    df.loc[nan_fares, fare] = reg.predict(df.loc[nan_fares, group].values)
     
    #tmp1 = df.groupby(group).mean()[[fare]]
    #tmp3 = df.loc[nan_fares][group]
    #ind = pd.MultiIndex.from_arrays(tmp3.values.T, names=tmp1.index.names)
    #df.loc[nan_fares, fare] = tmp1.loc[ind, fare].fillna(0).values
    
    
    group2 = [pclass, name, par, emb]
    nan_ages = df[age].isnull()
    
    tmp1 = df[group2+[age]].dropna()
    reg = clf.fit(tmp1[group2].values, tmp1[age].values)
    df.loc[nan_ages, age] = reg.predict(df.loc[nan_ages, group2].values)
    
    #tmp1 = df.groupby(group2).mean()[[age]]
    #tmp2 = df.loc[nan_ages][group2]
    #ind = pd.MultiIndex.from_arrays(tmp2.values.T, names=tmp1.index.names)
    #df.loc[nan_ages, age] = tmp1.loc[ind, age].fillna(0).values
    
    return df.iloc[:df1.shape[0]], df.iloc[df1.shape[0]:]

def preprocess(df, enc_dict=None, scl=None, clf=None):
    
    df = pre_encoding(df)
    if enc_dict is None:
        enc_dict, scl = build_encoders(df)
    df = scale(df, scl)
    df, clf = naive_bayes_data_fill(df, enc_dict, clf=clf)
    
    #df = mean_data_fill(df)
    

    return df, enc_dict, scl, clf

In [9]:
df_X, enc_dict, scl, clf = preprocess(df_X)

ValueError: shape mismatch: value array of shape (1045,) could not be broadcast to indexing result of shape (1711,)

In [None]:
def extra(df):

    min_age = 14
    df["child"] = df[age].apply(lambda x: 1 if x <= scl.transform([[min_age, 0]])[0, 0] else 0)
    df["child_women"] = df["child"] + (1 - df["Sex"])
    df["child_women"] = df[["child", "Sex"]].max(axis=1).values
    df = df.drop(["child", "Sex"], axis=1)
    
    return df

In [None]:
df_train = extra(df_train)
df_test = extra(df_test)

import matplotlib.pyplot as plt

tmp = df_train[[fare, cabin]]

fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot()
ax.scatter(tmp[cabin], tmp[fare])

In [None]:
X_train = pd.get_dummies(df_train, columns=dummy_cols).drop(dep_vars+indices, axis=1)
X_test = pd.get_dummies(df_test, columns=dummy_cols).drop(indices, axis=1)

X_test = X_test.join(pd.DataFrame({x: 0 for x in X_train.columns if x not in X_test.columns}, index=X_test.index))
X_test = X_test[X_train.columns]

Y_train = df_train[dep_vars]

Y_test = t[[dep_vars[0].lower()]]

In [None]:
parameters = {
        "n_estimators": [100],
        #"criterion": ["gini", "entropy"],
        "max_depth": [3, 4, 5, 6, 7, 8, 9]
}
rf_clf = RandomForestClassifier()
clf = GridSearchCV(rf_clf, parameters, cv=5)
clf = RandomForestClassifier(n_estimators=100, max_depth=5)
clf.fit(X_train, Y_train.values.ravel())

#pd.DataFrame(clf.cv_results_)

pred = clf.predict(X_test)
results = pd.DataFrame({indices[0]: df_test[indices[0]].values, dep_vars[0]: pred})

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, pred))
pd.DataFrame(confusion_matrix(Y_test, pred))

results.to_csv(results_fn, index=False)