In [1]:
import os
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

data_name = "data"
train_bn = "train.csv"
test_bn = "test.csv"
results_bn = "results.csv"

full_bn = "full.csv"

proj_dir = os.path.abspath(
        os.path.join(os.path.abspath(__name__), os.pardir, os.pardir))
data_dir = os.path.join(proj_dir, data_name)
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)
results_fn = os.path.join(data_dir, results_bn)

full_fn = os.path.join(data_dir, full_bn)

if not os.path.exists(data_dir):
    raise OSError("Data directory not properly setup.")

In [2]:
try: 
    df_train = pd.read_csv(train_fn)
except OSError as e:
    print("Training file missing.")
try:
    df_test = pd.read_csv(test_fn)
except OSError as e:
    print("Test file missing.")
    
try:
    df_full = pd.read_csv(full_fn)
except OSError as e:
    print("Test file missing.")
print(df_full.shape)

(1309, 14)


In [3]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [5]:
#df_train['Ticket'].value_counts()

In [6]:
t = df_test.join(df_full.set_index("name"), how="left", on="Name").drop_duplicates(subset=["Name"])

In [7]:
#t[t['survived'].isnull()]

In [8]:
name = "Name"
sex = "Sex"
emb = "Embarked"
cabin = "Cabin"
age = "Age"
fare = "Fare"
ticket = "Ticket"
sib = "SibSp"
par = "Parch"

dummy_cols = ["Pclass", name, sex, cabin, emb]
dep_vars = ["Survived"]
indices = ["PassengerId"]
ind_vars = [x for x in df_train.columns if x not in (dep_vars+indices+[ticket])]

In [9]:
import sklearn.preprocessing as preprocessing

class LabelEncoderExt(preprocessing.LabelEncoder):
    
    UNK = "UNK"
    
    def __init__(self):
        
        super().__init__()
        
    def fit(self, y):
        
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        assert (len(y.shape) == 1), "Require 1D array"
        y = np.concatenate((y, np.array([self.UNK])))
        super().fit(y)
        
    def transform(self, y):
        
        y[~np.isin(y, self.classes_, assume_unique=True)] = self.UNK
        return super().transform(y)
    
    def fit_transform(self, y):
        
        self.fit(y)
        return self.transform(y)

In [10]:
def preprocess(df, enc_dict=None, scl=None):

    reg_ex = "\w+\s?\w*(\.)"
    reg = re.compile(reg_ex)
    f = lambda x: x.split(',')[1].strip()
    g = lambda x: reg.match(x).group()
    h = lambda x: x[0] if len(x) > 0 else ''
    thresh = 0.01
    unk = "UNK"
    
    df[[emb]] = df[emb].fillna('')
    df[[cabin]] = df[cabin].fillna('')
    
    if enc_dict is None:
        enc_name = LabelEncoderExt()
        enc_sex = preprocessing.LabelEncoder()
        enc_emb = preprocessing.LabelEncoder()
        enc_cabin = preprocessing.LabelEncoder()

        tmp = df[name].apply(f).apply(g)
        freq = tmp.value_counts(normalize=True)
        k = lambda x: x if freq[x] >= thresh else unk
        df[name] = enc_name.fit_transform(tmp.apply(k))
        df[sex] = enc_sex.fit_transform(df[sex])
        df[emb] = enc_emb.fit_transform(df[emb])
        df[cabin] = enc_cabin.fit_transform(df[cabin].apply(h))
        enc_dict = {name: enc_name, sex: enc_sex, emb: enc_emb, cabin: enc_cabin}
    else:
        df[name] = enc_dict[name].transform(df[name].apply(f).apply(g))
        df[sex] = enc_dict[sex].transform(df[sex])
        df[emb] = enc_dict[emb].transform(df[emb])
        df[cabin] = enc_dict[cabin].transform(df[cabin].apply(h))
        
    if scl is None:
        scl = preprocessing.StandardScaler()
        df[[age, fare]] = scl.fit_transform(df[[age, fare]])
    else:
        df[[age, fare]] = scl.transform(df[[age, fare]])
        
    df[par] = df[par] + df[sib]
    df = df.drop(sib, axis=1)
        
    #print(df['Cabin'])
    #print(enc_dict[cabin].classes_)
    
    #df = df.fillna(0)
    
    return df, enc_dict, scl

In [11]:
df_train, enc_dict, scl = preprocess(df_train)
df_test, _, _ = preprocess(df_test, enc_dict, scl)



In [17]:
enc_dict[name].classes_, enc_dict[cabin].classes_, enc_dict[emb].classes_, enc_dict[cabin].classes_

(array(['Master.', 'Miss.', 'Mr.', 'Mrs.', 'UNK'], dtype=object),
 array(['', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'], dtype=object),
 array(['', 'C', 'Q', 'S'], dtype=object),
 array(['', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'], dtype=object))

In [12]:
X_train = pd.get_dummies(df_train, columns=dummy_cols).drop(dep_vars+indices+["Ticket"], axis=1)
X_test = pd.get_dummies(df_test, columns=dummy_cols).drop(indices+["Ticket"], axis=1)

X_test = X_test.join(pd.DataFrame({x: 0 for x in X_train.columns if x not in X_test.columns}, index=X_test.index))
X_test = X_test[X_train.columns]

Y_train = df_train[dep_vars]

Y_test = t[[dep_vars[0].lower()]].fillna(0)

In [13]:
parameters = {
        "n_estimators": [100],
        "max_depth": [3, 4, 5, 6, 7, 8, 10, 12]
}
rf_clf = RandomForestClassifier()
clf = GridSearchCV(rf_clf, parameters, cv=5)
clf.fit(X_train, Y_train.values.ravel())

pd.DataFrame(clf.cv_results_)

Traceback (most recent call last):
  File "/home/jimmy/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jimmy/.local/lib/python3.6/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    accept_sparse="csc", dtype=DTYPE)
  File "/home/jimmy/.local/lib/python3.6/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/jimmy/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/home/jimmy/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 802, in check_X_y
    estimator=estimator)
  File "/home/jimmy/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/home/jimmy/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 645, in check_array
    allo

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
pred = clf.predict(X_test)
results = pd.DataFrame({indices[0]: df_test[indices[0]].values, dep_vars[0]: pred})
results.to_csv(results_fn, index=False)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, pred))
pd.DataFrame(confusion_matrix(Y_test, pred))


In [None]:
parameters = {
        "n_estimators": [100],
        "max_depth": [3, 5, 8, 10, 12, 15]
}
rf_clf = GradientBoostingClassifier()
clf = GridSearchCV(rf_clf, parameters, cv=5)
clf.fit(X_train, Y_train.values.ravel())

pd.DataFrame(clf.cv_results_)

In [None]:
pred = clf.predict(X_test)
results = pd.DataFrame({indices[0]: df_test[indices[0]].values, dep_vars[0]: pred})
results.to_csv(results_fn, index=False)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, pred))
pd.DataFrame(confusion_matrix(Y_test, pred))