In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score


# Let wide DataFrames display up to 500 columns instead of being truncated.
pd.options.display.max_columns = 500

### Prepare dataset


In [55]:
# Load the training set (a comma is read_csv's default delimiter).
train = pd.read_csv('titanic/train.csv')

def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``big_string``.

    Parameters
    ----------
    big_string : object
        Value to search; it is converted with ``str()`` before searching.
        Missing scalars (``None`` / ``NaN``) never match.
    substrings : iterable of str
        Candidates, checked in order; the first one found is returned.

    Returns
    -------
    str or float
        The matching candidate, or ``np.nan`` when nothing matches.
    """
    # Without this guard a missing value would be searched as the text "nan"
    # (or "None") and could spuriously match short candidates such as "n".
    if pd.api.types.is_scalar(big_string) and pd.isna(big_string):
        return np.nan
    text = str(big_string)
    return next((candidate for candidate in substrings if candidate in text), np.nan)

# Deck labels looked for inside a cabin code, checked in this order.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Unknown']

# Derive the 'Deck' feature from 'Cabin', then discard the raw cabin text.
train['Deck'] = [substrings_in_string(cabin, cabin_list) for cabin in train['Cabin']]
train = train.drop(columns=['Cabin'])

In [56]:
# Drop the columns that are not used as model features.
train = train.drop(columns=['Embarked', 'Ticket', 'Name', 'PassengerId'])

# Replace missing ages with the mean age.
# The result is assigned back to the column: calling an inplace method on
# train['Age'] acts on a temporary selection, which pandas warns about and,
# with Copy-on-Write enabled, does not propagate to the DataFrame at all.
avg = train['Age'].astype("float").mean(axis=0)
train['Age'] = train['Age'].fillna(avg)

# Replace "?" placeholders (if any) with NaN.
train = train.replace("?", np.nan)

# Missing decks become the 'X' category; every other remaining NaN becomes 0.
train['Deck'] = train['Deck'].fillna('X')
train = train.fillna(0)

In [57]:
# One-hot encode the two categorical features: append the indicator columns
# (Sex first, then Deck) and drop the original text columns.
train = pd.concat(
    [train, pd.get_dummies(train['Sex']), pd.get_dummies(train['Deck'])],
    axis=1,
).drop(columns=['Sex', 'Deck'])
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,A,B,C,D,E,F,G,X
0,0,3,22.0,1,0,7.25,0,1,0,0,0,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,0,0,1,0,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,0,0,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1,0,0,0,0,0
4,0,3,35.0,0,0,8.05,0,1,0,0,0,0,0,0,0,1


In [19]:
# Quick check of the storage type of the 'Fare' column.
train.dtypes['Fare']

dtype('float64')

### Test preprocessing

In [77]:
# Load the test set; test_df itself is left untouched for the submission file.
test_df = pd.read_csv('titanic/test.csv')

# Keep the passenger ids aside as a one-column DataFrame.
testIds = test_df.loc[:, ['PassengerId']]

# Feature frame: a copy of test_df without the columns the model does not use.
test = test_df.drop(columns=['Embarked', 'Ticket', 'Name', 'PassengerId'])

def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``big_string``.

    Parameters
    ----------
    big_string : object
        Value to search; it is converted with ``str()`` before searching.
        Missing scalars (``None`` / ``NaN``) never match.
    substrings : iterable of str
        Candidates, checked in order; the first one found is returned.

    Returns
    -------
    str or float
        The matching candidate, or ``np.nan`` when nothing matches.
    """
    # Without this guard a missing value would be searched as the text "nan"
    # (or "None") and could spuriously match short candidates such as "n".
    if pd.api.types.is_scalar(big_string) and pd.isna(big_string):
        return np.nan
    text = str(big_string)
    return next((candidate for candidate in substrings if candidate in text), np.nan)

# Use exactly the deck candidates of the training preprocessing. This list
# used to contain an extra 'T', a deck value the training frame never gets;
# a matching passenger would then have none of the training deck indicators
# set instead of falling into the 'X' (unknown) bucket as in training.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'Unknown']

# Derive the 'Deck' feature from 'Cabin', then discard the raw cabin text.
test['Deck'] = test['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
test = test.drop(columns=['Cabin'])

# Replace missing ages with the median age of the test set.
# NOTE(review): the training frame is imputed with the *mean* of its own Age
# column; consider using one shared statistic for both frames.
# The result is assigned back to the column: calling an inplace method on
# test['Age'] acts on a temporary selection, which pandas warns about and,
# with Copy-on-Write enabled, does not propagate to the DataFrame at all.
avg = test['Age'].astype("float").median(axis=0)
test['Age'] = test['Age'].fillna(avg)

# Replace "?" placeholders (if any) with NaN.
test = test.replace("?", np.nan)

# Missing decks become the 'X' category; every other remaining NaN becomes 0.
test['Deck'] = test['Deck'].fillna('X')
test = test.fillna(0)


# One-hot encode the two categorical features: append the indicator columns
# (Sex first, then Deck) and drop the original text columns.
test = pd.concat(
    [test, pd.get_dummies(test['Sex']), pd.get_dummies(test['Deck'])],
    axis=1,
).drop(columns=['Sex', 'Deck'])
test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,A,B,C,D,E,F,G,X
0,3,34.5,0,0,7.8292,0,1,0,0,0,0,0,0,0,1
1,3,47.0,1,0,7.0,1,0,0,0,0,0,0,0,0,1
2,2,62.0,0,0,9.6875,0,1,0,0,0,0,0,0,0,1
3,3,27.0,0,0,8.6625,0,1,0,0,0,0,0,0,0,1
4,3,22.0,1,1,12.2875,1,0,0,0,0,0,0,0,0,1


In [None]:
# NOTE: this cell repeats cleaning steps that the test-preprocessing cell
# above already performs. It is written defensively so that running the
# notebook top to bottom does not fail on columns that are already gone.

# Remove the unused columns if they are still present.
test = test.drop(columns=['Embarked', 'Ticket', 'Name', 'PassengerId'], errors='ignore')

# Replace any remaining missing ages with the mean age.
avg = test['Age'].astype("float").mean(axis=0)
test['Age'] = test['Age'].fillna(avg)

# Replace "?" placeholders (if any) with NaN.
test = test.replace("?", np.nan)

# Missing decks become 'X' (only while the 'Deck' column still exists);
# every other remaining NaN becomes 0.
if 'Deck' in test.columns:
    test['Deck'] = test['Deck'].fillna('X')
test = test.fillna(0)

In [72]:
# One-hot encoding.
# NOTE: the test-preprocessing cell above already encodes and drops 'Sex'
# and 'Deck'; each column is therefore only encoded if it is still present,
# so this cell no longer raises KeyError when the notebook is run in order.
for categorical in ('Sex', 'Deck'):
    if categorical in test.columns:
        test = pd.concat([test, pd.get_dummies(test[categorical])], axis=1)
        test = test.drop(columns=[categorical])

In [61]:
# Preview the first five rows of the processed test features.
test.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,A,B,C,D,E,F,G,X
0,3,34.5,0,0,7.8292,0,1,0,0,0,0,0,0,0,1
1,3,47.0,1,0,7.0,1,0,0,0,0,0,0,0,0,1
2,2,62.0,0,0,9.6875,0,1,0,0,0,0,0,0,0,1
3,3,27.0,0,0,8.6625,0,1,0,0,0,0,0,0,0,1
4,3,22.0,1,1,12.2875,1,0,0,0,0,0,0,0,0,1


### Set some parameters

In [78]:
ntrain = train.shape[0]  # number of training rows (same as len(train))
ntest = test.shape[0]    # number of test rows (same as len(test))
SEED = 4590              # for reproducibility
NFOLDS = 5               # number of folds for out-of-fold prediction

# Plain K-fold cross validation over contiguous, unshuffled folds.
# No random_state is passed: KFold only accepts one together with
# shuffle=True, and recent scikit-learn releases raise ValueError otherwise.
folds = KFold(n_splits=NFOLDS)
# Shuffled, class-balanced alternative.
str_folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

In [79]:
# Feature columns: every training column except the target.
df_train_columns = [c for c in train.columns if c != 'Survived']

# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same 1-D array.
y_train = train['Survived'].to_numpy()
target = train['Survived']
x_train = train[df_train_columns].to_numpy()  # DataFrame -> 2-D numpy array

# Align the test features to the training columns by name and order.
# reindex() fills an indicator column that never occurs in the test data
# with 0 instead of raising KeyError, and ignores extra test-only columns.
x_test = test.reindex(columns=df_train_columns, fill_value=0).to_numpy()

### Model

In [63]:
# cross-validation helper (classification only)

from collections import Counter
def get_oof(clf, x_train, y_train, x_test, cv=None):
    """Out-of-fold predictions for a classifier, with majority-vote test labels.

    ``clf`` is re-fitted on the training part of every fold. Its predictions
    for the held-out part fill the out-of-fold vector, and its predictions for
    ``x_test`` are collected per fold and combined by majority vote.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : numpy arrays
        Training features (2-D) and labels (1-D), indexed by fold positions.
    x_test : numpy array
        Test features (2-D).
    cv : splitter, optional
        Object exposing ``split(X, y)`` and ``get_n_splits(X, y)``. Defaults
        to the notebook-level ``folds`` splitter.

    Returns
    -------
    (oof_train, predictions)
        ``oof_train`` has one prediction per training row; ``predictions``
        has the most frequent per-fold label for every test row.
    """
    splitter = folds if cv is None else cv
    # Array sizes come from the actual inputs and splitter rather than from
    # notebook globals: a stale fold count would leave uninitialised
    # np.empty rows that take part in the vote.
    n_splits = splitter.get_n_splits(x_train, y_train)
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.empty((n_splits, x_test.shape[0]))

    for i, (trn_idx, val_idx) in enumerate(splitter.split(x_train, y_train)):
        print("Fold {}".format(i + 1))
        clf.fit(x_train[trn_idx], y_train[trn_idx])

        oof_train[val_idx] = clf.predict(x_train[val_idx])
        oof_test[i, :] = clf.predict(x_test)

    # One column per test row, one row per fold: keep the most frequent label.
    votes = pd.DataFrame(oof_test)
    predictions = np.asarray([votes[col].value_counts().idxmax() for col in votes])

    return oof_train, predictions

In [65]:
# cross-validation helper (regression only)

from collections import Counter
def get_oof(clf, x_train, y_train, x_test, cv=None):
    """Out-of-fold predictions for a regressor, with fold-averaged test output.

    NOTE: this definition has the same name as the classification helper
    defined earlier in the notebook and replaces it once this cell has run.

    ``clf`` is re-fitted on the training part of every fold. Its predictions
    for the held-out part fill the out-of-fold vector, and its predictions for
    ``x_test`` are averaged over the folds.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : numpy arrays
        Training features (2-D) and targets (1-D), indexed by fold positions.
    x_test : numpy array
        Test features (2-D).
    cv : splitter, optional
        Object exposing ``split(X, y)`` and ``get_n_splits(X, y)``. Defaults
        to the notebook-level ``folds`` splitter.

    Returns
    -------
    (oof_train, predictions)
        ``oof_train`` has one prediction per training row; ``predictions``
        has the mean of the per-fold predictions for every test row.
    """
    splitter = folds if cv is None else cv
    # Array sizes come from the actual inputs and splitter rather than from
    # notebook globals: a stale fold count would leave uninitialised
    # np.empty rows that take part in the average.
    n_splits = splitter.get_n_splits(x_train, y_train)
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.empty((n_splits, x_test.shape[0]))

    for i, (trn_idx, val_idx) in enumerate(splitter.split(x_train, y_train)):
        print("Fold {}".format(i + 1))
        clf.fit(x_train[trn_idx], y_train[trn_idx])

        oof_train[val_idx] = clf.predict(x_train[val_idx])
        oof_test[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions.
    predictions = oof_test.mean(axis=0)

    return oof_train, predictions

In [80]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees limited to depth 2, seeded for reproducibility.
rf = RandomForestClassifier(n_estimators=100, max_depth=2,
                            random_state=SEED)
oof_rf, predictions_rf = get_oof(rf, x_train, y_train, x_test)
# accuracy_score expects (y_true, y_pred): pass the true labels first.
print("Random Forest: {:.5f}".format(accuracy_score(y_train, oof_rf)))

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Random Forest: 0.78227


In [81]:
# Convert the test predictions to hard 0/1 labels.
# Depending on which get_oof definition is active, predictions_rf holds
# either voted labels or the fold-averaged prediction (a fraction between
# 0 and 1). A bare int cast would floor a fraction such as 0.8 to 0, so the
# values are thresholded at 0.5 instead; 0/1 labels pass through unchanged.
predictions_rf = (np.asarray(predictions_rf) >= 0.5).astype(int)

# Write the submission file: one row per test passenger.
sub_df = pd.DataFrame({"PassengerId": test_df["PassengerId"]})
sub_df["Survived"] = predictions_rf
sub_df.to_csv("submission.csv", index=False)