# Data Preprocessing

In [1]:
import pandas as pd
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

### Original data format

In [3]:
df_train.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Graham, Mr. George Edward",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,,1.0,0.0,,31.0,,


In [4]:
df_test.describe(include="all")

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Rosenbaum, Miss. Edith Louise",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,,0.0,0.0,,,,
50%,1100.5,3.0,,,,0.0,0.0,,,,
75%,1204.75,3.0,,,,1.0,0.0,,,,


### Convert text format column to integer class

In [5]:
# Sex : male -> 0, female -> 1
for df in [df_train, df_test]:
    df.loc[ df["Sex"] == "male", "Sex"] = 0
    df.loc[ df["Sex"] == "female", "Sex"] = 1
    df["Sex"] = df["Sex"].astype(int)

In [6]:
# Embarked : S -> 0, Q -> 1, C -> 2
for df in [df_train, df_test]:
    df.loc[ df["Embarked"] == "S", "Embarked"] = 0
    df.loc[ df["Embarked"] == "Q", "Embarked"] = 1
    df.loc[ df["Embarked"] == "C", "Embarked"] = 2
    df["Embarked"] = df["Embarked"].fillna(0)  # because S is most frequent
    df["Embarked"] = df["Embarked"].astype(int)

### Fill nan value by mean of column

In [7]:
for df in [df_train, df_test]:
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())

### See feature relation
>Pclass is a proxy for socio-economic status (SES)
 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower

In [8]:
df_train.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.033207,-0.057527,-0.001652,0.012658,-0.013128
Survived,-0.005007,1.0,-0.338481,0.543351,-0.069809,-0.035322,0.081629,0.257307,0.167675
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.331339,0.083081,0.018443,-0.5495,-0.162098
Sex,-0.042939,0.543351,-0.1319,1.0,-0.084153,0.114631,0.245489,0.182333,0.108262
Age,0.033207,-0.069809,-0.331339,-0.084153,1.0,-0.232625,-0.179191,0.091566,0.026749
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.232625,1.0,0.414838,0.159651,-0.06823
Parch,-0.001652,0.081629,0.018443,0.245489,-0.179191,0.414838,1.0,0.216225,-0.039798
Fare,0.012658,0.257307,-0.5495,0.182333,0.091566,0.159651,0.216225,1.0,0.224719
Embarked,-0.013128,0.167675,-0.162098,0.108262,0.026749,-0.06823,-0.039798,0.224719,1.0


### Normalize features

In [9]:
def normalize(_df, colname):
    df = _df[:]  # Deepcopy here
    df[colname] = (df[colname] - df[colname].mean()) / df[colname].std()
    return df

df_train_normalize = df_train[:]
for colname in ["Pclass", "Age", "SibSp", "Parch", "Fare"]:
    df_train_normalize = normalize(df_train_normalize, colname)

df_test_normalize = df_test[:]
for colname in ["Pclass", "Age", "SibSp", "Parch", "Fare"]:
    df_test_normalize = normalize(df_test_normalize, colname)
    
df_train_normalize.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891.0,891.0,891.0,891.0,891,891.0,204,891.0
unique,,,,891,,,,,681,,147,
top,,,,"Graham, Mr. George Edward",,,,,CA. 2343,,C23 C25 C27,
freq,,,,1,,,,,7,,4,
mean,446.0,0.383838,-7.575933000000001e-17,,0.352413,2.27278e-16,3.5886e-17,4.5854330000000005e-17,,-1.1962000000000002e-17,,0.463524
std,257.353842,0.486592,1.0,,0.47799,1.0,1.0,1.0,,1.0,,0.791503
min,1.0,0.0,-1.565228,,0.0,-2.251891,-0.4742788,-0.4734077,,-0.6480577,,0.0
25%,223.5,0.0,-0.3691575,,0.0,-0.592148,-0.4742788,-0.4734077,,-0.4888737,,0.0
50%,446.0,0.0,0.8269128,,0.0,0.0,-0.4742788,-0.4734077,,-0.3571902,,0.0
75%,668.5,1.0,0.8269128,,1.0,0.407697,0.4325504,-0.4734077,,-0.02423274,,1.0


# Solution

### Drop unnecessary column (unneccesary for learning)
- PasserngerId
- Survived (target feature)
- Name (text format column)
- Ticket (text format column
- Cabin

In [10]:
df_trainX = df_train.drop(["PassengerId", "Survived","Name", "Ticket", "Cabin"], axis=1)
df_norm_trainX = df_train_normalize.drop(["PassengerId", "Survived","Name", "Ticket", "Cabin"], axis=1)
df_testX = df_test.drop(["PassengerId","Name", "Ticket", "Cabin"], axis=1)
df_norm_testX = df_test_normalize.drop(["PassengerId","Name", "Ticket", "Cabin"], axis=1)

### Convert data format

In [12]:
from keras.utils import np_utils
trainX = df_trainX.as_matrix()
testX = df_testX.as_matrix()
norm_trainX = df_norm_trainX.as_matrix()
norm_testX = df_norm_testX.as_matrix()
trainy = df_train["Survived"].as_matrix()
trainY = np_utils.to_categorical(trainy, 2)

### Training forest

In [94]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()
forest = forest.fit(trainX, trainY)

### See training error

In [108]:
import numpy as np
prediction = forest.predict(trainX)
correct_num = len([match for match in np.argmax(prediction, axis=1) == trainy if match])
correct_rate = correct_num * 1.0 / trainX.shape[0]
print "Accuracy on training set = %f" % correct_rate

Accuracy on training set = 0.961841


### See test error by submitting answer

In [148]:
feature_names = list(df_trainX.columns.values)
for name, importance in zip(feature_names, forest.feature_importances_):
    print "%s = %f" % (name, importance)

Pclass = 0.087191
Sex = 0.230376
Age = 0.277852
SibSp = 0.049041
Parch = 0.053136
Fare = 0.265980
Embarked = 0.036424


### See test error by submitting answer

In [132]:
test_prediction = forest.predict(testX)
one_hot = np.argmax(test_prediction, axis=1)
test_passenger_ids = df_test["PassengerId"].as_matrix()
ans = [[pid, pred] for pid, pred in zip(test_passenger_ids, one_hot)]
np.savetxt("forest_ans.csv", np.array(ans), fmt="%d", header="PassengerId,Survived")

`>> score = 0.78469`  
Overfitting ...

## GridSearch best hyper parameter

In [135]:
from sklearn import grid_search

parameters = {
        'n_estimators'      : [30, 40, 50],
        'max_features'      : [2,  4,  6],
        'min_samples_split' : [1,2, 3, 5],
        'max_depth'         : [1, 3, 5, 10]
}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters)
clf.fit(trainX, trainY)
 
print(clf.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=6, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


### See training error

In [136]:
import numpy as np
prediction = clf.predict(trainX)
correct_num = len([match for match in np.argmax(prediction, axis=1) == trainy if match])
correct_rate = correct_num * 1.0 / trainX.shape[0]
print "Accuracy on training set = %f" % correct_rate

Accuracy on training set = 0.859708


### See feature importance

In [150]:
feature_names = list(df_trainX.columns.values)
for name, importance in zip(feature_names, clf.best_estimator_.feature_importances_):
    print "%s = %f" % (name, importance)

Pclass = 0.162901
Sex = 0.511353
Age = 0.131115
SibSp = 0.047395
Parch = 0.009590
Fare = 0.117129
Embarked = 0.020516


### See test error by submitting answer

In [138]:
test_prediction = clf.predict(testX)
one_hot = np.argmax(test_prediction, axis=1)
test_passenger_ids = df_test["PassengerId"].as_matrix()
ans = [[pid, pred] for pid, pred in zip(test_passenger_ids, one_hot)]
np.savetxt("forest_gs_ans.csv", np.array(ans), delimiter=",", fmt="%d", header="PassengerId,Survived")

`>> score = 0.78469`  
Training error is reduced and test score does not change

## Use Normalized features

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search

parameters = {
        'n_estimators'      : [30, 40, 50],
        'max_features'      : [2,  4,  6],
        'min_samples_split' : [1,2, 3, 5],
        'max_depth'         : [1, 3, 5, 10]
}
clf = grid_search.GridSearchCV(RandomForestClassifier(), parameters)
clf.fit(norm_trainX, trainY)
print(clf.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=4, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


### See training error

In [16]:
import numpy as np
prediction = clf.predict(norm_trainX)
correct_num = len([match for match in np.argmax(prediction, axis=1) == trainy if match])
correct_rate = correct_num * 1.0 / trainX.shape[0]
print "Accuracy on training set = %f" % correct_rate

Accuracy on training set = 0.922559


### See feature importance

In [17]:
feature_names = list(df_trainX.columns.values)
for name, importance in zip(feature_names, clf.best_estimator_.feature_importances_):
    print "%s = %f" % (name, importance)

Pclass = 0.112824
Sex = 0.352087
Age = 0.202905
SibSp = 0.047462
Parch = 0.032528
Fare = 0.224556
Embarked = 0.027637


### See test error by submitting answer

In [18]:
test_prediction = clf.predict(norm_testX)
one_hot = np.argmax(test_prediction, axis=1)
test_passenger_ids = df_test["PassengerId"].as_matrix()
ans = [[pid, pred] for pid, pred in zip(test_passenger_ids, one_hot)]
np.savetxt("forest_gs_norm_ans.csv", np.array(ans), delimiter=",", fmt="%d", header="PassengerId,Survived")

`>> score = 0.75120,`  
Training error is reduced but test score gets worse