# Library Import

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
import xgboost as xgb



# Define Functions

In [2]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Data Import

In [3]:
data = pd.read_csv("D:/rawDataFiles/titanic_train.csv")
question = pd.read_csv("D:/rawDataFiles/titanic_test.csv")

# Data Preprocessing

### Encoding & Dealing with NA

In [4]:
data['Title'] = data['Name'].apply(get_title)
question['Title'] = question['Name'].apply(get_title)

data['HasCabin'] = data['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
question['HasCabin'] = question['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
question['FamilySize'] = question['SibSp'] + question['Parch'] + 1

data['Solo'] = data['FamilySize'].apply(lambda x: 1 if x==1 else 0)
question['Solo'] = question['FamilySize'].apply(lambda x: 1 if x==1 else 0)

data['Embarked'] = data['Embarked'].fillna('S')
question['Embarked'] = question['Embarked'].fillna('S')
data['Embarked'] = data['Embarked'].apply(lambda x: 1 if x=='C' else x).apply(lambda x: 2 if x=='Q' else x).apply(lambda x: 3 if x=='S' else x)
question['Embarked'] = question['Embarked'].apply(lambda x: 1 if x=='C' else x).apply(lambda x: 2 if x=='Q' else x).apply(lambda x: 3 if x=='S' else x)

data['Fare'] = data['Fare'].fillna(data['Fare'].median())
question['Fare'] = question['Fare'].fillna(data['Fare'].median())

data['Sex'] = data['Sex'].apply(lambda x: 1 if x=='male' else 0)
question['Sex'] = question['Sex'].apply(lambda x: 1 if x=='male' else 0)

title_list = data.Title.unique()
title_age_avg = []
for title in title_list :
    title_age_avg.append(title + " : " + str(data[data.Title == title]['Age'].mean()))
title_age_dic = {'Mr':4, 'Mrs':4, 'Miss':3, 'Master':1, 'Don':4, 'Dona':4, 'Rev':4, 'Dr':4, 'Mme':3, 'Ms':3, 'Major':5, 'Lady':5, 'Sir':5, 'Mlle':3, 'Col':5, 'Capt':6, 'Countess':4, 'Jonkheer':4}
data['TitleEncoding'] = data['Title'].apply(lambda x: title_age_dic[x])
question['TitleEncoding'] = question['Title'].apply(lambda x: title_age_dic[x])

### Fill in NA in Age

In [5]:
age_col = ['Age','Pclass','SibSp','FamilySize','TitleEncoding','Solo']
age_data = data[age_col].dropna()
y = age_data['Age']
X = age_data[age_col[1:]]
scaler = MinMaxScaler()
scaler.fit(X)
X= DataFrame(scaler.transform(X))

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

ageNN = MLPRegressor(
    hidden_layer_sizes=(10,9,8),
    activation='relu',
    alpha=0.1,
    max_iter=10000,
    random_state=1
)
ageNN.fit(X_train,y_train)
print('train R^2 : ' + str(ageNN.score(X_train, y_train)))
print('test R^2 : ' + str(ageNN.score(X_test, y_test)))
ageNN.fit(X,y)
nanlist = []
for i in range(891):
    if np.isnan(data.Age[i]):
        nanlist.append(i)
X=[]
for i in nanlist :
    X.append([data.Pclass[i], data.SibSp[i], data.FamilySize[i], data.TitleEncoding[i], data.Solo[i]])
X= DataFrame(scaler.fit(X).transform(X))
Age_pred_result = ageNN.predict(X)
c=0
for i in range(891):
   if np.isnan(data.Age[i]):
       data.Age[i] = Age_pred_result[c]
       c+=1

nanlist = []
for i in range(418):
    if np.isnan(question.Age[i]):
        nanlist.append(i)
X=[]
for i in nanlist :
    X.append([question.Pclass[i], question.SibSp[i], question.FamilySize[i], question.TitleEncoding[i], question.Solo[i]])
X= DataFrame(scaler.fit(X).transform(X))
Age_pred_result = ageNN.predict(X)
c=0
for i in range(418):
   if np.isnan(question.Age[i]):
       question.Age[i] = Age_pred_result[c]
       c+=1

train R^2 : 0.402443198814
test R^2 : 0.438191402185


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Preparing Final Dataset

In [6]:
columns_to_use = ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','HasCabin','FamilySize','TitleEncoding','Solo']
data_nontree = data.copy()[columns_to_use]
question_nontree = question.copy()[columns_to_use[1:]]
data_tree = data.copy()[columns_to_use]
question_tree = question.copy()[columns_to_use[1:]]

#NONTREE
data_nontree_X = data_nontree[columns_to_use[1:]]
data_nontree_y = data_nontree['Survived']
nt_train_X, nt_test_X, nt_train_y, nt_test_y = train_test_split(data_nontree_X, data_nontree_y,random_state=1)
scaler.fit(nt_train_X[columns_to_use[1:]])
nt_train_X = DataFrame(scaler.transform(nt_train_X))
nt_test_X = DataFrame(scaler.transform(nt_test_X))

#TREE
data_tree_X = data_tree[columns_to_use[1:]]
data_tree_y = data_tree['Survived']
t_train_X, t_test_X, t_train_y, t_test_y = train_test_split(data_tree_X, data_tree_y, random_state=1)

# Fitting ML Models

## [ Non-Tree Classifiers ]

### ANN

In [7]:
ANN = MLPClassifier(
    hidden_layer_sizes=(10,9),
    activation='relu',
    alpha=3,
    max_iter=100000,
    random_state=1
)
ANN.fit(nt_train_X,nt_train_y)
print("$$ ANN $$")
print('train accuracy : ' + str(ANN.score(nt_train_X, nt_train_y)))
print('test accuracy : ' + str(ANN.score(nt_test_X, nt_test_y)))
print('train AUC : ' + str(roc_auc_score(np.array(nt_train_y),np.array(ANN.predict(nt_train_X)))))
print('test AUC : ' + str(roc_auc_score(np.array(nt_test_y),np.array(ANN.predict(nt_test_X)))))

$$ ANN $$
train accuracy : 0.814371257485
test accuracy : 0.811659192825
train AUC : 0.794166578515
test AUC : 0.795230263158


### SVM

In [55]:
SVM = svm.SVC(
    C = 1
)
SVM.fit(nt_train_X,nt_train_y)
print("$$ SVM $$")
print('train accuracy : ' + str(SVM.score(nt_train_X, nt_train_y)))
print('test accuracy : ' + str(SVM.score(nt_test_X, nt_test_y)))
print('train AUC : ' + str(roc_auc_score(np.array(nt_train_y),np.array(SVM.predict(nt_train_X)))))
print('test AUC : ' + str(roc_auc_score(np.array(nt_test_y),np.array(SVM.predict(nt_test_X)))))

$$ SVM $$
train accuracy : 0.796407185629
test accuracy : 0.789237668161
train AUC : 0.773221652707
test AUC : 0.772985197368


### Logistic Regression

In [9]:
GLM = LogisticRegression()
GLM.fit(nt_train_X,nt_train_y)
print("$$ LOG $$")
print('train accuracy : ' + str(GLM.score(nt_train_X, nt_train_y)))
print('test accuracy : ' + str(GLM.score(nt_test_X, nt_test_y)))
print('train AUC : ' + str(roc_auc_score(np.array(nt_train_y),np.array(GLM.predict(nt_train_X)))))
print('test AUC : ' + str(roc_auc_score(np.array(nt_test_y),np.array(GLM.predict(nt_test_X)))))

$$ LOG $$
train accuracy : 0.812874251497
test accuracy : 0.811659192825
train AUC : 0.790469000933
test AUC : 0.795230263158


## [ Tree Classifiers ]

### Random Forest

In [10]:
RF = RandomForestClassifier(
    n_estimators=600,
    max_depth=3,
    max_features=6,
    random_state=0,
    n_jobs=-1
)
RF.fit(t_train_X, t_train_y)
print("$$ RF $$")
print('train accuracy : ' + str(RF.score(t_train_X, t_train_y)))
print('test accuracy : ' + str(RF.score(t_test_X, t_test_y)))
print('train AUC : ' + str(roc_auc_score(np.array(t_train_y),np.array(RF.predict(t_train_X)))))
print('test AUC : ' + str(roc_auc_score(np.array(t_test_y),np.array(RF.predict(t_test_X)))))

$$ RF $$
train accuracy : 0.844311377246
test accuracy : 0.807174887892
train AUC : 0.826285978055
test AUC : 0.789967105263


### Extra Trees

In [11]:
ET = ExtraTreesClassifier(
    n_estimators=600,
    max_depth=3,
    max_features=6,
    random_state=1,
    n_jobs=-1   
)
ET.fit(t_train_X, t_train_y)
print("$$ ET $$")
print('train accuracy : ' + str(ET.score(t_train_X, t_train_y)))
print('test accuracy : ' + str(ET.score(t_test_X, t_test_y)))
print('train AUC : ' + str(roc_auc_score(np.array(t_train_y),np.array(ET.predict(t_train_X)))))
print('test AUC : ' + str(roc_auc_score(np.array(t_test_y),np.array(ET.predict(t_test_X)))))

$$ ET $$
train accuracy : 0.835329341317
test accuracy : 0.80269058296
train AUC : 0.812466942983
test AUC : 0.784703947368


### GDBT

In [12]:
GBDT = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=100,
    max_depth=3,
    max_features=6,
    random_state=None
)
GBDT.fit(t_train_X, t_train_y)
print("$$ GBDT $$")
print('train accuracy : ' + str(GBDT.score(t_train_X, t_train_y)))
print('test accuracy : ' + str(GBDT.score(t_test_X, t_test_y)))
print('train AUC : ' + str(roc_auc_score(np.array(t_train_y),np.array(GBDT.predict(t_train_X)))))
print('test AUC : ' + str(roc_auc_score(np.array(t_test_y),np.array(GBDT.predict(t_test_X)))))

$$ GBDT $$
train accuracy : 0.859281437126
test accuracy : 0.798206278027
train AUC : 0.825612816987
test AUC : 0.771299342105


### AdaBoost

In [13]:
ADA = AdaBoostClassifier(
    learning_rate=0.01,
    n_estimators=300,
    random_state=1
)
ADA.fit(t_train_X, t_train_y)
print("$$ ADA $$")
print('train accuracy : ' + str(ADA.score(t_train_X, t_train_y)))
print('test accuracy : ' + str(ADA.score(t_test_X, t_test_y)))
print('train AUC : ' + str(roc_auc_score(np.array(t_train_y),np.array(ADA.predict(t_train_X)))))
print('test AUC : ' + str(roc_auc_score(np.array(t_test_y),np.array(ADA.predict(t_test_X)))))

$$ ADA $$
train accuracy : 0.796407185629
test accuracy : 0.784753363229
train AUC : 0.778241510958
test AUC : 0.770435855263


### XGBoost

In [14]:
XGB = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=100,
    max_depth=2,
    gamma=0.9,
    nthread=-1
)
XGB.fit(t_train_X, t_train_y)
print("$$ XGB $$")
print('train accuracy : ' + str(XGB.score(t_train_X, t_train_y)))
print('test accuracy : ' + str(XGB.score(t_test_X, t_test_y)))
print('train AUC : ' + str(roc_auc_score(np.array(t_train_y),np.array(XGB.predict(t_train_X)))))
print('test AUC : ' + str(roc_auc_score(np.array(t_test_y),np.array(XGB.predict(t_test_X)))))

$$ XGB $$
train accuracy : 0.823353293413
test accuracy : 0.80269058296
train AUC : 0.798782540125
test AUC : 0.784703947368


# Stacking ML Models

### ANN + SVM + LOG + ADA

In [15]:
result = DataFrame({'ANN':ANN.predict(nt_train_X),'SVM':SVM.predict(nt_train_X),'LOG':GLM.predict(nt_train_X),'ADA':ADA.predict(t_train_X),'Actual':t_train_y})
x=['ADA','ANN','LOG','SVM']
X = result[x]
y = result['Actual']

##### STACK : ANN

In [33]:
FINnn4 = MLPClassifier(
    hidden_layer_sizes=(10,9),
    activation='relu',
    alpha=1,
    max_iter=100000,
    random_state=10
)
FINnn4.fit(X,y)
print("$$ FINnn $$")
print('accuracy : ' + str(FINnn4.score(X,y)))
print('AUC : ' + str(roc_auc_score(np.array(y),np.array(FINnn4.predict(X)))))

$$ FINnn $$
accuracy : 0.835329341317
AUC : 0.812466942983


##### STACK : LOGISTIC REGRESSION

In [34]:
FINlog4 = LogisticRegression()
FINlog4.fit(X,y)
print("$$ FINlog $$")
print('accuracy : ' + str(FINlog4.score(X,y)))
print('AUC : ' + str(roc_auc_score(np.array(y),np.array(FINlog4.predict(X)))))

$$ FINlog $$
accuracy : 0.832335329341
AUC : 0.810928289113


### ANN + SVM + LOG + ADA + ET + XGB

In [35]:
result = DataFrame({'ANN':ANN.predict(nt_train_X),'SVM':SVM.predict(nt_train_X),'LOG':GLM.predict(nt_train_X),
                    'ADA':ADA.predict(t_train_X),'ET':ET.predict(t_train_X),'XGB':XGB.predict(t_train_X),
                    'Actual':t_train_y})
x=['ADA','ANN','LOG','SVM','ET','XGB']
X = result[x]
y = result['Actual']

##### STACK : ANN

In [46]:
FINnn6 = MLPClassifier(
    hidden_layer_sizes=(50,25),
    activation='relu',
    alpha=1,
    max_iter=100000,
    random_state=None
)
FINnn6.fit(X,y)
print("$$ FINnn $$")
print('accuracy : ' + str(FINnn6.score(X,y)))
print('AUC : ' + str(roc_auc_score(np.array(y),np.array(FINnn6.predict(X)))))

$$ FINnn $$
accuracy : 0.829341317365
AUC : 0.808552992201


##### STACK : LOGISTIC REGRESSION

In [47]:
FINlog6 = LogisticRegression()
FINlog6.fit(X,y)
print("$$ FINlog $$")
print('accuracy : ' + str(FINlog6.score(X,y)))
print('AUC : ' + str(roc_auc_score(np.array(y),np.array(FINlog6.predict(X)))))

$$ FINlog $$
accuracy : 0.832335329341
AUC : 0.810928289113


# Submission

In [48]:
scaler.fit(data[columns_to_use[1:]])
data_nontree_X = DataFrame(scaler.transform(data_nontree_X))
question_nontree = scaler.transform(question_nontree)

ANN = MLPClassifier(
    hidden_layer_sizes=(10,9),
    activation='relu',
    alpha=0.01,
    max_iter=100000,
    random_state=1
)
ANN.fit(data_nontree_X,data_nontree_y)

SVM = svm.SVC()
SVM.fit(data_nontree_X,data_nontree_y)

GLM = LogisticRegression()
GLM.fit(data_nontree_X,data_nontree_y)

ADA = AdaBoostClassifier(
    learning_rate=0.01,
    n_estimators=300,
    random_state=0
)
ADA.fit(data_tree_X,data_tree_y)

result = DataFrame({'ANN':ANN.predict(data_nontree_X),'SVM':SVM.predict(data_nontree_X),'LOG':GLM.predict(data_nontree_X),'ADA':ADA.predict(data_tree_X),'Actual':data_nontree_y})
x=['ADA','ANN','LOG','SVM']
FINlog4 = LogisticRegression()
FINlog4.fit(result[x],result['Actual'])

total = DataFrame({'ANN':ANN.predict(question_nontree),'SVM':SVM.predict(question_nontree),'GLM':GLM.predict(question_nontree),'ADA':ADA.predict(question_tree)})

submission = pd.DataFrame({'PassengerId': question['PassengerId'], 'Survived':FINlog4.predict(total)})
submission.to_csv("C:/Users/Froilan/Desktop/Repository/kaggleResultCSV/Titanic_python_stacking.csv",index=False)