In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from time import time

from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

np.random.seed(1337)
df = pd.read_csv('train.csv') 
df = df.drop(['Name','Ticket', 'Cabin'], axis=1)

In [9]:
df = df.dropna() # drop rows with missing ages.

In [10]:
# To convert categorical data to numerical data, a few different ways.

# 1.
# df['Sex'] = df['Sex'].map({'female':0, 'male':1}) # change sex to integer values

# 2.
# embarked_sex = pd.get_dummies(df.Sex, prefix="Sex").iloc[:,1:] # change Sex to Sex_male
# embarked_dummies = pd.get_dummies(df.Embarked, prefix="Embarked").iloc[:, 1:] # Embarked_S, Embarked_Q

# concat back to original data frame.
# pd = pd.concat([df, embarked_sex], axis = 1)
# pd = pd.concat([df, embarked_dummies], axis = 1)

# 3.
# A better way:
df = pd.get_dummies(df, columns = ['Sex', 'Embarked'])
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1,1,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,0,0,1


In [11]:
# randomly select 80% rows for training and 20% rows for testing.
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

In [12]:
# Normalize features to the same scale.
scaler = StandardScaler()
features = ['Pclass','Sex_female','Sex_male','Age','SibSp','Parch','Fare','Embarked_C','Embarked_Q','Embarked_S']

X_train = scaler.fit_transform(train[features].values)
Y_train = scaler.fit_transform(train['Survived'].values)
y_train_onehot = pd.get_dummies(train['Survived']).values

X_test = scaler.transform(test[features].values)
y_test = test['Survived'].values




In [13]:


start = time()

model = Sequential()
model.add(Dense(input_dim=10, output_dim=100))
model.add(Dense(output_dim=100))
model.add(Dense(output_dim=2))
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

model.fit(X_train, y_train_onehot)

# print('\ntime taken %s seconds' % str(time() - start))

y_prediction = model.predict_classes(X_test)
print("\n\naccuracy", np.sum(y_prediction == y_test) / float(len(y_test)))

  
  import sys
  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

accuracy 0.730769230769


In [None]:

print('5 x 5 cross validation ...')

scaler = StandardScaler()
X = df[features]
y = df['Survived']

models = [] # to hold all the temporary models.
model_scores = [] # to hold the accuracy scores 

i = 0
skf = StratifiedKFold(n_splits=5, random_state=1234)
for train_indices_out, test_indices_out in skf.split(X, y):
    
    i+=1
    # train/test split - outer loop.
    X_train_out = X.iloc[train_indices_out] 
    y_train_out = y.iloc[train_indices_out]
    
    X_test_out = X.iloc[test_indices_out]
    y_test_out = y.iloc[test_indices_out]
    
    print("------ Round {} ------".format(i))
    
    j = 0
    print("Training Phase ...")
    for train_indices_in, test_indices_in in skf.split(X_train_out, y_train_out):
        
        print('Training Model {}'.format(j+1))
       
        j += 1
        # train/test split - inner loop.
        X_train_in = scaler.fit_transform(X_train_out.iloc[train_indices_in].values)
        y_train_in = scaler.fit_transform(y_train_out.iloc[train_indices_in].values)
        y_train_in_onehot = pd.get_dummies(y_train_out.iloc[train_indices_in]).values
        
        X_test_in = scaler.transform(X_train_out.iloc[test_indices_in].values)
        y_test_in = y_train_out.iloc[test_indices_in]
       
        model = Sequential()
        model.add(Dense(input_dim=10, output_dim=100))
        model.add(Dense(output_dim=100))
        model.add(Dense(output_dim=2))
        model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
        
        model.fit(X_train_in, y_train_in_onehot)
        
        # Add to the model list.
        models.append(model)
        
        # Predict
        y_pred_in = model.predict_classes(X_test_in)

        # Show results
        accuracy_in = accuracy_score(y_test_in, y_pred_in)
        cm_in = classification_report(y_test_in, y_pred_in)

        model_scores.append(accuracy_in)       
               
        print('\nTraining Accuracy: {}'.format(accuracy_in))
        print('\nTraining CM: \n',cm_in)
        
    # Retrieve the index of the model with highest score.
    highest_score_index = model_scores.index(max(model_scores))
    print('Best Model is: {}'.format(highest_score_index+1))
    # Retrieve that model.
    best_model = models[highest_score_index]
    
    
    # Predict on best model on this round.
    y_pred_out = best_model.predict_classes(scaler.transform(X.iloc[test_indices_out].values))
    
    # Show results
    accuracy_out = accuracy_score(y_test_out, y_pred_out)
    cm_out = classification_report(y_test_out, y_pred_out)
    
    
    print("\nTesting Accuracy round {}: {}".format(i, accuracy_out))
    print(cm_out)
    
    # empty results for the next round of execution.
    models = [] 
    model_scores = [] 
    

5 x 5 cross validation ...
------ Round 1 ------
Training Phase ...
Training Model 1




Epoch 1/10
