# Kaggle Titanic competition
Binary classification of passenger survival with a small feed-forward neural network (Keras).

### Install and import packages

In [None]:
%pip install pandas numpy tensorflow

In [192]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Open input data

In [1354]:
# Raw Kaggle training set; later cells derive every other frame from it.
df = pd.read_csv(filepath_or_buffer="data/train.csv")

### Engineer new features

In [1355]:
# Leading cabin characters that are accepted as a deck letter; anything else
# is treated as "deck unknown".
decks = ["A", "B", "C", "D", "E", "F", "G"]


def extract_deck_from_cabin(cabin):
    """Return the deck letter encoded in a cabin identifier.

    The deck is taken to be the first character of ``cabin``.

    Args:
        cabin: Cabin string such as ``"C85"``, or a missing value
            (``None`` / ``NaN``).

    Returns:
        One of the letters listed in ``decks``, or ``np.nan`` when the cabin
        is missing, is an empty string, or starts with any other character.
    """
    # `not cabin` covers the empty string, for which cabin[0] would raise
    # IndexError; pd.isna must be checked first because NaN itself is truthy.
    if pd.isna(cabin) or not cabin:
        return np.nan

    deck = cabin[0]
    return deck if deck in decks else np.nan

def engineer_features(dataframe):
    """Return a new frame with the derived ``Deck`` and ``FamilySize`` columns.

    ``Deck`` is obtained by passing every ``Cabin`` value through
    ``extract_deck_from_cabin``; ``FamilySize`` is ``SibSp + Parch``.
    The frame passed in is not modified.
    """
    deck = dataframe["Cabin"].map(extract_deck_from_cabin)
    family_size = dataframe["SibSp"] + dataframe["Parch"]

    # assign() builds a new frame, so the caller's data stays untouched.
    return dataframe.assign(Deck=deck, FamilySize=family_size)

### Prepare data

In [1356]:
# Ordinal code for each deck letter (A=1 ... G=7). The extra np.nan key maps a
# missing deck to 0.
deck_dict = dict(zip(("A", "B", "C", "D", "E", "F", "G"), range(1, 8)))
deck_dict[np.nan] = 0

def port_map(port):
    """Encode an embarkation port letter as an integer.

    Returns 1 for "S", 2 for "C", 3 for "Q" and ``None`` for any other value.
    """
    for letter, code in (("S", 1), ("C", 2), ("Q", 3)):
        if port == letter:
            return code
    return None
        
def fill_age_with_mean(group):
    """Fill missing ``Age`` values with the group's rounded mean age.

    Args:
        group: DataFrame with an ``Age`` column (for example one group of a
            ``groupby``).

    Returns:
        A new DataFrame in which missing ages are replaced by the mean age of
        ``group`` rounded to the nearest integer. The frame passed in is left
        unmodified. When every age in the group is missing there is nothing
        to impute from, so an unchanged copy is returned.
    """
    mean_age = group['Age'].mean()

    # An all-missing group has a NaN mean, which cannot be rounded to an
    # integer fill value.
    if pd.isna(mean_age):
        return group.copy()

    # assign() returns a new frame instead of writing into the caller's object.
    return group.assign(Age=group['Age'].fillna(round(mean_age)))

def prepare_data(dataframe):
    """Encode categorical columns as numbers and impute missing ages.

    Args:
        dataframe: Frame containing the columns ``Sex``, ``Deck``,
            ``Embarked``, ``Pclass`` and ``Age``.

    Returns:
        A new DataFrame (the input is not modified) in which

        * ``Sex`` is 1 for ``'female'`` and 0 for anything else,
        * ``Deck`` is replaced by its code from ``deck_dict`` (missing -> the
          code stored under the ``np.nan`` key),
        * ``Embarked`` is replaced by ``port_map``'s code, which is missing
          for values ``port_map`` does not recognise,
        * missing ``Age`` values are filled with the mean age of the row's
          ``Pclass``, computed from the frame passed in.
    """
    new_df = dataframe.copy()
    new_df['Sex'] = new_df['Sex'].eq('female').astype(int)

    # NaN never compares equal to itself, so deck_dict[x] only finds the
    # np.nan key when x is that very object. Values coming out of a float
    # column are different NaN objects and would raise KeyError, hence the
    # explicit missing-value test.
    new_df['Deck'] = new_df['Deck'].map(
        lambda x: deck_dict[np.nan] if pd.isna(x) else deck_dict[x]
    )

    new_df['Embarked'] = new_df['Embarked'].map(port_map)

    # Fill missing ages with the mean age of the passenger's class.
    new_df['Age'] = new_df['Age'].fillna(new_df.groupby('Pclass')['Age'].transform('mean'))

    return new_df

In [1357]:
# Run feature engineering and encoding on the raw training frame.
prepared_df = prepare_data(engineer_features(df))

# Columns used for training, with the 'Survived' label last.
# Other column sets that were tried:
#   ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Deck', 'Survived']
#   ['Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Deck', 'Survived']
#   ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']
#   ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Deck', 'Survived']
#   ['Pclass', 'Sex', 'Age', 'Survived']
#
# Rows that still contain a missing value in any selected column are dropped.
dataset = prepared_df.loc[
    :,
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Deck', 'Embarked', 'Survived'],
].dropna()

In [1358]:
# Preview the last five prepared rows.
dataset.tail(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Deck,Embarked,Survived
886,2,0,27.0,0,0,13.0,0,1.0,0
887,1,1,19.0,0,0,30.0,2,1.0,1
888,3,1,25.14062,1,2,23.45,0,1.0,0
889,1,0,26.0,0,0,30.0,3,2.0,1
890,3,0,32.0,0,0,7.75,0,3.0,0


In [1359]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the reported test metrics, repeatable across runs.
train_data = dataset.sample(frac=0.80, random_state=42)
test_data = dataset.drop(train_data.index)

In [1360]:
# Separate the "Survived" label column from the model inputs.
train_labels = train_data["Survived"].copy()
test_labels = test_data["Survived"].copy()

train_features = train_data.drop(columns="Survived")
test_features = test_data.drop(columns="Survived")

### Build model

In [1361]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable from run to run.
keras.utils.set_random_seed(42)

# The normalisation statistics are fitted on the training inputs only, so the
# held-out rows do not influence the preprocessing.
normalizer = layers.Normalization()
normalizer.adapt(np.array(train_features))


model = keras.Sequential()

# One input per feature column.
model.add(layers.InputLayer(shape=(train_features.shape[1],)))

model.add(normalizer)

# A single hidden layer; a wider 32-unit ReLU layer was tried as an alternative.
model.add(layers.Dense(units=16, activation='relu'))
# Sigmoid output: one value between 0 and 1 per passenger.
model.add(layers.Dense(1, activation='sigmoid'))

In [1362]:
# Adam optimizer with an explicitly stated learning rate.
adam = keras.optimizers.Adam(learning_rate=1e-3)

In [1363]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked as an additional metric.
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=["accuracy"],
)

### Train

In [None]:
# Train on the training split only; the held-out rows are used in the next section.
model.fit(x=train_features, y=train_labels, batch_size=32, epochs=150)

### Evaluate

In [1365]:
# Score the held-out rows. The result lists the loss first, followed by the
# accuracy metric requested in compile().
loss_and_metrics = model.evaluate(x=test_features, y=test_labels)

print(loss_and_metrics)
print('Loss = ', loss_and_metrics[0])
print('Accuracy = ', loss_and_metrics[1])

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8628 - loss: 0.3841  
[0.3991720974445343, 0.8483145833015442]
Loss =  0.3991720974445343
Accuracy =  0.8483145833015442


### Run predictions

In [1366]:
# Kaggle test set: the passengers for whom predictions are submitted.
submit_df = pd.read_csv(filepath_or_buffer="data/test.csv")

In [1367]:
# Apply the same feature engineering and encoding as for the training data.
submit_df = prepare_data(engineer_features(submit_df))

# Keep the passenger id next to exactly the columns the model was trained on,
# then discard rows that still have a missing value.
submit_dataset = submit_df[["PassengerId", *train_features.columns]].dropna()

# The id is not a model input: set it aside, keeping the row index.
passengerIds = submit_dataset.pop('PassengerId')

submit_dataset.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Deck,Embarked
413,3,0,24.027945,0,0,8.05,0,1
414,1,1,39.0,0,0,108.9,3,2
415,3,0,38.5,0,0,7.25,0,1
416,3,0,24.027945,0,0,8.05,0,1
417,3,0,24.027945,1,1,22.3583,0,2


In [1368]:
# The model ends in a single sigmoid unit, so predict() yields one column of
# values between 0 and 1 (one row per passenger in submit_dataset).
predicted = model.predict(submit_dataset)

# Build the frame in one step. The row index is taken from passengerIds so
# predictions can later be aligned with the full submission frame by index.
# Scores are rounded to the nearest class and stored as a flat integer column
# rather than as a 2-D float array.
predictedPassengers = pd.DataFrame(
    {
        "PassengerId": passengerIds,
        "Survived": predicted.round().astype(int).ravel(),
    },
    index=passengerIds.index,
)

# predictedPassengers.tail(30)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [1369]:
# Start from every passenger in the submission file, including those that
# were dropped before prediction because of missing feature values.
results = submit_df[['PassengerId']].copy()

# Align predictions on the row index. Passengers without a prediction come
# out of reindex() as NaN and default to 0. The explicit integer cast keeps
# the written labels 0/1 instead of relying on how DataFrame.update() casts
# float values into integer columns.
results["Survived"] = (
    predictedPassengers["Survived"]
    .reindex(results.index)
    .fillna(0)
    .astype(int)
)
# results.tail(50)

In [1370]:
# Write the submission file without the row index column.
results.to_csv(path_or_buf="result.csv", sep=",", index=False)