# Kaggle Titanic competition
Binary classification of passenger survival with a small feed-forward neural network (Keras).

### Install and import packages

In [None]:
%pip install pandas numpy tensorflow

In [192]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Open input data

In [1012]:
# Load the training CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/train.csv")

### Engineer new features

In [1013]:
# Deck letters recognised as valid; any other leading character is treated as unknown.
decks = ["A", "B", "C", "D", "E", "F", "G"]
def extract_deck_from_cabin(cabin):
    """Return the deck letter encoded in a cabin identifier.

    Args:
        cabin: Cabin string (e.g. "C85") or a missing value (NaN/None).

    Returns:
        The first character of ``cabin`` when it is one of ``decks``;
        ``np.nan`` when the cabin is missing, empty, or starts with any
        other character.
    """
    if pd.isna(cabin):
        return np.nan

    # Slicing (instead of indexing) keeps an empty string from raising IndexError.
    first_char = cabin[:1]
    return first_char if first_char in decks else np.nan

def engineer_features(dataframe):
    """Return a copy of ``dataframe`` extended with derived columns.

    Added columns:
        Deck: result of ``extract_deck_from_cabin`` applied to "Cabin".
        FamilySize: "SibSp" + "Parch".

    The input frame is not modified.
    """
    deck_column = dataframe["Cabin"].map(extract_deck_from_cabin)
    family_size_column = dataframe["SibSp"] + dataframe["Parch"]

    return dataframe.assign(Deck=deck_column, FamilySize=family_size_column)

### Prepare data

In [1014]:
deck_dict = {
    "A": 1,
    "B": 2,
    "C": 3,
    "D": 4,
    "E": 5,
    "F": 6,
    "G": 7,
    np.nan: 0
}

def port_map(port):
    match port:
        case "S":
            return 1
        case "C":
            return 2
        case "Q":
            return 3
        case _:
            return None
        
def prepare_data(dataframe):
    new_df = dataframe.copy()
    new_df['Sex'] = new_df['Sex'].map(lambda x: 1 if x == 'female' else 0)

    new_df['Deck'] = new_df['Deck'].map(lambda x: deck_dict[x])
        
    new_df['Embarked'] = new_df['Embarked'].map(lambda x: port_map(x))
 
    return new_df

In [None]:
prepared_df = prepare_data(engineer_features(df))

# Columns used for training ("Survived" is the label). Other subsets that were tried:
#   ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Deck', 'Survived']
#   ['Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Deck', 'Survived']
#   ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']
#   ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Deck', 'Survived']
#   ['Pclass', 'Sex', 'Age', 'Survived']
dataset = prepared_df.loc[
    :,
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Deck', 'Embarked', 'Survived'],
]

# Keep only rows where every selected column has a value.
dataset = dataset.dropna()

# prepared_df.head()

In [1016]:

# Show the last five rows of the selected training data.
dataset.tail(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Deck,Survived
885,3,1,39.0,0,5,29.125,0,0
886,2,0,27.0,0,0,13.0,0,0
887,1,1,19.0,0,0,30.0,2,1
889,1,0,26.0,0,0,30.0,3,1
890,3,0,32.0,0,0,7.75,0,0


In [1017]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and therefore the reported metrics) repeatable across runs.
train_data = dataset.sample(frac=0.80, random_state=42)
test_data = dataset.drop(train_data.index)

In [1018]:
# Split features from labels
train_features = train_data.copy()
test_features = test_data.copy()

train_labels = train_features.pop("Survived")
test_labels = test_features.pop("Survived")

### Build model

In [1019]:
# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable between kernel restarts.
keras.utils.set_random_seed(42)

# The normalization layer learns its statistics from the training features only.
normalizer = layers.Normalization()
normalizer.adapt(np.array(train_features))


model = keras.Sequential()

# One input per feature column.
model.add(layers.InputLayer(shape=(train_features.shape[1],)))

model.add(normalizer)

model.add(layers.Dense(units=16, activation='relu'))
# model.add(layers.Dense(units=32, activation='relu'))
# Single sigmoid unit: output is a value in [0, 1] for the "Survived" label.
model.add(layers.Dense(1, activation='sigmoid'))

In [1020]:
# Adam optimizer with an explicit learning rate.
adam = keras.optimizers.Adam(learning_rate=1e-3)

In [1021]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric.
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=["accuracy"],
)

### Train

In [None]:
# Train on the training split.
model.fit(
    train_features,
    train_labels,
    batch_size=32,
    epochs=150,
)

### Evaluate

In [1023]:
# Evaluate on the held-out split; the result holds the loss followed by the accuracy metric.
loss_and_metrics = model.evaluate(test_features, test_labels)
test_loss, test_accuracy = loss_and_metrics

print(loss_and_metrics)
print('Loss = ', test_loss)
print('Accuracy = ', test_accuracy)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7944 - loss: 0.5366  
[0.450256884098053, 0.8181818127632141]
Loss =  0.450256884098053
Accuracy =  0.8181818127632141


### Run predictions

In [1024]:
# Load the CSV of passengers to predict for the submission.
submit_df = pd.read_csv(filepath_or_buffer="data/test.csv")

In [1025]:
# Prepare data
submit_df = prepare_data(engineer_features(submit_df))
submit_dataset = submit_df[np.array(["PassengerId", *train_features.columns])]
submit_dataset = submit_dataset.dropna()

passengerIds = submit_dataset.pop('PassengerId')

submit_dataset.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Deck
409,3,1,3.0,1,1,13.775,0
411,1,1,37.0,1,0,90.0,3
412,3,1,28.0,0,0,7.775,0
414,1,1,39.0,0,0,108.9,3
415,3,0,38.5,0,0,7.25,0


In [None]:
predicted = model.predict(submit_dataset)

# Flatten the model output to one value per row and threshold it into 0/1
# integer labels. `> 0.5` matches what `.round()` gives for values in [0, 1]
# (NumPy rounds exactly 0.5 down to 0).
survived_flags = (predicted.ravel() > 0.5).astype(int)

# Build the frame in one step, keeping the row index of the rows that were
# predicted so the results can later be aligned with the full passenger list.
predictedPassengers = pd.DataFrame(
    {"PassengerId": passengerIds, "Survived": survived_flags},
    index=passengerIds.index,
)

# predictedPassengers.tail(30)

In [1027]:
# Start from every passenger in the submission file.
results = submit_df[['PassengerId']].copy()

# Align the predictions on the row index. Passengers without a prediction
# (rows dropped for missing feature values) default to 0. The explicit cast
# guarantees integer 0/1 labels in the CSV rather than 0.0/1.0.
results["Survived"] = (
    predictedPassengers["Survived"]
    .reindex(results.index)
    .fillna(0)
    .astype(int)
)
# results.tail(50)

In [1028]:
# Write the submission file without the DataFrame index column.
results.to_csv(
    "result.csv",
    index=False,
    sep=",",
)