# Kaggle Titanic competition

### Install and import packages

In [None]:
%pip install pandas numpy matplotlib tensorflow

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Open input data

In [115]:
df = pd.read_csv("data/train.csv")

### Engineer new features

In [116]:
decks = ["A", "B", "C", "D", "E", "F", "G"]
def extract_deck_from_cabin(cabin):
    if pd.isna(cabin):
        return np.nan

    return cabin[0] if cabin[0] in decks else np.nan

def engineer_features(dataframe):
    dataframe["Deck"] = dataframe["Cabin"].map(extract_deck_from_cabin)
    dataframe["FamilySize"] = dataframe["SibSp"] + dataframe["Parch"]
    
    return dataframe

### Prepare data

In [117]:
deck_dict = {
	"A": 1,
	"B": 2,
	"C": 3,
	"D": 4,
	"E": 5,
	"F": 6,
	"G": 7,
	np.nan: 0
}

def prepare_data(dataframe):
	dataframe['Sex'] = dataframe['Sex'].map(lambda x: 1 if x == 'female' else 0)

	dataframe['Deck'] = dataframe['Deck'].map(lambda x: deck_dict[x])
 
	return dataframe

In [118]:
df = engineer_features(df)
df = prepare_data(df)
# Select data used for training
dataset = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Deck', 'Survived']]

# Remove rows with empty values
dataset = dataset.dropna()

In [119]:

dataset.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Deck,Survived
885,3,1,39.0,0,5,29.125,0,0
886,2,0,27.0,0,0,13.0,0,0
887,1,1,19.0,0,0,30.0,2,1
889,1,0,26.0,0,0,30.0,3,1
890,3,0,32.0,0,0,7.75,0,0


In [120]:
train_data = dataset.sample(frac=0.80)
test_data = dataset.drop(train_data.index)

In [121]:
# Split features from labels
train_features = train_data.copy()
test_features = test_data.copy()

train_labels = train_features.pop("Survived")
test_labels = test_features.pop("Survived")

### Build model

In [122]:
normalizer = layers.Normalization()
normalizer.adapt(np.array(train_features))

model = keras.Sequential()

model.add(normalizer)

model.add(layers.Dense(units=16, activation='relu', input_shape=(7,)))
model.add(layers.Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [123]:
adam = keras.optimizers.Adam(learning_rate=0.001)

In [124]:
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=["accuracy"])

### Train

In [125]:
model.fit(train_features, train_labels, epochs=100, batch_size=32)

Epoch 1/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.3750 - loss: 0.8430
Epoch 2/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4012 - loss: 0.7897 
Epoch 3/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5614 - loss: 0.7331 
Epoch 4/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7098 - loss: 0.6804 
Epoch 5/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7218 - loss: 0.6415 
Epoch 6/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6837 - loss: 0.6227 
Epoch 7/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 982us/step - accuracy: 0.7152 - loss: 0.5945
Epoch 8/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7420 - loss: 0.5492 
Epoch 9/100
[1m18/18[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x21952f4ba50>

### Evaluate

In [126]:
loss_and_metrics = model.evaluate(test_features, test_labels)
print(loss_and_metrics)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7502 - loss: 0.5853  
[0.5150721073150635, 0.7832167744636536]
Loss =  0.5150721073150635
Accuracy =  0.7832167744636536


### Run predictions

In [127]:
submit_df = pd.read_csv("data/test.csv")

In [153]:
# Prepare data
submit_df = prepare_data(engineer_features(submit_df))
submit_dataset = submit_df[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Deck']]
submit_dataset = submit_dataset.dropna()

passengerIds = submit_dataset.pop('PassengerId')

submit_dataset.tail()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Deck
409,3,0,3.0,1,1,13.775,0
411,1,0,37.0,1,0,90.0,3
412,3,0,28.0,0,0,7.775,0
414,1,0,39.0,0,0,108.9,3
415,3,0,38.5,0,0,7.25,0


In [175]:
predicted = model.predict(submit_dataset)

predictedPassengers = pd.DataFrame(columns=["PassengerId", "Survived"])
predictedPassengers.set_index("PassengerId")
predictedPassengers["PassengerId"] = passengerIds
predictedPassengers["Survived"] = predicted.round()

# predictedPassengers.tail(30)

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 962us/step


In [176]:
results = submit_df[['PassengerId']]
results["Survived"] = 0

results.update(predictedPassengers)
# results.tail(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results["Survived"] = 0


In [179]:
results.to_csv("result.csv", sep=",", index=False)