# Kaggle Titanic competition
Predict passenger survival with a random forest model (YDF).

### Install and import packages

In [None]:
%pip install pandas numpy ydf

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ydf

### Open input data

In [4]:
# Training data; the path is relative to the notebook's working directory.
df = pd.read_csv("data/train.csv")

### Engineer new features

In [5]:
# Deck letters that are accepted as valid; anything else counts as unknown.
decks = list("ABCDEFG")


def extract_deck_from_cabin(cabin):
    """Return the deck letter that starts a cabin code, or NaN.

    NaN is returned when ``cabin`` is missing, or when its first character
    is not one of the letters listed in ``decks``.
    """
    if not pd.isna(cabin):
        letter = cabin[0]
        if letter in decks:
            return letter
    return np.nan

def engineer_features(dataframe):
    """Return a copy of ``dataframe`` with two derived columns added.

    ``Deck`` is the result of ``extract_deck_from_cabin`` applied to each
    ``Cabin`` value; ``FamilySize`` is ``SibSp + Parch``. The input frame is
    left untouched.
    """
    enriched = dataframe.copy()

    deck_letters = enriched["Cabin"].map(extract_deck_from_cabin)
    relatives_aboard = enriched["SibSp"] + enriched["Parch"]

    enriched["Deck"] = deck_letters
    enriched["FamilySize"] = relatives_aboard
    return enriched

### Prepare data

In [6]:
# Integer code per deck letter (A=1 ... G=7); the NaN key gives missing decks
# the code 0. NaN never compares equal to itself, so that key is only found
# when the lookup is made with the very same np.nan object.
deck_dict = {letter: code for code, letter in enumerate("ABCDEFG", start=1)}
deck_dict[np.nan] = 0

def port_map(port):
    """Translate an embarkation port letter into its integer code.

    "S" gives 1, "C" gives 2 and "Q" gives 3; every other value gives None.
    """
    for code, letter in enumerate(("S", "C", "Q"), start=1):
        if port == letter:
            return code
    return None
        
def prepare_data(dataframe):
    """Return a copy of ``dataframe`` with numerically encoded columns.

    Reads the columns ``Sex``, ``Deck``, ``Embarked``, ``Age`` and ``Pclass``.
    The input frame is not modified.

    * ``Sex``: 1 for ``'female'``, 0 for any other value.
    * ``Deck``: the code stored in ``deck_dict``; a missing deck gets the
      code stored under the NaN key (0 if that key is absent).
    * ``Embarked``: the result of ``port_map`` (None for unrecognised values).
    * ``Age``: missing values are replaced by the mean age of the row's
      ``Pclass``, computed from this frame only.

    Raises:
        KeyError: if ``Deck`` holds a non-missing value that is not a key of
            ``deck_dict``.
    """
    new_df = dataframe.copy()
    new_df['Sex'] = new_df['Sex'].map(lambda x: 1 if x == 'female' else 0)

    # NaN != NaN, so a plain ``deck_dict[x]`` only finds the NaN key when ``x``
    # is the very same ``np.nan`` object; any other NaN float raises KeyError.
    # Missing values are therefore detected explicitly.
    missing_deck_code = deck_dict.get(np.nan, 0)
    new_df['Deck'] = new_df['Deck'].map(
        lambda x: missing_deck_code if pd.isna(x) else deck_dict[x]
    )

    new_df['Embarked'] = new_df['Embarked'].map(port_map)

    # Fill missing age data with mean value of Pclass
    new_df['Age'] = new_df['Age'].fillna(new_df.groupby('Pclass')['Age'].transform('mean'))

    return new_df

In [7]:
prepared_df = prepare_data(engineer_features(df))

# Columns used for training, including the "Survived" label. A smaller set
# (Pclass, Sex, Age) was also tried.
training_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FamilySize',
    'Fare', 'Deck', 'Embarked', 'Survived',
]

# Select those columns, then remove rows with empty values.
dataset = prepared_df[training_columns].dropna()


# prepared_df.head()

In [8]:
# Display the last rows of the cleaned training table.
dataset.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,FamilySize,Fare,Deck,Embarked,Survived
886,2,0,27.0,0,0,0,13.0,0,1.0,0
887,1,1,19.0,0,0,0,30.0,2,1.0,1
888,3,1,25.14062,1,2,3,23.45,0,1.0,0
889,1,0,26.0,0,0,0,30.0,3,2.0,1
890,3,0,32.0,0,0,0,7.75,0,3.0,0


In [9]:
# Random 80 % of the rows for training; the remaining 20 % are held out for
# evaluation. The fixed random_state keeps the split, and therefore the
# reported metrics, identical across kernel restarts.
train_data = dataset.sample(frac=0.80, random_state=42)
test_data = dataset.drop(train_data.index)

### Build model

In [10]:
# Configure a random forest learner that predicts the "Survived" column using
# 1000 trees. Training happens in a later cell via .train().
model = ydf.RandomForestLearner(label="Survived", num_trees=1000)

### Train

In [11]:
# Train on the training split; the result of train() replaces the learner
# previously bound to `model`.
# NOTE(review): because the name is rebound, this cell presumably cannot be
# re-run on its own without first re-running the "Build model" cell — confirm.
model = model.train(train_data)

Train model on 711 examples
Model trained in 0:00:00.230108


### Evaluate

In [12]:
# Evaluate the trained model on the held-out rows in test_data.
model.evaluate(test_data)

Label \ Pred,0,1
0,94,21
1,8,55


In [None]:
# Display the description of the trained model.
model.describe()

### Run predictions

In [14]:
# Passengers to predict for the submission; path is relative to the notebook.
submit_df = pd.read_csv("data/test.csv")

In [15]:
# Prepare data
# The prepared frame gets its own name so that `submit_df` keeps holding the
# raw CSV. Rebinding `submit_df` made this cell non-idempotent: a second run
# would encode the already-encoded Sex/Embarked columns again and dropna()
# would then discard every row.
submit_prepared = prepare_data(engineer_features(submit_df))
submit_dataset = submit_prepared[
    ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FamilySize', 'Fare', 'Deck', 'Embarked']
].dropna()

# Keep the ids aside (same row index as the features) and remove them from
# the feature table passed to the model.
passengerIds = submit_dataset.pop('PassengerId')

submit_dataset.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,FamilySize,Fare,Deck,Embarked
413,3,0,24.027945,0,0,0,8.05,0,1
414,1,1,39.0,0,0,0,108.9,3,2
415,3,0,38.5,0,0,0,7.25,0,1
416,3,0,24.027945,0,0,0,8.05,0,1
417,3,0,24.027945,1,1,2,22.3583,0,2


In [16]:
# One prediction per row of submit_dataset.
# NOTE(review): presumably the predicted survival probability, which is why
# it is rounded to 0/1 below — confirm against the ydf documentation.
predicted = model.predict(submit_dataset)

# Pair each rounded prediction with its passenger id. The frame takes its row
# index from passengerIds, and the predictions are matched by position.
# (A set_index("PassengerId") call whose result was discarded, and so had no
# effect, has been removed.)
predictedPassengers = pd.DataFrame({
    "PassengerId": passengerIds,
    "Survived": predicted.round(),
})

# predictedPassengers.tail(30)

In [17]:
# Start from every passenger in the submission file with a default label of 0;
# rows that were dropped before prediction keep this default.
results = submit_df[['PassengerId']].copy()
results["Survived"] = 0

# Overwrite the default wherever a prediction exists (aligned on row index).
results.update(predictedPassengers)

# update() can leave the columns as floats (e.g. 1.0), depending on the pandas
# version; cast back so the CSV holds integer ids and integer 0/1 labels.
results = results.astype({"PassengerId": "int64", "Survived": "int64"})

# results.tail(50)

In [18]:
# Write the submission file; index=False keeps the row index out of the CSV.
results.to_csv("result.csv", sep=",", index=False)