In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def encode_features(data):
    """Return a copy of ``data`` with numeric codes for its categorical columns.

    Three columns are added: ``Sex_num`` (from ``Sex``), ``Pclass_num`` (from
    ``Pclass``) and ``Embarked_num`` (from ``Embarked``).  Each is produced by
    looking the source value up in a fixed label-to-code table; any value
    with no entry in its table is encoded as -1.  The frame passed in is left
    untouched.
    """
    # (source column, encoded column, label -> code table)
    encodings = (
        ('Sex', 'Sex_num', {'Male': 0, 'Female': 1}),
        ('Pclass', 'Pclass_num',
         {'First class': 1, 'Second class': 2, 'Third class': 3}),
        ('Embarked', 'Embarked_num', {'S': 0, 'C': 1, 'Q': 2}),
    )

    encoded = data.copy()
    for source_column, target_column, table in encodings:
        encoded[target_column] = encoded[source_column].map(table)

    # Values missing from a table come back from ``map`` as NaN; turn those
    # into the -1 sentinel in one pass over the three new columns.
    numeric_columns = [target_column for _, target_column, _ in encodings]
    encoded[numeric_columns] = encoded[numeric_columns].fillna(-1)
    return encoded

def load_and_prepare_data(path):
    """Read a passenger CSV from ``path`` and return it cleaned and encoded.

    Missing ``Age`` and ``Fare`` values are replaced with the mean of the rows
    that share the same ``Sex`` and ``Pclass``.  Missing ``Cabin`` and
    ``Embarked`` values become the string 'Unknown'.  ``Pclass`` and ``Sex``
    are then rewritten as display labels, and the frame is passed through
    ``encode_features`` before being returned.
    """
    pclass_labels = {1: 'First class', 2: 'Second class', 3: 'Third class'}
    sex_labels = {'male': 'Male', 'female': 'Female'}

    frame = pd.read_csv(path)

    # Numeric gaps: fill from the mean of the matching (Sex, Pclass) group.
    # This runs before the relabelling below, so groups use the raw values.
    for column in ('Age', 'Fare'):
        group_mean = frame.groupby(['Sex', 'Pclass'])[column].transform('mean')
        frame[column] = frame[column].fillna(group_mean)

    # Text gaps: mark them explicitly instead of leaving NaN.
    for column in ('Cabin', 'Embarked'):
        frame[column] = frame[column].fillna('Unknown')

    frame['Pclass'] = frame['Pclass'].replace(pclass_labels)
    frame['Sex'] = frame['Sex'].replace(sex_labels)
    return encode_features(frame)

def find_optimal_threshold(data):
    """Pick the probability cutoff that gives the highest accuracy on ``data``.

    ``data`` must provide a ``PredictedChance`` column (predicted
    probabilities) and a ``Survived`` column (true labels).  Every candidate
    produced by ``np.arange(0.1, 0.9, 0.01)`` is tried in increasing order; a
    row is labelled 1 when its probability is at least the candidate.  The
    first candidate reaching the best accuracy is returned.  If no candidate
    scores above 0, the default of 0.5 is returned.
    """
    chances = data['PredictedChance']
    outcomes = data['Survived']

    chosen_cutoff = 0.5
    top_accuracy = 0
    for cutoff in np.arange(0.1, 0.9, 0.01):
        labelled = (chances >= cutoff).astype('int')
        hit_rate = (labelled == outcomes).mean()
        # Strict comparison: on a tie the earlier (lower) cutoff is kept.
        if hit_rate > top_accuracy:
            chosen_cutoff, top_accuracy = cutoff, hit_rate
    return chosen_cutoff

def train_classifier(train_x, val_x, train_y, val_y):
    """Fit one random forest per candidate ``max_leaf_nodes`` and keep the best.

    Each forest is trained on ``train_x``/``train_y`` with ``random_state=1``
    and scored by accuracy on ``val_x``/``val_y``.  A validation row is
    labelled 1 when column 1 of ``predict_proba`` is at least 0.5.  When
    several candidates tie, the earliest one in the list is kept.

    Args:
        train_x: Training feature table.
        val_x: Validation feature table.
        train_y: Training labels.
        val_y: Validation labels.

    Returns:
        The fitted ``RandomForestClassifier`` with the highest validation
        accuracy.  A fitted model is always returned, even when every
        candidate scores 0 on the validation set.
    """
    best_acc = 0
    best_model = None
    for max_leaf_nodes in [2, 5, 10, 20, 48, 50, 75, 100, 200, 500]:
        titanic_model = RandomForestClassifier(max_leaf_nodes=max_leaf_nodes, random_state=1)
        titanic_model.fit(train_x, train_y)
        val_probs = titanic_model.predict_proba(val_x)[:, 1]
        predicted_val = (val_probs >= 0.5).astype(int)
        acc = accuracy_score(val_y, predicted_val)
        # Accept the first candidate unconditionally: with a plain
        # ``acc > best_acc`` check a run where every candidate scores 0 would
        # return None and callers would fail on ``predict_proba``.
        if best_model is None or acc > best_acc:
            best_acc = acc
            best_model = titanic_model
    return best_model
 
def train_random_forest(train_data):
    """Train the survival model and tune its decision threshold.

    The prepared ``train_data`` is split into training and validation parts
    (``train_test_split`` with ``random_state=1``).  ``train_classifier``
    chooses the model, and ``find_optimal_threshold`` then picks the cutoff
    using the validation rows.

    Returns:
        A tuple ``(model, validation_frame, threshold)``.  The validation
        frame holds the validation features plus a ``PredictedChance`` column
        (column 1 of the model's ``predict_proba``) and the true ``Survived``
        labels.
    """
    feature_columns = ['Sex_num', 'Pclass_num', 'SibSp', 'Parch', 'Embarked_num', 'Age', 'Fare']
    inputs = train_data[feature_columns]
    target = train_data['Survived']

    fit_x, holdout_x, fit_y, holdout_y = train_test_split(inputs, target, random_state=1)
    model = train_classifier(fit_x, holdout_x, fit_y, holdout_y)

    # ``assign`` works on a copy, so the validation features stay unmodified.
    holdout = holdout_x.assign(
        PredictedChance=model.predict_proba(holdout_x)[:, 1],
        Survived=holdout_y,
    )
    threshold = find_optimal_threshold(holdout)

    return model, holdout, threshold

def predict_test(test_data, model, threshold):
    """Return 0/1 predictions for ``test_data`` using ``model`` and ``threshold``.

    The feature columns are selected from ``test_data`` and passed to
    ``model.predict_proba``.  A row is labelled 1 when column 1 of the result
    is greater than or equal to ``threshold``, otherwise 0.
    """
    feature_columns = ['Sex_num', 'Pclass_num', 'SibSp', 'Parch', 'Embarked_num', 'Age', 'Fare']
    survival_chance = model.predict_proba(test_data[feature_columns])[:, 1]
    above_cutoff = survival_chance >= threshold
    return above_cutoff.astype('int')

def main():
    """Run the whole pipeline: load data, train, predict and write the CSV.

    Reads the Kaggle Titanic train and test files, trains the model on the
    training data, predicts ``Survived`` for the test data, and writes the
    ``PassengerId`` and ``Survived`` columns to ``submission.csv`` without
    the index.
    """
    train_data = load_and_prepare_data('/kaggle/input/titanic/train.csv')
    test_data = load_and_prepare_data('/kaggle/input/titanic/test.csv')

    # The validation frame returned with the model is not needed here.
    model, _, threshold = train_random_forest(train_data)
    test_data['Survived'] = predict_test(test_data, model, threshold)

    test_data[['PassengerId', 'Survived']].to_csv('submission.csv', index=False)

# Run the whole pipeline as soon as this code is executed.
main()