In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
cols_to_drop = ['Ticket', 'Name', 'Cabin']
train.drop(columns=cols_to_drop, inplace=True)
test.drop(columns=cols_to_drop, inplace=True)

In [4]:
train.dropna(subset=['Embarked'], inplace=True)
test.dropna(subset=['Embarked'], inplace=True)

In [5]:
age_imputer = SimpleImputer(strategy='median')
train['Age'] = age_imputer.fit_transform(train[['Age']])
test['Age'] = age_imputer.transform(test[['Age']])

In [6]:
fare_imputer = SimpleImputer(strategy='median')
test['Fare'] = fare_imputer.fit_transform(test[['Fare']])

In [7]:
categorical_cols = ['Sex', 'Embarked']
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train[categorical_cols] = encoder.fit_transform(train[categorical_cols])
test[categorical_cols] = encoder.transform(test[categorical_cols])

In [8]:
X = train.drop(columns=['Survived'])
y = train['Survived']

In [9]:
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=1)

In [15]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(train_X, train_y)

In [16]:
val_predictions = model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print(f"Validation MAE: {val_mae:.4f}")

Validation MAE: 0.1685


In [17]:
test_X = test[train_X.columns]
test_predictions = model.predict(test_X)

In [19]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': test_predictions})
output.to_csv('submission.csv', index=False)