In [21]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Kaggle input locations of the competition CSVs, plus the seed reused by the
# validation split and the random forest.
TRAIN_PATH = '/kaggle/input/dma-25-kaggle-competition/train.csv'
TEST_PATH = '/kaggle/input/dma-25-kaggle-competition/test.csv'
RANDOM_SEED = 42

# 1. Load Data
train_df, test_df = (pd.read_csv(path) for path in (TRAIN_PATH, TEST_PATH))

# Keep the test ids aside: PassengerId is dropped from the features later but
# is needed again for the submission file.
test_ids = test_df['PassengerId']

# Stack the train features (label removed) on top of the test rows so both
# sets go through identical preprocessing.
full_df = pd.concat(
    [train_df.drop(columns='Survived'), test_df],
    ignore_index=True,
)

# 2. Feature Engineering and Preprocessing
# Extract the honorific from each name: the run of letters that follows a
# space and ends at a period. The pattern is a raw string so that `\.` is a
# regex escape rather than an invalid Python string escape.
full_df['Title'] = full_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Fold the alternate spellings into 'Miss' BEFORE collapsing rare titles.
# In the opposite order, any of these titles with fewer than 10 occurrences
# has already been renamed to 'Rare' and this mapping never matches.
full_df['Title'] = full_df['Title'].replace(['Mme', 'Mlle', 'Ms'], 'Miss')

# Group every title that occurs fewer than 10 times under a single 'Rare'
# label (counts are taken once, after the mapping above).
title_counts = full_df['Title'].value_counts()
rare_titles = title_counts[title_counts < 10].index
full_df['Title'] = full_df['Title'].replace(rare_titles, 'Rare')

# Family size counts the passenger plus siblings/spouses and parents/children;
# IsAlone is 1 when that total is exactly 1, otherwise 0.
full_df['FamilySize'] = full_df['SibSp'] + full_df['Parch'] + 1
full_df['IsAlone'] = (full_df['FamilySize'] == 1).astype(int)

# Impute Missing Values (using direct assignment)
# Numeric gaps get the column median; Embarked gets its most frequent value.
# The statistics are computed over the combined train + test frame.
full_df['Age'] = full_df['Age'].fillna(full_df['Age'].median())
full_df['Fare'] = full_df['Fare'].fillna(full_df['Fare'].median())
full_df['Embarked'] = full_df['Embarked'].fillna(full_df['Embarked'].mode()[0])

# Drop non-essential columns
# Name, SibSp and Parch have already been turned into Title / FamilySize above.
full_df = full_df.drop(
    columns=['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'PassengerId'],
)

# 3. Encode Categorical Features
# Replace each listed column with indicator columns; drop_first keeps k-1
# indicators for a column with k levels.
categorical_cols = ['Sex', 'Embarked', 'Pclass', 'Title']
full_df = pd.get_dummies(
    data=full_df,
    columns=categorical_cols,
    drop_first=True,
)

# 4. Split Data
# full_df was built as train rows followed by test rows, so slicing at the
# train length recovers the two feature sets.
n_train_rows = len(train_df)
X_train, X_test = full_df.iloc[:n_train_rows], full_df.iloc[n_train_rows:]
y_train = train_df['Survived']

# 4b. Validation Split (For internal testing)
# Hold out 20% of the labelled rows; the fixed seed makes the split repeatable.
X_train_model, X_val, y_train_model, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# 5. Model Training and Evaluation
# Hyperparameters are gathered in one place; the seed makes the forest
# reproducible across runs.
rf_params = {
    'n_estimators': 300,
    'max_depth': 11,
    'min_samples_leaf': 3,
    'random_state': RANDOM_SEED,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train_model, y_train_model)

# Score the fitted model on the held-out validation rows.
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# 6. Final Predictions and Submission
# NOTE(review): `model` was fitted on X_train_model (the 80% split) only, not
# on the full X_train; confirm that is intended for the submitted predictions.
predictions = model.predict(X_test)

# Pair each saved test PassengerId with its predicted label, cast to int.
submission_columns = {
    'PassengerId': test_ids,
    'Survived': predictions.astype(int),
}
submission_df = pd.DataFrame(submission_columns)

print("--- Submission Preview ---")
print(submission_df.head())

# index=False keeps the DataFrame index out of the CSV.
submission_df.to_csv('submission_data144_lab5a_t3.csv', index=False)

/kaggle/input/dma-25-kaggle-competition/train.csv
/kaggle/input/dma-25-kaggle-competition/test.csv
/kaggle/input/dma-25-kaggle-competition/gender_submission.csv
Validation Accuracy: 0.8436
--- Submission Preview ---
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
