In [None]:
import pandas as pd

# Read the train and test CSVs from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

# Show the first rows of each frame under its caption to inspect the columns.
for caption, frame in (
    ("Train data preview:", train_data),
    ("Test data preview:", test_data),
):
    print(caption)
    display(frame.head())


In [None]:
# Normalise both frames in place so later cells can rely on an 'Embarked'
# column, a numeric 'Sex' column and a 'Title' column being present.
# The steps are identical for train and test, so they run in a single loop.
for frame in (train_data, test_data):
    # Add missing columns if necessary with default values.
    if 'Embarked' not in frame.columns:
        frame['Embarked'] = 'S'
    if 'Sex' not in frame.columns:
        frame['Sex'] = 'male'

    # Convert 'Sex' to numerical values (male -> 0, female -> 1).
    # Guarded by a dtype check so that re-running this cell does not push the
    # already-numeric codes through the string lookup, which would replace
    # every value with NaN.
    if not pd.api.types.is_numeric_dtype(frame['Sex']):
        frame['Sex'] = frame['Sex'].map({'male': 0, 'female': 1})

    # Create 'Title' only when it is missing. It is derived purely from the
    # encoded gender, so it carries no information beyond 'Sex'; rows whose
    # 'Sex' is neither 0 nor 1 keep the 'Unknown' placeholder.
    if 'Title' not in frame.columns:
        frame['Title'] = 'Unknown'
        frame.loc[frame['Sex'] == 0, 'Title'] = 'Mr'
        frame.loc[frame['Sex'] == 1, 'Title'] = 'Mrs'

# Check the updates
print("Updated train data with added columns if necessary:")
display(train_data.head())
print("Updated test data with added columns if necessary:")
display(test_data.head())


In [None]:
# Derive 'FamilySize' as the element-wise sum of the 'SibSp' and 'Parch'
# columns, for both the training and the test frame.
for frame in (train_data, test_data):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch'])

# Show the two source columns next to the new one to verify the addition.
family_cols = ['SibSp', 'Parch', 'FamilySize']
for caption, frame in (
    ("Train data with FamilySize:", train_data),
    ("Test data with FamilySize:", test_data),
):
    print(caption)
    display(frame[family_cols].head())


In [None]:
# Stack train on top of test, tagging every row with an 'is_train' flag
# (1 = train, 0 = test), so both parts receive the same dummy columns and
# can be separated again afterwards.
combined_data = pd.concat(
    [train_data.assign(is_train=1), test_data.assign(is_train=0)],
    axis=0,
    ignore_index=True,
)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column.
categorical_cols = ['Embarked', 'Title']
combined_data = pd.get_dummies(combined_data, columns=categorical_cols, drop_first=True)

# Recover the two original parts via the flag and discard the flag column.
# NOTE(review): if the test frame has no 'Survived' column, the concat above
# fills it with NaN and upcasts 'Survived' to float in both parts - confirm.
from_train = combined_data['is_train'].eq(1)
from_test = combined_data['is_train'].eq(0)
train_data = combined_data.loc[from_train].drop(columns='is_train')
test_data = combined_data.loc[from_test].drop(columns='is_train')

# Inspect the encoded columns.
for caption, frame in (
    ("One-hot encoded train data preview:", train_data),
    ("One-hot encoded test data preview:", test_data),
):
    print(caption)
    display(frame.head())


In [None]:
# Define X and y.
# Columns excluded from the feature matrix; errors='ignore' tolerates any of
# them being absent from a frame.
non_feature_cols = ['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin']
X_final = train_data.drop(columns=non_feature_cols, errors='ignore')
X_test = test_data.drop(columns=non_feature_cols, errors='ignore')

# Cast the target to int: if 'Survived' was upcast to float while train and
# test were concatenated, the classifier would otherwise learn (and later
# predict) 0.0/1.0 labels instead of 0/1.
y_final = train_data['Survived'].astype(int)

# Fill missing feature values with the per-column median of the TRAINING
# features. The same training medians are applied to the test features so
# that no statistic is estimated from the test set, matching the scaler,
# which is fitted on the training data only.
train_medians = X_final.median()
X_final = X_final.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Check the final features and target variable
print("Final training features:")
display(X_final.head())
print("Training target variable preview:")
display(y_final.head())
print("Final test features:")
display(X_test.head())


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only, then apply
# the same fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(X_final)
X_final_scaled = scaler.transform(X_final)
X_test_scaled = scaler.transform(X_test)

# Preview the scaled arrays, re-labelled with the original column names.
for caption, scaled, columns in (
    ("Scaled training features (preview):", X_final_scaled, X_final.columns),
    ("Scaled test features (preview):", X_test_scaled, X_test.columns),
):
    print(caption)
    display(pd.DataFrame(scaled, columns=columns).head())


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and fit it on the scaled
# training features.
ensemble_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_final_scaled, y_final
)

# Report accuracy on the very data the model was fitted on; this is not a
# held-out estimate.
train_accuracy = ensemble_model.score(X_final_scaled, y_final)
print("Model training complete. Model performance on training data:")
print(f"Training accuracy: {train_accuracy:.4f}")


In [None]:
# Predict a label for every row of the scaled test features.
test_predictions = ensemble_model.predict(X_test_scaled)

# Print the caption followed by the first ten predictions on the next line.
print("Predictions on test data (preview):", test_predictions[:10], sep="\n")


In [None]:
# Prepare the submission DataFrame: one row per test passenger with its
# predicted label.
# The predictions are cast to int because the training target may have been
# upcast to float when train and test were concatenated, in which case the
# classifier returns 0.0/1.0 and the CSV would contain float labels instead
# of the integer 0/1 values a submission is expected to hold.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions.astype(int),
})

# Save to CSV without the DataFrame index so only the two columns are written.
submission.to_csv('submission.csv', index=False)
print("Submission file created as 'submission.csv'. Here is a preview:")
display(submission.head())
