# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Read the training and test splits from the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Fixed integer codes, in alphabetical label order. These are the codes a
# LabelEncoder produces when it is fitted on the complete label set, but they
# no longer depend on which labels happen to be present in a given frame.
_SEX_CODES = {'female': 0, 'male': 1}
_EMBARKED_CODES = {'C': 0, 'Q': 1, 'S': 2}


def _encode_category(values, codes, column):
    """Map the labels in *values* to integers using the fixed *codes* table.

    Raises:
        ValueError: if *values* holds a label (or a missing value) that has
            no entry in *codes*.
    """
    encoded = values.map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(values[unmapped].astype(str)))
        raise ValueError(f"Unexpected values in column {column!r}: {unknown}")
    return encoded.astype(int)


def preprocess_data(df):
    """Build the model feature matrix from a raw passenger DataFrame.

    Selects the feature columns, fills missing values, encodes the
    categorical columns as integers and adds two engineered features.
    The input frame is not modified.

    Args:
        df: DataFrame containing at least the columns 'Pclass', 'Sex',
            'Age', 'SibSp', 'Parch', 'Fare' and 'Embarked'.

    Returns:
        A new DataFrame with those seven columns plus 'FamilySize' and
        'FarePerPerson'.

    Raises:
        KeyError: if a required column is absent from *df*.
        ValueError: if 'Sex' or 'Embarked' contains a label outside the
            known code tables.
    """
    # Select features based on importance
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    # Explicit copy: later column assignments must neither touch the caller's
    # frame nor trigger pandas' SettingWithCopyWarning.
    X = df[features].copy()

    # Fill numeric gaps with the column median of this frame.
    for column in ('Age', 'Fare'):
        X[column] = X[column].fillna(X[column].median())

    # Fill a missing port with the most frequent one; fall back to 'S' when
    # the column holds no values at all (mode() is then empty).
    ports = X['Embarked'].mode()
    default_port = ports.iloc[0] if not ports.empty else 'S'
    X['Embarked'] = X['Embarked'].fillna(default_port)

    # Encode categorical variables with the fixed tables so that separate
    # calls (e.g. for train and test) always agree on the codes.
    X['Sex'] = _encode_category(X['Sex'], _SEX_CODES, 'Sex')
    X['Embarked'] = _encode_category(X['Embarked'], _EMBARKED_CODES, 'Embarked')

    # Feature engineering
    # FamilySize counts the passenger too, so it is >= 1 for non-negative
    # SibSp/Parch and the division below cannot hit zero.
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
    X['FarePerPerson'] = X['Fare'] / X['FamilySize']

    return X

# Training target, then the engineered feature matrix for the same rows
y_train = train_df['Survived']
X_train = preprocess_data(train_df)

# Cross-validation function
def evaluate_model(model, X, y, cv=5, scoring='accuracy'):
    """Cross-validate *model* on (X, y) and print the per-fold and mean scores.

    Args:
        model: estimator handed to ``cross_val_score``.
        X: feature matrix.
        y: target values.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (defaults to 5, the value previously hard-coded).
        scoring: scoring setting forwarded to ``cross_val_score``
            (defaults to 'accuracy', the value previously hard-coded).

    Returns:
        The array of per-fold scores produced by ``cross_val_score``.
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    # Name the metric in the summary line; a non-string scorer has no
    # readable name, so fall back to a generic label.
    metric = scoring if isinstance(scoring, str) else 'score'
    print(f"Cross-validation scores: {scores}")
    # The spread is reported as two standard deviations of the fold scores.
    print(f"Mean {metric}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    return scores

# Baseline: a Random Forest with default hyperparameters and a fixed seed,
# cross-validated before any tuning
print("Base Random Forest Model:")
base_rf = RandomForestClassifier(random_state=42)
evaluate_model(base_rf, X_train, y_train)

# Hyperparameter tuning: exhaustive grid over forest size and tree-shape
# settings, each candidate scored by 5-fold cross-validated accuracy
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Take the model with the best parameters.
# GridSearchCV refits the winning configuration on the full training data by
# default (refit=True), so best_estimator_ is already trained; fitting it a
# second time would only repeat that work.
best_model = grid_search.best_estimator_

# Feature importance, most influential feature first
feature_importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': best_model.feature_importances_}
).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Run the test passengers through the same feature pipeline and predict
X_test = preprocess_data(test_df)
y_pred = best_model.predict(X_test)

# Assemble the submission table: one integer Survived label per PassengerId
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': y_pred})
submission = submission.astype({'Survived': int})
submission.to_csv('titanic_submission.csv', index=False)

print("\nSubmission file 'titanic_submission.csv' has been created.")
print(f"Number of predictions: {len(submission)}")

# Report accuracy on the training rows themselves. These are the rows the
# model was fitted on, so this is an in-sample figure, not an estimate of
# performance on unseen data.
train_predictions = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"\nFinal model accuracy on training set: {train_accuracy:.3f}")