In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import optuna
import joblib

# Load and preprocess the training data
train_df = pd.read_csv('train.csv')
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
train_df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'], drop_first=True)

# Separate features and target variable
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

# Feature Selection (RFE)
model = RandomForestClassifier(n_estimators=100, random_state=42)
selector = RFE(model, n_features_to_select=5)
selector.fit(X, y)
selected_features_rfe = X.columns[selector.support_]
print("Selected Features using RFE:", selected_features_rfe.tolist())

# Optuna for hyperparameter optimization
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    
    X_train, X_test, y_train, y_test = train_test_split(X[selected_features_rfe], y, test_size=0.2, random_state=42)
    
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Run Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Best Trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Retrain the model with best parameters
best_params = trial.params
final_model = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], random_state=42)
final_model.fit(X[selected_features_rfe], y)

# Save the model
joblib.dump(final_model, 'titanic_model.pkl')

# Load and preprocess the test data
test_df = pd.read_csv('test.csv')
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)
test_df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked'], drop_first=True)

# Ensure the test DataFrame has the same columns as the training set
# Get columns used during training
train_columns = X[selected_features_rfe].columns

# Reindex the test DataFrame
X_test_final = test_df.reindex(columns=train_columns, fill_value=0)

# Make predictions on the test set
predictions = final_model.predict(X_test_final)

# Create a submission DataFrame
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': predictions
})

# Save submission to CSV
submission_df.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
[I 2024-10-30 19:11:20,223] A new study created in memory with name: no-name-38079fed-ac9b-49dd

Selected Features using RFE: ['PassengerId', 'Pclass', 'Age', 'Fare', 'Sex_male']


[I 2024-10-30 19:11:20,595] Trial 1 finished with value: 0.8044692737430168 and parameters: {'n_estimators': 141, 'max_depth': 14}. Best is trial 0 with value: 0.8268156424581006.
[I 2024-10-30 19:11:20,771] Trial 2 finished with value: 0.8100558659217877 and parameters: {'n_estimators': 119, 'max_depth': 17}. Best is trial 0 with value: 0.8268156424581006.
[I 2024-10-30 19:11:20,932] Trial 3 finished with value: 0.8100558659217877 and parameters: {'n_estimators': 108, 'max_depth': 18}. Best is trial 0 with value: 0.8268156424581006.
[I 2024-10-30 19:11:21,047] Trial 4 finished with value: 0.7988826815642458 and parameters: {'n_estimators': 82, 'max_depth': 7}. Best is trial 0 with value: 0.8268156424581006.
[I 2024-10-30 19:11:21,321] Trial 5 finished with value: 0.8044692737430168 and parameters: {'n_estimators': 187, 'max_depth': 14}. Best is trial 0 with value: 0.8268156424581006.
[I 2024-10-30 19:11:21,467] Trial 6 finished with value: 0.8156424581005587 and parameters: {'n_estima

Best Trial:
  Value: 0.8268156424581006
  Params:
    n_estimators: 99
    max_depth: 16
Submission file created: submission.csv
