# Import the necessary libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve

ModuleNotFoundError: No module named 'numpy'

# Load the Dataset

In [None]:
# Read the training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# Display the first few rows

In [None]:
# Preview the first rows of the training DataFrame; as the last expression in
# the cell, the result is what the notebook displays.
train_data.head()

# Check for missing values

In [None]:
# Count the missing entries in each column of the training data and print them.
missing_per_column = train_data.isna().sum()
print(missing_per_column)

# Survival rate

In [None]:
# Mean of the 'Survived' column, reported as a percentage with two decimals.
survival_pct = train_data['Survived'].mean() * 100
print(f"Survival Rate: {survival_pct:.2f}%")

# Visualizations

In [None]:
# Bar chart of 'Survived' grouped by 'Sex', titled and shown immediately.
gender_ax = sns.barplot(data=train_data, x='Sex', y='Survived')
gender_ax.set_title('Survival Rate by Gender')
plt.show()

# Fill missing Age values with median

In [None]:
# Learn the median of 'Age', then overwrite the column with the imputed values.
imputer = SimpleImputer(strategy='median')
imputer.fit(train_data[['Age']])
train_data['Age'] = imputer.transform(train_data[['Age']])

# Encode categorical variables

In [None]:
# Replace the 'Sex' labels with the integer codes learned by a LabelEncoder.
encoder = LabelEncoder()
encoder.fit(train_data['Sex'])
train_data['Sex'] = encoder.transform(train_data['Sex'])

# Select features and target

In [None]:
# Model inputs (five passenger columns) and the 'Survived' target column.
X = train_data.loc[:, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]
y = train_data.loc[:, 'Survived']

# Split the data into training and validation sets

In [None]:
# Hold out 20% of the rows (test_size=0.2) for validation; random_state=42
# fixes the shuffle so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier

In [None]:
# Train a 100-estimator random forest with a fixed seed on the training split.
rf_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

# Make predictions

In [None]:
# Class predictions from the fitted random forest for the validation features.
rf_predictions = rf_model.predict(X_val)

# Evaluate the model

In [None]:
# Score the validation predictions with accuracy, precision, and recall.
accuracy, precision, recall = (
    metric(y_val, rf_predictions)
    for metric in (accuracy_score, precision_score, recall_score)
)

print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

In [None]:
# AUC-ROC, computed from column 1 of the predicted class probabilities.
rf_predictions_proba = rf_model.predict_proba(X_val)
positive_class_scores = rf_predictions_proba[:, 1]
auc = roc_auc_score(y_val, positive_class_scores)

print(f"AUC: {auc:.4f}")

In [None]:
# Draw the random forest ROC curve together with a diagonal reference line.
fpr, tpr, _ = roc_curve(y_val, rf_predictions_proba[:, 1])
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label='Random Forest')
roc_ax.plot([0, 1], [0, 1], 'k--')  # dashed black line from (0, 0) to (1, 1)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()

# Advanced Model Tuning

In [None]:
# Define the parameter grid to search.
# 'auto' is deliberately left out of max_features: for classifiers it was only
# an alias of 'sqrt' (so it repeated the same fits), and scikit-learn 1.3+
# rejects it outright, which would make every such candidate fail.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# Create a base model. The fixed seed makes the grid-search scores repeatable
# and matches the random_state=42 used by the other models in this notebook.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Instantiate the grid search model: 3-fold cross-validation over param_grid,
# with n_jobs=-1 and verbose progress output.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Fit the grid search to the training split, then show the best parameter
# combination it found (the last expression is the cell's displayed output).
grid_search.fit(X_train, y_train)
grid_search.best_params_

Gradient Boosting Example:

In [None]:
# Build the Gradient Boosting classifier with a fixed seed, then fit it on the
# training split.
gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)
gb_clf.fit(X_train, y_train)


In [None]:
# Predict on the validation set. X_val is the hold-out frame produced by
# train_test_split; the previously used X_val_split is not defined anywhere in
# the visible notebook and raised NameError.
gb_predictions = gb_clf.predict(X_val)

In [None]:
# Evaluate the model against the validation labels. y_val comes from
# train_test_split; the previously used y_val_split is not defined anywhere in
# the visible notebook and raised NameError.
gb_accuracy = accuracy_score(y_val, gb_predictions)
print(f"Validation Accuracy of Gradient Boosting: {gb_accuracy*100:.2f}%")

Feature importance:

In [None]:
# Feature importance from the Gradient Boosting model
feature_importance = gb_clf.feature_importances_

# Label each bar with its column name. The names are taken from X_train, the
# frame the model was fitted on; the original passed an undefined `features`
# name and raised NameError.
feature_names = list(X_train.columns)

# Plot
sns.barplot(x=feature_importance, y=feature_names)
plt.title('Feature Importance')
plt.show()

XGBoost Example:

In [None]:
from xgboost import XGBClassifier

# Initialize and fit the XGBoost classifier on the same training split as the
# other models. The *_split names used before are not defined anywhere in the
# visible notebook and raised NameError.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases; it
# is kept so older versions behave as before — confirm against the installed
# version.
xgb_clf = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)

# Predict on the validation set
xgb_predictions = xgb_clf.predict(X_val)

# Evaluate the model
xgb_accuracy = accuracy_score(y_val, xgb_predictions)