Accuracy: 0.7988826815642458


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the training data
train_data = pd.read_csv('train.csv')

# Drop the columns this model does not use.
# NOTE(review): any identifier-like column that remains in the frame (e.g. a
# passenger id, if train.csv has one) will be fed to the models as a feature
# -- confirm that this is intended.
train_data = train_data.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

# Fill missing values with the column median. The result is assigned back to
# the column on purpose: `train_data[col].fillna(..., inplace=True)` is chained
# assignment on a column selection, which pandas deprecates and which no
# longer updates `train_data` once Copy-on-Write is active.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].median())

# Convert 'Sex' column to numerical values (integer codes). The column is
# one-hot encoded again below, so it ends up as indicator column(s).
label_encoder = LabelEncoder()
train_data['Sex'] = label_encoder.fit_transform(train_data['Sex'])

# Create additional features
# FamilySize counts the passenger plus siblings/spouses and parents/children.
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
# IsAlone starts at 1 and is cleared for anyone with a family size above 1.
train_data['IsAlone'] = 1
train_data.loc[train_data['FamilySize'] > 1, 'IsAlone'] = 0
# Right-closed age buckets: (0, 18], (18, 35], (35, 50], (50, 100].
# Ages outside that range become NaN, which get_dummies encodes as all zeros.
train_data['AgeGroup'] = pd.cut(
    train_data['Age'],
    bins=[0, 18, 35, 50, 100],
    labels=[1, 2, 3, 4],
)

# Convert categorical features to one-hot encoding; drop_first removes the
# first level of each feature so the remaining indicators are not redundant.
train_data = pd.get_dummies(
    train_data,
    columns=['Pclass', 'Sex', 'AgeGroup'],
    drop_first=True,
)

# Separate the target column from the feature matrix
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit the two ensemble members: a random forest and a
# gradient-boosted model (fit() returns the fitted estimator itself)
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=8,
    min_samples_leaf=2,
    random_state=42,
).fit(X_train, y_train)
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Predict on test data
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)

# Combine the two models' hard votes. With only two voters, "more than one
# vote" means both models must predict 1; any disagreement resolves to 0.
# (Assumes the Survived labels are 0/1 -- confirm against train.csv.)
# The comparison is vectorised over the prediction arrays instead of looping
# over row indices; tolist() keeps y_pred a plain list of Python ints.
y_pred = ((rf_pred + gb_pred) > 1).astype(int).tolist()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.7988826815642458
