In [11]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [12]:
# Load data: read the train and test CSVs into DataFrames (train first).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv",
                     "/kaggle/input/titanic/test.csv")
)
# Keep the test PassengerId column; the submission frame is built from it.
test_passenger_id = test_df['PassengerId']

In [13]:
# Combine for shared processing.
# The list holds references to the two frames (no copies), so a column assigned
# on an element while iterating over it is written to train_df / test_df.
full_data = [train_df, test_df]

In [14]:
# Derive an integer-coded Title feature from Name: the first run of letters
# that follows a space and is immediately followed by a period.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_codes = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}

for dataset in full_data:
    # Raw string: in a plain literal '\.' is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    titles = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Collapse uncommon honorifics into 'Rare' and fold spelling variants.
    titles = titles.replace(rare_titles, 'Rare').replace(title_synonyms)
    # Titles outside the mapping (or names with no match) get code 0; cast to
    # int so the column is integer-coded like the other encoded features.
    dataset['Title'] = titles.map(title_codes).fillna(0).astype(int)

In [15]:
# Drop the free-text columns; PassengerId is removed from train only.
text_columns = ['Name', 'Ticket', 'Cabin']
train_df = train_df.drop(columns=text_columns + ['PassengerId'])
test_df = test_df.drop(columns=text_columns)

# Encode Sex and Embarked as integers (missing Embarked is treated as 'S').
sex_codes = {'female': 1, 'male': 0}
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in (train_df, test_df):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)
    frame['Embarked'] = frame['Embarked'].fillna('S').map(port_codes).astype(int)

In [16]:
# Fill missing ages, then bin Age into five ordinal bands (0-4).
# A missing Age takes the median Age of the row's (Sex, Pclass) group, computed
# separately within each frame. Ages are truncated to int before banding.
# Right-closed bands: <=16 -> 0, 17-32 -> 1, 33-48 -> 2, 49-64 -> 3, >64 -> 4.
age_band_edges = [-np.inf, 16, 32, 48, 64, np.inf]
for dataset in [train_df, test_df]:
    # Per-row group median (NaNs are skipped when computing the median).
    group_median_age = dataset.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    whole_years = dataset['Age'].fillna(group_median_age).astype(int)
    # pd.cut assigns every band in one pass from the untouched ages, rather
    # than overwriting Age step by step with codes that share the column with
    # the raw values still waiting to be binned.
    dataset['Age'] = pd.cut(whole_years, bins=age_band_edges, labels=False).astype(int)

In [17]:
# Family features: reduce SibSp/Parch to a single "travelling alone" flag,
# then add the Age x Pclass interaction term.
for frame in (train_df, test_df):
    family_size = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = (family_size == 1).astype(int)
    frame.drop(columns=['SibSp', 'Parch'], inplace=True)
    frame['Age*Class'] = frame['Age'] * frame['Pclass']

# Test-set Fare: treat +/-inf as missing, then impute with the test median.
finite_test_fare = test_df['Fare'].replace([np.inf, -np.inf], np.nan)
test_df['Fare'] = finite_test_fare.fillna(finite_test_fare.median())



In [18]:
# Bin Fare into four ordinal bands (0-3) using right-closed intervals:
# <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3.
# NOTE(review): the cut points are hard-coded; presumably they are the quartile
# edges of the training fares (what pd.qcut(train_df['Fare'], 4) would report)
# -- verify before reusing them on other data.
fare_band_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
for dataset in [train_df, test_df]:
    # Non-finite or missing fares become 0 and so land in the lowest band.
    clean_fare = dataset['Fare'].replace([np.inf, -np.inf], np.nan).fillna(0)
    dataset['Fare'] = pd.cut(clean_fare, bins=fare_band_edges, labels=False).astype(int)

In [19]:
# Final model inputs: target and features from train, features only from test.
y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")
X_test = test_df.drop(columns="PassengerId")

In [20]:
# Ensemble: soft-voting classifier over three base models.
# Soft voting combines predicted class probabilities, hence probability=True
# on the SVC.
log_clf = LogisticRegression(max_iter=2000)
# SVC's probability estimates come from an internal, randomly shuffled
# cross-validation; seeding it keeps the ensemble's output repeatable.
svc_clf = SVC(probability=True, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

ensemble = VotingClassifier(
    estimators=[('lr', log_clf), ('svc', svc_clf), ('rf', rf_clf)],
    voting='soft'
)

In [21]:
# Fit the ensemble on the training data, then label the test rows.
# fit() returns the fitted estimator, so the prediction is chained onto it.
y_pred = ensemble.fit(X_train, y_train).predict(X_test)


In [22]:
# Submission: one row per test passenger with its predicted label.
submission = pd.DataFrame({"PassengerId": test_passenger_id})
submission["Survived"] = y_pred
submission.to_csv("submission_now.csv", index=False)

# Accuracy scored on X_train / y_train themselves, i.e. not a held-out estimate.
train_accuracy_pct = round(ensemble.score(X_train, y_train) * 100, 2)
print("Training Accuracy:", train_accuracy_pct, "%")


Training Accuracy: 84.29 %
