BASELINE

In [1]:
#Import Libraries 
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
# Show every file available under the Kaggle input mount, one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Read the Titanic training and test splits into DataFrames.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [3]:
# Preview Training Data
# Prints the first rows of the training frame as plain text (head() defaults to five rows).
print(train_data.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
# Survival outcomes split by passenger sex.
is_female = train_data["Sex"] == 'female'
is_male = train_data["Sex"] == 'male'
women = train_data.loc[is_female, "Survived"]
men = train_data.loc[is_male, "Survived"]

# Survived is a 0/1 flag, so sum / count is the share of each group that survived.
rate_women = sum(women) / len(women)
rate_men = sum(men) / len(men)

# NOTE: the values printed are fractions in [0, 1], despite the "%" in the label.
print("% of women who survived:", rate_women)
print("% of men who survived:", rate_men)

% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924


In [5]:
# Feature Selection
# One-hot encode the non-numeric column(s) among the selected features.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
# get_dummies runs on each split independently, so a category that is missing
# from one split would leave the two matrices with different columns. Reindex
# the test matrix to the training layout: dummies absent from the test split
# become all-zero columns, dummies unseen in training are dropped, and the
# column order matches what the model is fitted on.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)
y = train_data["Survived"]


In [6]:
# Fit the baseline random forest on the encoded training features.
model = RandomForestClassifier(
    n_estimators=100,   # number of trees
    max_depth=5,        # cap on tree depth
    random_state=1,     # fixed seed so the fit is repeatable
)
model.fit(X, y)


In [7]:
# Make Predictions on Test Set
# X_test is the one-hot encoded test matrix built next to X in the feature-selection cell.
predictions = model.predict(X_test)

In [8]:
# Write the submission file: the test PassengerId column plus the predicted label.
output = test_data[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!


In [9]:
# Score the baseline on the same rows it was fitted on.
# These are in-sample figures, not an estimate of performance on unseen data.
train_predictions = model.predict(X)

# Each metric takes (true labels, predicted labels).
accuracy, conf_matrix, f1 = (
    metric(y, train_predictions)
    for metric in (accuracy_score, confusion_matrix, f1_score)
)

print("\n--- Evaluation on Training Data ---")
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("F1 Score:", f1)


--- Evaluation on Training Data ---
Accuracy: 0.8159371492704826
Confusion Matrix:
 [[492  57]
 [107 235]]
F1 Score: 0.7413249211356466


My machine learning implementation 

In [10]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [11]:
# Read fresh copies of the train/test splits for the feature-engineering pipeline.
train_df, test_df = map(
    pd.read_csv,
    ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv"),
)
# Keep the test ids aside; they are needed again when the submission is written.
test_passenger_id = test_df['PassengerId']

In [12]:
# Combine for Shared Processing
# The list holds references to the two frames (no copies), so column edits made
# while iterating over it land on train_df and test_df themselves.
# NOTE(review): a later cell rebinds train_df/test_df to the results of drop();
# from then on this list still refers to the pre-drop frames.
full_data = [train_df, test_df]

In [13]:
# Extract Title from Name and encode it as a small integer
# Titles treated as one 'Rare' group.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Variant spellings folded into the common titles.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
# Integer code per title; anything unmapped (or not extracted) becomes 0.
title_codes = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}

for dataset in full_data:
    # Raw string: '\.' inside a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions. The pattern is unchanged:
    # a space, a run of letters (captured), then a literal dot.
    titles = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(rare_titles, 'Rare').replace(title_aliases)
    # astype(int) keeps the column integer-typed in both frames even when only
    # one of them needed the fillna(0) fallback.
    dataset['Title'] = titles.map(title_codes).fillna(0).astype(int)

In [14]:
# Remove free-text / identifier columns. The test frame keeps PassengerId here;
# it is dropped later when the final feature matrix is built.
train_df = train_df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
test_df = test_df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Integer codes for the two categorical columns.
sex_codes = {'female': 1, 'male': 0}
port_codes = {'S': 0, 'C': 1, 'Q': 2}

for frame in (train_df, test_df):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)
    # Missing embarkation ports are filled with 'S' before encoding.
    frame['Embarked'] = frame['Embarked'].fillna('S').map(port_codes).astype(int)

In [15]:
# Impute missing ages with the median of the passenger's (Sex, Pclass) group,
# then replace each age with an ordinal band code 0-4.
guess_ages = np.zeros((2, 3))   # rows: Sex code 0/1, columns: Pclass 1..3
# Right-closed bands: <=16, (16, 32], (32, 48], (48, 64], >64
age_band_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for frame in (train_df, test_df):
    age_missing = frame['Age'].isnull()
    for sex_code in range(2):
        for class_idx in range(3):
            in_group = (frame['Sex'] == sex_code) & (frame['Pclass'] == class_idx + 1)
            # median() skips missing values; the groups are disjoint, so filling
            # one group never changes another group's median.
            guess_ages[sex_code, class_idx] = frame.loc[in_group, 'Age'].median()
            frame.loc[age_missing & in_group, 'Age'] = guess_ages[sex_code, class_idx]
    # Truncate to whole years first, then map each age to its band index.
    whole_years = frame['Age'].astype(int)
    frame['Age'] = pd.cut(whole_years, bins=age_band_edges, labels=False).astype(int)

In [16]:
# Family flag and interaction term, built in a single pass per frame.
for frame in (train_df, test_df):
    # Family size = siblings/spouses + parents/children + the passenger;
    # a size of exactly 1 means the passenger travels alone.
    relatives_aboard = frame['SibSp'] + frame['Parch']
    frame['IsAlone'] = (relatives_aboard + 1 == 1).astype(int)
    frame.drop(['SibSp', 'Parch'], axis=1, inplace=True)

    # Interaction term: age band multiplied by ticket class.
    frame['Age*Class'] = frame['Age'] * frame['Pclass']

# Test-set Fare: treat infinities as missing, then fill gaps with the median fare.
test_fare = test_df['Fare'].replace([np.inf, -np.inf], np.nan)
test_df['Fare'] = test_fare.fillna(test_fare.median())


In [17]:
# Bin Fare
# Replace each fare with an ordinal band code 0-3 using fixed, right-closed
# thresholds: <=7.91, (7.91, 14.454], (14.454, 31], >31.
# NOTE(review): the thresholds look like rounded quartile edges of the training
# fares -- confirm before changing them.
# The previous temporary 'FareBand' column (a qcut of the training fares) was
# created and dropped without ever being read, so it is no longer computed.
fare_band_edges = [-np.inf, 7.91, 14.454, 31, np.inf]

for dataset in [train_df, test_df]:
    # Infinite or missing fares are treated as 0, which falls in the lowest band.
    fare = dataset['Fare'].replace([np.inf, -np.inf], np.nan).fillna(0)
    # A single cut replaces the chain of in-place threshold overwrites, which
    # was only correct because every band code happens to lie below the first edge.
    dataset['Fare'] = pd.cut(fare, bins=fare_band_edges, labels=False).astype(int)

In [18]:
# Split the engineered frames into the target and the model inputs.
y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")
# PassengerId is an identifier, not a feature; it was saved separately for the submission.
X_test = test_df.drop(columns="PassengerId")

In [19]:
# Ensemble: Soft Voting Classifier
# Soft voting combines the class probabilities predicted by each base model,
# so every member has to expose probability estimates.
log_clf = LogisticRegression(max_iter=2000)
# probability=True enables SVC probability estimates. Their calibration shuffles
# the data internally, so it is seeded as well; without random_state the
# ensemble's predictions could change from one run to the next.
svc_clf = SVC(probability=True, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

ensemble = VotingClassifier(
    estimators=[('lr', log_clf), ('svc', svc_clf), ('rf', rf_clf)],
    voting='soft'
)

In [20]:
# Train the ensemble on the engineered training features, then label the test
# set (fit() returns the fitted estimator, so the calls can be chained).
y_pred = ensemble.fit(X_train, y_train).predict(X_test)


In [21]:
# Submission
# Pair the saved test ids with the ensemble's predicted labels and write the CSV.
submission = pd.DataFrame({
    "PassengerId": test_passenger_id,
    "Survived": y_pred
})
submission.to_csv("submission_now.csv", index=False)

# Evaluation on Training Data
# Predict once and derive every metric from the same predictions. Previously
# ensemble.score() ran its own prediction pass just to report the accuracy
# that is computed again below.
# These are in-sample figures (the model is scored on the rows it was fitted
# on), not an estimate of accuracy on unseen passengers.
train_preds = ensemble.predict(X_train)

acc = accuracy_score(y_train, train_preds)
conf = confusion_matrix(y_train, train_preds)
f1 = f1_score(y_train, train_preds)

# Show Accuracy
print("Training Accuracy:", round(acc * 100, 2), "%")

print("\n=== Model Evaluation on Training Data ===")
print("Accuracy:", round(acc, 4))
print("F1 Score:", round(f1, 4))
print("Confusion Matrix:\n", conf)

Training Accuracy: 84.18 %

=== Model Evaluation on Training Data ===
Accuracy: 0.8418
F1 Score: 0.778
Confusion Matrix:
 [[503  46]
 [ 95 247]]
