In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Cleaning and preprocessing.
# Every statistic used to transform the data (mean age, mean fare, fare
# scaling range) is learned from the training set only and then applied
# unchanged to the test set, so both sets are encoded on the same scale.
from sklearn.preprocessing import MinMaxScaler

train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
gender_data = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

# add family_size feature (siblings/spouses + parents/children)
train_data["family_size"] = train_data["SibSp"] + train_data["Parch"]
test_data["family_size"] = test_data["SibSp"] + test_data["Parch"]

# fill missing ages in BOTH sets with the rounded mean age of the training set
train_mean_age = round(train_data["Age"].mean())
train_data["Age"] = train_data["Age"].fillna(train_mean_age)
test_data["Age"] = test_data["Age"].fillna(train_mean_age)

# change Sex (categorical) to numerical for train and test
# Opt in to pandas' future behaviour so that replace() does not silently
# downcast the object column (and does not emit a FutureWarning about it).
pd.set_option('future.no_silent_downcasting', True)
sex_codes = {"female": 1, "male": 0}
train_data["Sex"] = train_data["Sex"].replace(sex_codes).astype(int)
test_data["Sex"] = test_data["Sex"].replace(sex_codes).astype(int)

# Fill missing fares BEFORE scaling. Filling afterwards with round(mean) would
# round a value that lies in [0, 1] and collapse to 0 or 1 instead of a
# typical fare.
train_mean_fare = train_data["Fare"].mean()
train_data["Fare"] = train_data["Fare"].fillna(train_mean_fare)
test_data["Fare"] = test_data["Fare"].fillna(train_mean_fare)

# Scale Fare to [0, 1]: fit the range on the training set, then reuse the same
# fitted scaler for the test set. Re-fitting on the test set would map the two
# sets with different min/max values. Test fares outside the training range
# will fall outside [0, 1], which is expected.
scaler = MinMaxScaler()
train_data["Fare"] = scaler.fit_transform(train_data[["Fare"]]).ravel()
test_data["Fare"] = scaler.transform(test_data[["Fare"]]).ravel()

In [3]:
# Baseline model: a gradient-boosting classifier on a small feature subset
# (svm and RandomForestClassifier are imported as alternatives but not used here)
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

features = ["Pclass", "Sex", "family_size", "Age"] # "Fare", "Embarked"
y = train_data["Survived"]

# One-hot encode any categorical columns in the selected features
X, X_test = (pd.get_dummies(frame[features]) for frame in (train_data, test_data))

# Sanity checks kept for reference when debugging NaN / inf problems in X_test:
# np.any(np.isnan(X_test))
# np.isfinite(X_test.all())
# print(np.any(~np.isfinite(X_test)))  # Should return False
# misvals = X_test.isnull().sum()
# print(misvals)

# author's note on these settings: 77.9% !! from 77.7%
gb_params = {
    "n_estimators": 400,
    "learning_rate": 0.01,
    "max_depth": 3,
    "min_samples_split": 10,
    "min_samples_leaf": 5,
    "max_features": 'sqrt',
    "subsample": 1,
    "random_state": 42,
}
clf = GradientBoostingClassifier(**gb_params)
clf.fit(X, y)

predictions = clf.predict(X_test)

# Uncomment to write the submission file:
# output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
# output.to_csv('submission.csv', index=False)
# print("submission written!")

In [4]:
# 10-fold cross-validation of the gradient-boosting model on the full feature set.
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

y = train_data["Survived"]
features = ["Pclass", "Sex", "family_size", "Age", "Fare", "Embarked"]

X = pd.get_dummies(train_data[features])
# get_dummies encodes each frame independently, so a category missing from one
# frame would produce a different set/order of columns. Force the test matrix
# onto the training layout; dummy columns absent from the test set become 0.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# shuffled folds with a fixed seed so the split is reproducible
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# store accuracies (one entry per fold)
train_accuracies = []
val_accuracies = []

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_fit, y_fit = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[test_index], y.iloc[test_index]

    # a fresh model per fold so no state carries over between folds
    clf = GradientBoostingClassifier(
        n_estimators=400,
        learning_rate=0.01,
        max_depth=3,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        subsample=1,
        random_state=42
    )
    clf.fit(X_fit, y_fit)

    # evaluate on the rows the model was fitted on
    y_train_pred = clf.predict(X_fit)
    train_accuracy = accuracy_score(y_fit, y_train_pred)
    train_accuracies.append(train_accuracy)

    # evaluate on the held-out rows of this fold
    y_val_pred = clf.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracies.append(val_accuracy)

    print(f"Fold: {i}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")

print(f"\nAverage Training Accuracy: {sum(train_accuracies) / len(train_accuracies):.4f}")
print(f"Average Validation Accuracy: {sum(val_accuracies) / len(val_accuracies):.4f}")

Fold: 0, Training Accuracy: 0.8564, Validation Accuracy: 0.7667
Fold: 1, Training Accuracy: 0.8441, Validation Accuracy: 0.8090
Fold: 2, Training Accuracy: 0.8504, Validation Accuracy: 0.7528
Fold: 3, Training Accuracy: 0.8404, Validation Accuracy: 0.7753
Fold: 4, Training Accuracy: 0.8516, Validation Accuracy: 0.8202
Fold: 5, Training Accuracy: 0.8416, Validation Accuracy: 0.8427
Fold: 6, Training Accuracy: 0.8404, Validation Accuracy: 0.8764
Fold: 7, Training Accuracy: 0.8379, Validation Accuracy: 0.8427
Fold: 8, Training Accuracy: 0.8429, Validation Accuracy: 0.8764
Fold: 9, Training Accuracy: 0.8466, Validation Accuracy: 0.8202

Average Training Accuracy: 0.8452
Average Validation Accuracy: 0.8182


In [5]:
# PCA: fit a 5-component projection on the training feature matrix.
from sklearn.decomposition import PCA

# X is passed to PCA as a DataFrame; no explicit to_numpy() conversion is
# needed (and to_numpy() returns a new array rather than converting in place).
# NOTE(review): PCA is sensitive to feature scale and the columns of X are on
# different scales (Fare is min-max scaled, Age is not) - consider
# standardizing the features before fitting; confirm intent.
pca = PCA(n_components=5, svd_solver='auto')
pca.fit(X)

print("Explained variance ratio:", pca.explained_variance_ratio_)

Explained variance ratio: [0.978439   0.01416397 0.00379728 0.00181927 0.00118099]


In [6]:
# Project the training and test feature matrices with the already-fitted PCA.
X_train_pca, X_test_pca = (pca.transform(frame) for frame in (X, X_test))

# Show the shapes of both projections as the cell output.
X_train_pca.shape, X_test_pca.shape

((891, 5), (418, 5))

In [7]:
# Cross validate with PCA
# 10-fold cross-validation of the gradient-boosting model on the PCA projection.

kf = KFold(n_splits=10, shuffle=True, random_state=1)
n_folds = kf.get_n_splits(X_train_pca)  # number of folds; one accuracy pair is recorded per fold

# hyper-parameters shared by the model fitted in every fold
gb_settings = {
    "n_estimators": 400,
    "learning_rate": 0.01,
    "max_depth": 3,
    "min_samples_split": 10,
    "min_samples_leaf": 5,
    "max_features": 'sqrt',
    "subsample": 1,
    "random_state": 42,
}

# store accuracies
train_accuracies, val_accuracies = [], []

for i, (train_index, val_index) in enumerate(kf.split(X_train_pca, y)):
    # X_train_pca is indexed positionally with the fold indices, y via .iloc
    fold_X, fold_y = X_train_pca[train_index], y.iloc[train_index]
    held_X, held_y = X_train_pca[val_index], y.iloc[val_index]

    pca_clf = GradientBoostingClassifier(**gb_settings)
    pca_clf.fit(fold_X, fold_y)

    # accuracy on the rows used for fitting
    y_train_pred = pca_clf.predict(fold_X)
    train_accuracy = accuracy_score(fold_y, y_train_pred)
    train_accuracies.append(train_accuracy)

    # accuracy on the held-out rows
    y_val_pred = pca_clf.predict(held_X)
    val_accuracy = accuracy_score(held_y, y_val_pred)
    val_accuracies.append(val_accuracy)

    print(f"Fold: {i}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")

print(f"\nAverage Training Accuracy: {sum(train_accuracies) / n_folds:.4f}")
print(f"Average Validation Accuracy: {sum(val_accuracies) / n_folds:.4f}")

Fold: 0, Training Accuracy: 0.8677, Validation Accuracy: 0.7444
Fold: 1, Training Accuracy: 0.8641, Validation Accuracy: 0.7978
Fold: 2, Training Accuracy: 0.8616, Validation Accuracy: 0.7640
Fold: 3, Training Accuracy: 0.8616, Validation Accuracy: 0.7640
Fold: 4, Training Accuracy: 0.8603, Validation Accuracy: 0.8090
Fold: 5, Training Accuracy: 0.8728, Validation Accuracy: 0.8427
Fold: 6, Training Accuracy: 0.8529, Validation Accuracy: 0.8764
Fold: 7, Training Accuracy: 0.8603, Validation Accuracy: 0.7978
Fold: 8, Training Accuracy: 0.8541, Validation Accuracy: 0.8539
Fold: 9, Training Accuracy: 0.8591, Validation Accuracy: 0.8315

Average Training Accuracy: 0.8615
Average Validation Accuracy: 0.8082
