## Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Loading data into dataframe

In [None]:
# Read the passenger table from "titanic.csv" (path is relative to the
# current working directory) into a DataFrame.
data = pd.read_csv("titanic.csv")
# Preview the first rows; as the last expression of a notebook cell this is
# what gets rendered as the cell output.
data.head()

## Preprocessing Data

In [None]:
# Feature table: drop the identifier/free-text columns and the target column.
data1 = data.drop(['Name','PassengerId', 'Ticket', 'Cabin', 'Survived'], axis=1)
# Target vector.
y = data['Survived']
# Split the remaining columns by dtype so each group gets its own treatment.
data_cat =data1.select_dtypes(include=['object'])  
data_num1 =data1.select_dtypes(include=['number'])
# Impute missing numeric values with the column mean.
data_num = data_num1.fillna(data_num1.mean())
# Standardise every numeric column: subtract its mean, divide by its std.
mean = data_num.mean(axis=0)
std = data_num.std(axis=0)
data_num_norm = (data_num - mean) / std

# One-hot encode the object columns; drop_first removes one level per column.
data_cat_proc = pd.get_dummies(data_cat, drop_first=True) 
# get_dummies may emit bool columns; cast those to 0/1 integers.
boolean_cols = data_cat_proc.columns[data_cat_proc.dtypes == 'bool']
data_cat_proc[boolean_cols] = data_cat_proc[boolean_cols].astype(int)
# Final design matrix: standardised numeric columns + encoded categorical columns.
data_proc = pd.concat([data_num_norm, data_cat_proc], axis=1) 
# Correlation heatmap of all processed features together with the target.
plt.figure(figsize=(12,10))
data_for = pd.concat([data_proc, y], axis=1) 
cor = data_for.corr()
sns.heatmap(cor, annot=True, cmap = 'viridis')

## Performing PCA
### Forming the covariance matrix and sorting its eigenvalues and eigenvectors in descending order

In [None]:
# Covariance matrix of the processed features.
cov_matrix = data_proc.cov()
# A covariance matrix is symmetric, so use eigh rather than the general eig:
# eigh guarantees real eigenvalues and orthonormal eigenvectors, whereas eig
# gives no ordering guarantee and can return complex values from round-off.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
# eigh returns eigenvalues in ascending order; flip both arrays so that
# index 0 is the direction of largest variance, whichever name is used later.
eigenvalues, eigenvectors = eigenvalues[::-1], eigenvectors[:, ::-1]

# Descending-order permutation (stable, so it keeps the order established above).
sorted_indices = np.argsort(-eigenvalues, kind="stable")
sorted_eigenvalues = eigenvalues[sorted_indices]
# Eigenvectors are stored column-wise, so reorder the columns.
sorted_eigenvectors = eigenvectors[:, sorted_indices]

print(f'Eigenvalues of P:\n{sorted_eigenvalues}\n\nEigenvectors of P\n{sorted_eigenvectors}')

### Creating Scree Plot to see importance of eigenvectors

In [None]:
# Scree plot: the eigenvalues, already sorted in descending order, plotted
# against their position (x starts at 0 because only y-values are passed).
plt.plot(sorted_eigenvalues)
plt.xlabel('Index')
plt.ylabel('Eigenvalue')
plt.title('Scree Plot')
plt.show()

### Selecting Principal Components

In [None]:
# Number of principal components to keep.
k = 7
# Take the first k columns of the eigenvectors *sorted by descending
# eigenvalue*. Slicing the unsorted `eigenvectors` array would pick whatever
# k directions the solver happened to return first, not the top-k components.
principal_components = sorted_eigenvectors[:, :k]
# Project every sample onto the selected components (rows stay in the same order).
pca_features = pd.DataFrame(np.dot(data_proc, principal_components))
print(pca_features)


## Splitting Train and Test data for both normal and PCA features

In [None]:
# Fraction of rows held out for testing.
test_size = 0.3
# Fixed seed so the split, and therefore every reported metric, is reproducible.
random_seed = 42

n_samples = data_proc.shape[0]
n_samplespca = pca_features.shape[0]
# pca_features is a row-for-row projection of data_proc; the shared split
# below relies on that.
if n_samplespca != n_samples:
    raise ValueError("data_proc and pca_features must have the same number of rows")

n_test_samples = int(n_samples * test_size)
n_test_samplespca = int(n_samplespca * test_size)

# Draw ONE permutation and use it for both feature sets. Shuffling the two
# index arrays independently would train/evaluate the raw-feature and PCA
# models on different passengers, so their scores would not be comparable.
indices = np.random.default_rng(random_seed).permutation(n_samples)
indicespca = indices.copy()

# Split the data into training and testing sets: the first n_test entries of
# the permutation form the test set, the remainder the training set.
train_indices = indices[n_test_samples:]
test_indices = indices[:n_test_samples]
train_indicespca = indicespca[n_test_samplespca:]
test_indicespca = indicespca[:n_test_samplespca]

# Positional indexing (iloc) keeps features and labels aligned row by row.
X_train = data_proc.iloc[train_indices]
y_train = y.iloc[train_indices]
X_trainpca = pca_features.iloc[train_indicespca]
y_trainpca = y.iloc[train_indicespca]

X_test = data_proc.iloc[test_indices]
y_test = y.iloc[test_indices]
X_testpca = pca_features.iloc[test_indicespca]
y_testpca = y.iloc[test_indicespca]
# Shared buffer that the models' fit() methods append their per-iteration
# loss/cost to; start empty.
ll_values = []

## Defining Functions for Logistic Regression

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    Accepts a scalar or any array-like and is safe for inputs of large
    magnitude: the naive formula evaluates exp(-x), which overflows (and
    emits a RuntimeWarning) for strongly negative x.
    """
    # 1 / (1 + exp(-x)) == exp(-log(exp(0) + exp(-x))); logaddexp computes the
    # inner logarithm without ever forming the overflowing exponential.
    return np.exp(-np.logaddexp(0, np.negative(x)))
def log_loss(y_true, y_pred):
    """Return the mean binary cross-entropy between labels and probabilities.

    A small constant is added inside each logarithm so that predictions of
    exactly 0 or 1 do not produce log(0).
    """
    eps = 1e-15
    # Contribution of the positive class and of the negative class, per sample.
    positive_term = y_true * np.log(y_pred + eps)
    negative_term = (1 - y_true) * np.log(1 - y_pred + eps)
    return -np.mean(positive_term + negative_term)

## Creating Class for Logistic Regression

In [None]:
class LogisticRegression:
    """Binary logistic-regression classifier trained by batch gradient descent.

    Parameters
    ----------
    lr : float
        Learning rate (step size) of each gradient-descent update.
    n_iters : int
        Number of full-batch updates performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` is called.
    bias : float
        Intercept term.
    loss_history : list
        Log loss recorded at every iteration of the most recent ``fit``.

    Notes
    -----
    ``fit`` also appends each iteration's loss to the module-level list
    ``ll_values`` (used by the plotting cells), so that list must exist
    before ``fit`` is called.
    """

    def __init__(self, lr = 0.1, n_iters = 700):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = 0
        self.loss_history = []

    def fit(self, X, y):
        """Learn weights and bias from features ``X`` and 0/1 labels ``y``."""
        # Convert once so every iteration works on plain arrays, regardless of
        # whether DataFrames/Series or arrays were passed in.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        self.loss_history = []
        for _ in range(self.n_iters):
            linearpred = np.dot(X, self.weights) + self.bias
            y_predicted = sigmoid(linearpred)
            # Gradient of the mean log loss w.r.t. weights and bias.
            error = y_predicted - y
            delw = (1/n_samples)*np.dot(X.T, error)
            delb = (1/n_samples)*np.sum(error)
            self.weights = self.weights - self.lr*delw
            self.bias = self.bias - self.lr*delb
            # Loss of the predictions made *before* this update.
            loglossv = log_loss(y, y_predicted)
            self.loss_history.append(loglossv)
            ll_values.append(loglossv)

    def predict(self, X):
        """Return a list of 0/1 labels; probabilities above 0.5 map to 1."""
        linearpred = np.dot(X, self.weights) + self.bias
        y_predicted = sigmoid(linearpred)
        return np.where(y_predicted <= 0.5, 0, 1).tolist()

## Defining evaluation metrics

In [None]:
def f1_score(tp,fp,tn,fn):
    """Print precision and recall, then return the F1 score.

    Takes the four confusion-matrix counts (``tn`` is accepted but not used).
    Any ratio whose denominator is zero is reported as 0.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn

    precision = 0
    if predicted_positive != 0:
        precision = tp / predicted_positive

    recall = 0
    if actual_positive != 0:
        recall = tp / actual_positive

    print("Precision:", precision)
    print("Recall:", recall)

    # F1 is the harmonic mean of precision and recall.
    denominator = precision + recall
    if denominator == 0:
        return 0
    return 2 * precision * recall / denominator
def confusion_matrix(tp, fp, tn, fn):
    """Draw a 2x2 confusion matrix on the current matplotlib figure.

    Rows are the actual class and columns the predicted class, both ordered
    Negative then Positive. Each cell is annotated with its count. The
    function does not call ``plt.show()``.
    """
    grid = [[tn, fp], [fn, tp]]
    plt.imshow(grid, cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks([0, 1], ['Negative', 'Positive'])
    plt.yticks([0, 1], ['Negative', 'Positive'])

    # (row, column, label, count, text colour) for each cell, in row-major order.
    cell_annotations = (
        (0, 0, 'TN', tn, 'white'),
        (0, 1, 'FP', fp, 'black'),
        (1, 0, 'FN', fn, 'black'),
        (1, 1, 'TP', tp, 'black'),
    )
    for row, col, label, count, colour in cell_annotations:
        # plt.text takes (x, y), i.e. (column, row).
        plt.text(col, row, f'{label}: {count}', ha='center', va='center', color=colour)


## Executing Logistic Regression on Normal Dataset

In [None]:
# Train logistic regression (default learning rate and iteration count) on
# the non-PCA training features, then predict labels for the test rows.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
def accuracy(y_pred,y_test):
    """Return the fraction of predictions that equal the true labels.

    Both arguments may be lists, NumPy arrays or pandas Series; they are
    compared element by element, by position.

    Returns 0.0 for empty input (the same "undefined ratio -> 0" convention
    that f1_score uses).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result, which would make the score meaningless.
    predicted = np.asarray(y_pred).ravel()
    actual = np.asarray(y_test).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            f"y_pred has {predicted.size} elements but y_test has {actual.size}"
        )
    if actual.size == 0:
        return 0.0
    return float(np.mean(predicted == actual))
# Work on plain arrays so the element-wise comparisons below are positional.
predictions_array = np.array(predictions)
y_test_array = np.array(y_test)
# Confusion-matrix counts (1 = positive class, 0 = negative class).
tp = np.sum((y_test_array == 1) & (predictions_array == 1))
fp = np.sum((y_test_array == 0) & (predictions_array == 1))
tn = np.sum((y_test_array == 0) & (predictions_array == 0))
fn = np.sum((y_test_array == 1) & (predictions_array == 0)) 
# NOTE(review): called before anything is plotted in this cell; presumably
# it flushes a figure left open earlier when run as a script - confirm.
plt.show()
# Training curve: ll_values holds the log loss appended by classifier.fit,
# one entry per iteration.
plt.plot(range(1, len(ll_values) + 1), ll_values)
plt.xlabel("Iteration")
plt.ylabel("Log Loss")
plt.title("Log Loss over Iterations")
plt.show()
# f1_score also prints precision and recall.
f1 = f1_score(tp,fp,tn,fn)
print("F1 score:",f1)
# Draws the matrix on the current figure; no plt.show() follows in this cell.
confusion_matrix(tp,fp,tn,fn)
# Empty the shared loss buffer so the next model's curve starts from scratch.
ll_values = []

## Executing Logistic Regression on Dataset with PCA

In [None]:
# Same logistic-regression pipeline as for the raw features, but trained and
# evaluated on the PCA-projected features.
classifier2 = LogisticRegression(lr=0.1)
classifier2.fit(X_trainpca, y_trainpca)
predictionspca = classifier2.predict(X_testpca)
# Work on plain arrays so the element-wise comparisons below are positional.
predictions_arraypca = np.array(predictionspca)
y_test_arraypca = np.array(y_testpca)
# Confusion-matrix counts (1 = positive class, 0 = negative class).
tp = np.sum((y_test_arraypca == 1) & (predictions_arraypca == 1))
fp = np.sum((y_test_arraypca == 0) & (predictions_arraypca == 1))
tn = np.sum((y_test_arraypca == 0) & (predictions_arraypca == 0))
fn = np.sum((y_test_arraypca == 1) & (predictions_arraypca == 0))
# NOTE(review): called before anything is plotted in this cell; presumably
# it flushes the confusion matrix drawn by the previous cell - confirm.
plt.show()
# Training curve: ll_values holds the log loss appended by classifier2.fit.
plt.plot(range(1, len(ll_values) + 1), ll_values)
plt.xlabel("Iteration")
plt.ylabel("Log Loss")
plt.title("Log Loss over Iterations")
plt.show()
# f1_score also prints precision and recall.
f1 = f1_score(tp,fp,tn,fn)
print("F1 score:",f1)
confusion_matrix(tp,fp,tn,fn)
# Empty the shared loss buffer so the next model's curve starts from scratch.
ll_values = []

## Creating Class for Support Vector Machines

In [None]:

class SupportVectorMachine:
    """Linear SVM trained by batch sub-gradient descent on the hinge loss.

    Parameters
    ----------
    reg_strength : float
        Weight of the L2 penalty on the coefficients.
    n_iters : int
        Number of full-batch updates performed by ``fit``.
    lr : float
        Learning rate (step size) of each update.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` is called.
    bias : float
        Intercept term.
    cost_history : list
        Cost recorded at every iteration of the most recent ``fit``.

    Notes
    -----
    ``fit`` also appends each iteration's cost to the module-level list
    ``ll_values`` (used by the plotting cells), so that list must exist
    before ``fit`` is called.
    """

    def __init__(self, reg_strength=0.1, n_iters=700, lr=0.1):
        self.reg_strength = reg_strength
        self.n_iters = n_iters
        self.lr = lr
        self.weights = None
        self.bias = 0
        self.cost_history = []

    def fit(self, X, y):
        """Learn weights and bias from features ``X`` and 0/1 labels ``y``."""
        X = np.asarray(X, dtype=float)
        y = np.array(y)
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        self.cost_history = []
        # The hinge loss expects labels in {-1, +1}.
        y = np.where(y == 0, -1, 1)

        for epoch in range(self.n_iters):
            linear_output = np.dot(X, self.weights) + self.bias
            distances = 1 - y * linear_output

            # Only samples inside the margin or misclassified (distance > 0)
            # contribute to the hinge sub-gradient. Sum their contributions in
            # one vectorised step instead of a per-sample Python loop.
            violating = distances > 0
            dw = -np.dot(y[violating], X[violating])
            db = -np.sum(y[violating])

            # NOTE(review): the penalty gradient here is reg_strength * w / n,
            # while the reported cost below uses (reg_strength / 2) * ||w||^2
            # (no 1/n). Left unchanged to preserve the trained model; confirm
            # which objective is intended.
            dw = dw / n_samples + (self.reg_strength * self.weights / n_samples)
            db = db / n_samples
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

            # Cost evaluated with the freshly updated parameters.
            cost = (1 / n_samples) * np.sum(np.maximum(0, 1 - y * (np.dot(X, self.weights) + self.bias))) + (self.reg_strength / 2) * np.sum(self.weights ** 2)
            if epoch % 100 == 0 or epoch == self.n_iters - 1:
                print(f"Epoch {epoch}/{self.n_iters} | Cost1: {cost}")
            self.cost_history.append(cost)
            ll_values.append(cost)

    def predict(self, X):
        """Return an array of 0/1 labels; a non-negative decision value maps to 1."""
        linear_output = np.dot(X, self.weights) + self.bias
        return np.where(linear_output >= 0, 1, 0)


## Executing Support Vector Machine Classification on Normal Dataset

In [None]:
# Train the linear SVM on the non-PCA training features and predict the test rows.
svmobj = SupportVectorMachine(lr=0.017)
svmobj.fit(X_train, y_train)
predictions = svmobj.predict(X_test)
# Work on plain arrays so the element-wise comparisons below are positional.
predictions_array = np.array(predictions)
y_test_array = np.array(y_test)
# Confusion-matrix counts (1 = positive class, 0 = negative class).
tp = np.sum((y_test_array == 1) & (predictions_array == 1))
fp = np.sum((y_test_array == 0) & (predictions_array == 1))
tn = np.sum((y_test_array == 0) & (predictions_array == 0))
fn = np.sum((y_test_array == 1) & (predictions_array == 0))
# NOTE(review): called before anything is plotted in this cell; presumably
# it flushes the confusion matrix drawn by the previous cell - confirm.
plt.show()
# Training curve: ll_values holds the cost appended by svmobj.fit. That is
# the regularised hinge cost, not a log loss, so label the axis accordingly.
plt.plot(range(1, len(ll_values) + 1), ll_values)
plt.xlabel("Iteration")
plt.ylabel("Cost")
plt.title("Cost over Iterations")
plt.show()
# f1_score also prints precision and recall.
f1 = f1_score(tp,fp,tn,fn)
print("F1 score:",f1)
confusion_matrix(tp,fp,tn,fn)
# Empty the shared cost buffer so the next model's curve starts from scratch.
ll_values = []

## Executing Support Vector Machine Classification on Dataset with PCA

In [None]:
# Train the linear SVM on the PCA-projected training features and predict the test rows.
svmobj2 = SupportVectorMachine(lr=0.015,reg_strength=0.05)
svmobj2.fit(X_trainpca, y_trainpca)
predictionspca = svmobj2.predict(X_testpca)
# Work on plain arrays so the element-wise comparisons below are positional.
predictions_arraypca = np.array(predictionspca)
y_test_arraypca = np.array(y_testpca)
# Confusion-matrix counts (1 = positive class, 0 = negative class).
tp = np.sum((y_test_arraypca == 1) & (predictions_arraypca == 1))
fp = np.sum((y_test_arraypca == 0) & (predictions_arraypca == 1))
tn = np.sum((y_test_arraypca == 0) & (predictions_arraypca == 0))
fn = np.sum((y_test_arraypca == 1) & (predictions_arraypca == 0))
# NOTE(review): called before anything is plotted in this cell; presumably
# it flushes the confusion matrix drawn by the previous cell - confirm.
plt.show()
# Training curve: ll_values holds the cost appended by svmobj2.fit. That is
# the regularised hinge cost, not a log loss, so label the axis accordingly.
plt.plot(range(1, len(ll_values) + 1), ll_values)
plt.xlabel("Iteration")
plt.ylabel("Cost")
plt.title("Cost over Iterations")
plt.show()
# f1_score also prints precision and recall.
f1 = f1_score(tp,fp,tn,fn)
print("F1 score:",f1)
confusion_matrix(tp,fp,tn,fn)
# Leave the shared cost buffer empty for any later run.
ll_values = []