In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def encode_features(data):
    """Return a copy of *data* with numeric versions of its categorical columns.

    For each of ``Sex``, ``Pclass`` and ``Embarked`` a ``<column>_num`` column
    is appended. Values that are not listed in the lookup table for a column
    come out as NaN, because that is what ``Series.map`` does with unknown
    keys. The input frame itself is left untouched.
    """
    # source column -> {category label: numeric code}
    encodings = {
        'Sex': {'Male': 0, 'Female': 1},
        'Pclass': {'First class': 1, 'Second class': 2, 'Third class': 3},
        'Embarked': {'S': 0, 'C': 1, 'Q': 2, 'Unknown': -1},
    }

    encoded = data.copy()
    for column, codes in encodings.items():
        encoded[f'{column}_num'] = encoded[column].map(codes)
    return encoded

def load_and_prepare_data(path):
    """Read a passenger CSV from *path* and return a cleaned, encoded frame.

    Steps, in order:
      1. missing ``Age`` -> mean age of the passenger's (Sex, Pclass) group
      2. missing ``Cabin`` / ``Embarked`` -> the literal ``'Unknown'``
      3. missing ``Fare`` -> mean fare of the passenger's Pclass
      4. ``Pclass`` and ``Sex`` values rewritten to display labels
      5. numeric ``*_num`` columns added via ``encode_features``
    """
    frame = pd.read_csv(path)

    # Impute age from passengers of the same sex and class.
    group_mean_age = frame.groupby(['Sex', 'Pclass'])['Age'].transform('mean')
    frame['Age'] = frame['Age'].fillna(group_mean_age)

    # Text columns get an explicit placeholder instead of NaN.
    for column in ('Cabin', 'Embarked'):
        frame[column] = frame[column].fillna('Unknown')

    # Impute fare from passengers of the same class.
    class_mean_fare = frame.groupby(['Pclass'])['Fare'].transform('mean')
    frame['Fare'] = frame['Fare'].fillna(class_mean_fare)

    # Switch raw codes to the labels that encode_features looks up.
    class_labels = {1: 'First class', 2: 'Second class', 3: 'Third class'}
    sex_labels = {'male': 'Male', 'female': 'Female'}
    frame['Pclass'] = frame['Pclass'].replace(class_labels)
    frame['Sex'] = frame['Sex'].replace(sex_labels)

    return encode_features(frame)

def normalize_features(X):
    """Min-max scale every column of *X* into the range [0, 1].

    Each column is shifted by its minimum and divided by its span
    (max - min), both taken along axis 0.

    A column whose values are all identical has a span of zero; dividing by
    it would produce NaN and poison every later computation. Such columns
    are divided by 1 instead, so they come out as all zeros.
    """
    lows = X.min(axis=0)
    spans = X.max(axis=0) - lows
    # Replace zero spans with 1: the numerator is already 0 for those columns.
    safe_spans = np.where(spans == 0, 1, spans)
    return (X - lows) / safe_spans

def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of *z*.

    This is the activation used by the logistic regression code in this
    file; it is evaluated with ``np.exp`` and the usual arithmetic operators.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def predict(X, weights):
    """Return logistic-regression probabilities for the rows of *X*.

    The linear score ``np.dot(X, weights)`` is passed through ``sigmoid``.
    """
    return sigmoid(np.dot(X, weights))

def train_logistic_regression(data, lr=0.1, repeats=2000):
    """Fit logistic regression weights with batch gradient descent.

    Args:
        data: frame holding the feature columns listed below plus a
            ``Survived`` target column.
        lr: gradient descent step size.
        repeats: number of gradient descent iterations.

    Returns:
        A ``(prediction, weights)`` tuple. ``prediction`` holds the predicted
        probabilities for the rows of *data*, computed with the final
        weights; ``weights`` has one entry per feature (there is no
        intercept term).
    """
    features = ['Sex_num', 'Pclass_num', 'SibSp', 'Parch', 'Embarked_num', 'Age', 'Fare']
    # Features are min-max scaled before fitting, so the weights apply to
    # scaled inputs only.
    X = normalize_features(data[features].values)
    Y = data['Survived'].values

    weights = np.zeros(X.shape[1])
    for _ in range(repeats):
        error = predict(X, weights) - Y
        gradient = np.dot(X.T, error) / len(Y)
        weights -= lr * gradient

    # Evaluate once more after the last update so the returned probabilities
    # match the returned weights (and are defined even when repeats == 0).
    prediction = predict(X, weights)
    return prediction, weights

def find_best_threshold(data):
    """Return the probability cut-off that gives the highest accuracy.

    Candidate cut-offs are ``np.arange(0.3, 0.7, 0.01)``. For each one, rows
    whose ``PredictedChance`` is at or above the cut-off are labelled 1 and
    compared against ``Survived``. On ties the lowest cut-off wins; if no
    candidate scores above zero accuracy, 0 is returned.
    """
    chances = data['PredictedChance']
    truth = data['Survived']

    top_score, top_cutoff = 0, 0
    for cutoff in np.arange(0.3, 0.7, 0.01):
        labels = (chances >= cutoff).astype('int')
        score = (labels == truth).mean()
        # Strict comparison keeps the first (lowest) cut-off among equals.
        if score > top_score:
            top_score, top_cutoff = score, cutoff
    return top_cutoff

def train_logistic_model(data):
    """Fit the model on *data* and choose a classification threshold.

    The fitted probabilities are stored in a ``PredictedChance`` column on
    *data* (the frame passed in is modified), then the threshold search runs
    on that column.

    Returns:
        ``(weights, threshold, data)``.
    """
    fitted_chances, fitted_weights = train_logistic_regression(data)
    data['PredictedChance'] = fitted_chances
    return fitted_weights, find_best_threshold(data), data
    
def predict_test(data, weights, threshold):
    """Predict survival for *data* and return a submission-ready frame.

    Args:
        data: prepared frame with the feature columns listed below and a
            ``PassengerId`` column. ``PredictedChance`` and ``Survived``
            columns are written onto it.
        weights: weights produced by ``train_logistic_regression``.
        threshold: probability at or above which a passenger is labelled 1.

    Returns:
        A frame with the ``PassengerId`` and ``Survived`` columns.
    """
    features = ['Sex_num', 'Pclass_num', 'SibSp', 'Parch', 'Embarked_num', 'Age', 'Fare']
    X = data[features].values
    # The weights were fitted on min-max scaled features, so raw values
    # (ages, fares) must be scaled the same way before scoring.
    # NOTE(review): this scales with the min/max of *data* itself because the
    # training statistics are not available through this interface; passing
    # the training min/max in would be more exact.
    X = normalize_features(X)
    predictions = predict(X, weights)
    data['PredictedChance'] = predictions
    data['Survived'] = (predictions >= threshold).astype('int')
    return data[['PassengerId', 'Survived']]

def evaluate_predictions(data, threshold):
    """Print the accuracy of thresholded predictions against ``Survived``.

    Rows whose ``PredictedChance`` is at or above *threshold* count as a
    predicted 1. Accuracy is the share of rows where that label equals the
    ``Survived`` column. Nothing is returned.
    """
    predicted_labels = (data['PredictedChance'] >= threshold).astype('int')
    matches = predicted_labels == data['Survived']
    accuracy = matches.sum() / len(matches)
    print(f"Model accuracy: {accuracy: .4f}")

def main(train_path='/kaggle/input/titanic/train.csv',
         test_path='/kaggle/input/titanic/test.csv',
         output_path='submission.csv'):
    """Train on the training CSV, score the test CSV and write a submission.

    Args:
        train_path: CSV with the labelled training passengers.
        test_path: CSV with the passengers to predict.
        output_path: where the ``PassengerId``/``Survived`` CSV is written.

    The training accuracy at the chosen threshold is printed along the way.
    """
    train = load_and_prepare_data(train_path)
    test = load_and_prepare_data(test_path)

    weights, threshold, train = train_logistic_model(train)
    submission = predict_test(test, weights, threshold)

    evaluate_predictions(train, threshold)

    submission.to_csv(output_path, index=False)


# Run the pipeline only when executed as a script or notebook cell, not on import.
if __name__ == '__main__':
    main()

Model accuracy:  0.8182
