In [38]:
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame(data.target, columns=['TARGET'])

#X.head(), y.head()

# Split into Training and Test

In [39]:
positive_ratio = np.mean(y.values.ravel()) * 100
negative_ratio = 100 - positive_ratio
print(f"Positive Ratio: {positive_ratio:.2f}")
print(f"Positive Ratio: {positive_ratio:.2f}")

Positive Ratio: 62.74
Positive Ratio: 62.74


In [40]:
# 80% training 20% testing

def manual_train_test_split(X, y, test_size=0.2):
    # Get the number of data points
    num_data_points = X.shape[0]

    # Generate shuffled indices
    shuffled_indices = np.random.permutation(num_data_points)

    # Calculate the number of test samples
    test_set_size = int(num_data_points * test_size)

    # Split indices into test and train
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]

    # Use the indices to split the data
    X_train = X.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]

    return X_train, X_test, y_train, y_test

# Perform the manual train-test split
X_train, X_test, y_train, y_test = manual_train_test_split(X, y, test_size=0.2)

In [41]:
X_train.shape, X_test.shape

((456, 30), (113, 30))

In [42]:
X_train_intercept = np.column_stack([np.ones(X_train.shape[0]), X_train])
X_test_intercept = np.column_stack([np.ones(X_test.shape[0]), X_test])

# Sigmoid Function


In [43]:
def sigmoid(z):
  return 1 / (1 + np.exp(-z))

# cost function (log - loss)
def compute_cost(X, y, theta):
  m = len(y)
  h = sigmoid(np.dot(X, theta))
  epsilon = 1e-5
  cost = -(1/m) * np.sum(y * np.log(h + epsilon) + (1-y) * np.log(1 - h + epsilon))
  return cost

# Gradient Descent

In [44]:
def gradient_descent(X, y, theta, alpha, iterations):
  m = len(y)
  cost_history = []

  for i in range(iterations):
    h = sigmoid(np.dot(X, theta))
    gradient = np.dot(X.T, (h-y)) / m
    theta = theta - alpha * gradient
    cost = compute_cost(X, y, theta)
    cost_history.append(cost)
    if i % 100 == 0:
      print(f"iterations: {i}, loss:{cost}")
  return theta, cost_history

In [45]:
# initialize parameters
theta = np.zeros(X_train_intercept.shape[1])

# convert y_train and y_test to 1D numpy arrays
y_train = y_train.values.ravel() # flatten dataframe to 1d array
y_test = y_test.values.ravel()  # flatten dataframe to 1d array

In [46]:
# train model
alpha = 0.00001
iterations = 1000
theta, cost_history = gradient_descent(X_train_intercept, y_train, theta, alpha, iterations)

iterations: 0, loss:0.7632479662291021
iterations: 100, loss:0.4966270349432795
iterations: 200, loss:0.35293776113654957
iterations: 300, loss:0.32055304092153064
iterations: 400, loss:0.2980142090140103
iterations: 500, loss:0.2816363509090987
iterations: 600, loss:0.26926637427271366
iterations: 700, loss:0.2595775390775874
iterations: 800, loss:0.25175425981300875
iterations: 900, loss:0.2452812651925


In [47]:
y_pred_prob = sigmoid(np.dot(X_test_intercept, theta))
y_pred_test = (y_pred_prob >= 0.5).astype(int)

In [49]:
from sklearn.metrics import precision_score, recall_score, f1_score

accuracy = np.mean(y_pred_test == y_test) * 100
print(f"Accuracy on test set (manual logistic regression): {accuracy}%")

precision_manual = precision_score(y_test, y_pred_test)
recall_manual = recall_score(y_test, y_pred_test)
f1_manual = f1_score(y_test, y_pred_test)

print(f"Precision (manual logistic regression): {precision_manual}")
print(f"Recall (manual logistic regression): {recall_manual}")
print(f"F1 Score (manual logistic regression): {f1_manual}")

Accuracy on test set (manual logistic regression): 88.49557522123894%
Precision (manual logistic regression): 0.8831168831168831
Recall (manual logistic regression): 0.9444444444444444
F1 Score (manual logistic regression): 0.912751677852349
