In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler



In [2]:
# Load the dataset (path is resolved relative to the current working directory)
df = pd.read_csv('breast cancer.csv')

# Convert the diagnosis field to a binary format: 'M' -> 1, 'B' -> 0.
# Series.map turns every label that is missing from the dict into NaN, so the
# mapping is validated before the column is overwritten; otherwise NaN targets
# would flow silently into training.
diagnosis_binary = df['diagnosis'].map({'M': 1, 'B': 0})
unmapped = diagnosis_binary.isna()
if unmapped.any():
    bad_labels = sorted(df.loc[unmapped, 'diagnosis'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'diagnosis' (expected only 'M' or 'B'): {bad_labels}"
    )
df['diagnosis'] = diagnosis_binary

# Extract features (X) and target variable (y)
# Columns at positions 2..31 are taken as features; this is positional, so it
# assumes 'id' and 'diagnosis' are the first two columns of the file.
X = df.iloc[:, 2:32].values  # Exclude the 'id' and 'diagnosis' columns
y = df['diagnosis'].values

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepend a column of ones to each matrix so the first weight acts as the bias
X_train_bias = np.concatenate((np.ones((len(X_train), 1)), X_train), axis=1)
X_test_bias = np.concatenate((np.ones((len(X_test), 1)), X_test), axis=1)


In [4]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), evaluated element-wise.

    The naive formula overflows in ``exp(-x)`` for large negative ``x``.
    Here the exponential is only ever taken of a non-positive number, so it
    stays within [0, 1] and cannot overflow.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s); converted to a float array.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Sigmoid of ``x`` with the same shape as the input (a NumPy scalar
        when ``x`` is a scalar).
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) lies in [0, 1] for every finite x, so no overflow is possible.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- the same
    # function, rewritten so that only the safe exponential is used.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and leaves n-d arrays as is.
    return result[()]

# Logistic regression training function
def logistic_regression(X, y, learning_rate=0.01, iterations=1000):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix. No intercept is added here; include a column of ones
        in ``X`` if a bias weight is wanted.
    y : array-like with m elements
        Target values. Any shape holding exactly ``m`` elements is accepted
        (for example ``(m,)`` or ``(m, 1)``); it is flattened to 1-D.
    learning_rate : float, default 0.01
        Step size applied to the gradient at every iteration.
    iterations : int, default 1000
        Number of gradient-descent steps to run.

    Returns
    -------
    numpy.ndarray of shape (n,)
        The learned weight vector.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly one target per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    # Flatten the targets: a column-shaped y of shape (m, 1) would make `h - y`
    # broadcast to an (m, m) matrix and break the weight update below.
    y = np.asarray(y, dtype=float).reshape(-1)

    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(
            f"X has {m} rows but y has {y.shape[0]} elements; they must match"
        )

    theta = np.zeros(n)  # Initialize weights to zeros

    for _ in range(iterations):
        h = sigmoid(np.dot(X, theta))          # predicted probabilities, shape (m,)
        gradient = np.dot(X.T, (h - y)) / m    # residual-weighted feature average, shape (n,)
        theta -= learning_rate * gradient

    return theta

In [5]:
# Sanity check: display the dimensions of the feature matrix
np.shape(X)

(569, 30)

In [6]:
# Rebind the targets as a single-column 2-D array and display the new shape.
# NOTE(review): the train/test split above was made from `y` before this
# reshape; re-running that cell after this one would yield column-shaped
# targets -- confirm this reshape is still needed.
y = np.reshape(y, (-1, 1))
np.shape(y)

(569, 1)

In [7]:
# Fit the weights on the bias-augmented training matrix
theta = logistic_regression(
    X_train_bias,
    y_train,
    learning_rate=0.01,
    iterations=10000,
)

# Make predictions: sigmoid outputs for the test rows, rounded to the nearest
# integer to obtain hard 0/1 labels
predictions = sigmoid(X_test_bias @ theta)
y_pred = np.rint(predictions)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of Logistic Regression:', accuracy)

Accuracy of Logistic Regression: 0.9824561403508771
