In [None]:
import copy
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Getting the data from the csv
df = pd.read_csv('breast_cancer.csv')
X = df[['Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion',
        'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']].to_numpy()

# Normalizing the data (z-score per feature column)
mu = X.mean(axis=0)
sigma = X.std(axis=0)
# A constant column has zero standard deviation; dividing by it would fill the
# column with NaN/inf, so such columns are centred but left unscaled.
sigma = np.where(sigma == 0, 1.0, sigma)
X = (X - mu) / sigma

# Target vector: the raw 'Class' value is halved and shifted down by one.
# NOTE(review): presumably this maps class codes 2/4 to 0/1 - confirm against the CSV.
Y = df['Class'].to_numpy()
Y = Y / 2 - 1

# Initializing the weights and bias (one weight per feature column).
# No `global` statement is needed here: at module level it has no effect.
w_in = np.zeros_like(X[0])
b_in = 0.0

# Hyperparams
learning_rate = 0.001   # step size used by gradient descent
iterations = 1000       # gradient-descent steps per fold
regularization = 0.5    # L2 penalty strength (lambda)
folds = 10              # number of cross-validation folds


def model(x, w, b):
    """Apply the logistic (sigmoid) function to the linear score ``dot(x, w) + b``.

    Args:
        x: Feature vector, or a 2-D array holding one sample per row.
        w: Weight vector with one entry per feature.
        b: Bias term added to every score.

    Returns:
        ``1 / (1 + exp(-score))``: a scalar for a single feature vector, or
        one value per row when ``x`` is 2-D.
    """
    linear_score = np.dot(x, w) + b
    denominator = 1 + np.exp(-linear_score)
    return 1 / denominator


def cost(x, y, w, b, lambda_):
    """Mean binary cross-entropy of the logistic model plus an L2 penalty.

    Args:
        x: 2-D array of samples, one row per sample.
        y: Target values, one per row of ``x``.
        w: Weight vector with one entry per column of ``x``.
        b: Bias term.
        lambda_: L2 regularization strength; the bias is not penalised.

    Returns:
        ``mean(-y*log(f) - (1-y)*log(1-f)) + lambda_/(2m) * sum(w**2)``
        where ``f = model(x, w, b)`` and ``m`` is the number of rows.

    Raises:
        ValueError: If ``x`` has no rows (the mean would be undefined).
    """
    m, _ = x.shape
    if m == 0:
        raise ValueError("cost() requires at least one sample")

    targets = np.asarray(y)
    weights = np.asarray(w)

    # Keep predictions strictly inside (0, 1): once the sigmoid saturates to
    # exactly 0.0 or 1.0, log(0) is -inf and 0 * -inf turns the cost into NaN.
    eps = 1e-15
    probs = np.clip(model(x, weights, b), eps, 1 - eps)

    # Vectorised cross-entropy over all rows at once.
    data_cost = -np.sum(targets * np.log(probs) + (1 - targets) * np.log(1 - probs)) / m
    reg_cost = np.dot(weights, weights)

    return data_cost + (lambda_ / (2 * m)) * reg_cost


def gradient(x, y, w, b, lambda_):
    """Gradient of the regularized logistic cost with respect to ``b`` and ``w``.

    Args:
        x: 2-D array of samples, one row per sample.
        y: Target values, one per row of ``x``.
        w: Weight vector with one entry per column of ``x``.
        b: Bias term.
        lambda_: L2 regularization strength; it affects only the weight gradient.

    Returns:
        Tuple ``(dj_db, dj_dw)``: the scalar bias gradient and the weight
        gradient (one entry per feature), in that order.

    Raises:
        ValueError: If ``x`` has no rows (the averages would be undefined).
    """
    m, _ = x.shape
    if m == 0:
        raise ValueError("gradient() requires at least one sample")

    # Prediction error for every row, computed in a single vectorised call.
    err = model(x, w, b) - np.asarray(y)

    dj_db = np.sum(err) / m
    # x.T @ err sums err[i] * x[i, j] over the rows for each feature j,
    # replacing the explicit double loop over samples and features.
    dj_dw = np.dot(x.T, err) / m + (lambda_ / m) * np.asarray(w)
    return dj_db, dj_dw


def gradient_descent(x, y, w, b, alpha, iters, lambda_):
    """Run batch gradient descent and record the cost after each update.

    Args:
        x: 2-D array of training samples, one row per sample.
        y: Target values, one per row of ``x``.
        w: Initial weight vector; it is copied, never modified in place.
        b: Initial bias.
        alpha: Learning rate (step size).
        iters: Number of update steps to perform.
        lambda_: L2 regularization strength forwarded to ``gradient``/``cost``.

    Returns:
        Tuple ``(w, b, J_wb)``: the final weights, the final bias and the list
        of cost values. Only the first 100000 iterations are recorded.
    """
    max_history = 100000  # cap on stored cost values to prevent resource exhaustion
    J_wb = []
    w_aux = copy.deepcopy(w)
    b_aux = b
    # Report roughly ten times over the run (every iteration when iters < 10).
    report_every = math.ceil(iters / 10)
    for i in range(iters):
        dj_db, dj_dw = gradient(x, y, w_aux, b_aux, lambda_)
        w_aux = w_aux - alpha * dj_dw
        b_aux = b_aux - alpha * dj_db

        current_cost = None
        if i < max_history:
            current_cost = cost(x, y, w_aux, b_aux, lambda_)
            J_wb.append(current_cost)

        if i % report_every == 0:
            # Past the history cap J_wb[-1] is stale, so compute the cost for
            # this iteration instead of reporting an old value.
            if current_cost is None:
                current_cost = cost(x, y, w_aux, b_aux, lambda_)
            print(f"Iteration {i:4d}: Cost {current_cost}   ")
    return w_aux, b_aux, J_wb


def predict(x, w, b):
    """Classify every row of ``x`` by thresholding the model output at 0.5.

    Args:
        x: 2-D array of samples, one row per sample.
        w: Weight vector passed to ``model``.
        b: Bias passed to ``model``.

    Returns:
        Float array with one entry per row: 1 where the model output is at
        least 0.5, otherwise 0.
    """
    n_rows, _ = x.shape
    labels = [
        1.0 if model(x[row], w, b) >= 0.5 else 0.0
        for row in range(n_rows)
    ]
    return np.array(labels, dtype=float)


def cross_validate(x, y, k, lrate, iters, reg, w_init=None, b_init=0.0):
    """Evaluate the logistic model with k-fold cross-validation.

    The rows are split, in their existing order, into ``k`` contiguous folds
    whose sizes differ by at most one, so every sample is held out exactly
    once. For each fold a model is trained from the initial parameters on the
    remaining rows and scored on the held-out rows.

    Args:
        x: 2-D array of samples, one row per sample.
        y: Target values, one per row of ``x``.
        k: Number of folds; must satisfy ``2 <= k <= number of rows``.
        lrate: Learning rate passed to ``gradient_descent``.
        iters: Number of gradient-descent steps per fold.
        reg: L2 regularization strength.
        w_init: Optional starting weights; defaults to zeros, one per feature.
        b_init: Optional starting bias; defaults to 0.0.

    Returns:
        Tuple ``(accuracies, cost_history)``: the held-out accuracy in percent
        for each fold, and the training cost history of each fold.

    Raises:
        ValueError: If ``k`` is outside ``2 <= k <= number of rows``.
    """
    n_samples, n_features = x.shape
    if not 2 <= k <= n_samples:
        raise ValueError(f"k must be between 2 and the number of samples ({n_samples}), got {k}")
    if w_init is None:
        # Explicit default instead of relying on module-level state.
        w_init = np.zeros(n_features)

    accuracies = []
    cost_history = []
    # array_split spreads the n_samples % k leftover rows over the first folds,
    # so no row is excluded from evaluation.
    for test_idx in np.array_split(np.arange(n_samples), k):
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[test_idx] = False

        x_test, y_test = x[test_idx], y[test_idx]
        x_train, y_train = x[train_mask], y[train_mask]

        w_out, b_out, J_wb = gradient_descent(x_train, y_train, w_init, b_init, lrate, iters, reg)
        pr = predict(x_test, w_out, b_out)
        accuracies.append(np.mean(pr == y_test) * 100)
        cost_history.append(J_wb)
    return accuracies, cost_history


accuracies, cost_history = cross_validate(X, Y, folds, learning_rate, iterations, regularization)

# Plotting the cost function over iterations for each fold
fig_cost, ax_cost = plt.subplots()
for i, J_wb in enumerate(cost_history):
    ax_cost.plot(range(len(J_wb)), J_wb, label=f'Fold {i+1}')
ax_cost.set_xlabel('Iterations')
ax_cost.set_ylabel('Cost')
ax_cost.set_title('Cost Function over Iterations for Each Fold')
ax_cost.legend()
plt.show()

# Plotting the accuracies for each fold
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(range(1, len(accuracies) + 1), accuracies, marker='o')
ax_acc.set_xlabel('Fold')
ax_acc.set_ylabel('Accuracy (%)')
ax_acc.set_title('Accuracy for Each Fold')
ax_acc.set_ylim(0, 100)
plt.show()

# These accuracies are measured on the held-out fold of each split, so the
# average is a cross-validation score rather than a training accuracy.
print('Mean cross-validation accuracy: ', np.mean(accuracies))


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
# NOTE(review): this cell rebinds `df`, `X` and (below) `model`, which the
# earlier cell also defines; re-run that cell before reusing its functions.
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

# Display basic information about the dataset
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\nFirst 5 rows of the dataset:")
print(df.head())

# Separate features and target variable
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Split the dataset into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a logistic regression model
# NOTE(review): `model` is also the name of the sigmoid function defined in the
# earlier cell; after this assignment that function is no longer reachable.
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66  