In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd


## Breast_Cancer dataset

In [None]:
# Load the breast-cancer dataset that ships with scikit-learn and unpack the
# feature matrix (X) and the target vector (y) from the returned bunch.
data = load_breast_cancer()
X, y = data.data, data.target

### Processing

In [None]:
# Split the data into training and testing sets first, so the test set stays
# completely unseen while the preprocessing is being fitted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)

# Scale data: learn the per-feature min/max from the training split only and
# apply that same transform to the test split. Fitting the scaler on the full
# dataset would leak test-set statistics into training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Logistic Regression Sklearn

In [None]:
# Baseline: scikit-learn's LogisticRegression with its default settings.
model = LogisticRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics expect the ground truth first: metric(y_true, y_pred).
F1 = f1_score(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
print(f"F1 Score on testset: {F1}")
print(f'Accuracy Score on testset: {acc}')

F1 Score on testset: 0.9864864864864865
Accuracy Score on testset: 0.9824561403508771


### Logistic Regression by Numpy

In [None]:
# The NumPy implementations below read the number of samples from axis 1
# (one sample per column), so both splits are transposed here.
# NOTE: the names are rebound to their own transpose, so running this cell a
# second time flips the arrays back to the original orientation.
X_train, X_test = X_train.T, X_test.T

$$\sigma(z) = \frac{1}{1 + e^{-z}}$$


In [None]:
def sigmoid(Z) :
  """Element-wise logistic function 1 / (1 + exp(-Z)), evaluated without overflow.

  The textbook formula calls exp(-Z), which overflows for large negative Z.
  Here exp() is only ever applied to a non-positive number, so no overflow
  can occur for any finite input.

  Parameters
  ----------
  Z : scalar or array-like
      Pre-activation value(s).

  Returns
  -------
  Scalar (for scalar input) or ndarray with the same shape as Z, with values
  in [0, 1].
  """
  Z = np.asarray(Z)
  # exp of a non-positive number lies in (0, 1]: it can underflow to 0, never overflow
  E = np.exp(-np.abs(Z))
  # Z >= 0 : 1 / (1 + e^-Z)      Z < 0 : e^Z / (1 + e^Z)   (algebraically identical)
  S = np.where(Z >= 0, 1 / (1 + E), E / (1 + E))
  # np.where wraps scalars in a 0-d array; unwrap so scalar in -> scalar out
  return S[()] if S.ndim == 0 else S

In [None]:
def propagation(X,y,W,b) :
  """One forward/backward pass of logistic regression.

  Parameters
  ----------
  X : ndarray, one sample per column (the sample count is read from X.shape[1]).
  y : ndarray of 0/1 labels, broadcastable against the activations
      (the notebook passes shape (1, m)).
  W : ndarray of weights such that W.T @ X is defined
      (the notebook passes shape (n_features, 1)).
  b : scalar bias.

  Returns
  -------
  A    : sigmoid activations, i.e. the predicted probabilities.
  cost : mean binary cross-entropy over the m samples.
  dW   : gradient of the cost with respect to W.
  db   : gradient of the cost with respect to b.
  """

  m = X.shape[1]

  # forward propagation
  Z = np.dot(W.T,X) + b
  A = sigmoid(Z)

  # cost function
  # Clip only inside the logarithms: a saturated activation (exactly 0 or 1)
  # would otherwise give log(0) = -inf and turn the cost into inf/nan.
  # The gradients below deliberately keep using the unclipped A.
  eps = 1e-15
  A_safe = np.clip(A, eps, 1 - eps)
  cost  = -1 / m * np.sum(y * np.log(A_safe) + (1 - y) * np.log(1 - A_safe))

  # backward propagation
  dZ = A - y
  dW = 1/m * np.dot(X,dZ.T)
  db = 1/m * np.sum(dZ)

  return A,cost,dW,db

In [None]:
def fit(X,y,W,b,learning_rate,iterations) :
  """Train logistic regression with batch gradient descent.

  Parameters
  ----------
  X, y          : training data, forwarded unchanged to `propagation`.
  W, b          : initial weights and bias. They are NOT modified in place.
  learning_rate : gradient-descent step size.
  iterations    : number of gradient-descent steps.

  Returns
  -------
  (W_best, b_best) : the parameters that produced the lowest cost seen during
  training (the initial W, b if `iterations` is 0).
  """
  W_best = W
  b_best = b
  cost_best = np.inf

  # loop
  for step in range(iterations) :
    # cost and gradients evaluated at the *current* parameters
    _,cost,dW,db = propagation(X,y,W,b)

    # Keep the parameters that actually achieved this cost. This must happen
    # before the update: `cost` belongs to the pre-update W and b.
    if cost < cost_best :
      cost_best = cost
      W_best = W
      b_best = b

    # Update parameters. New objects are created instead of using `-=`, so the
    # snapshot held in W_best (and the caller's array) is never overwritten.
    W = W - learning_rate * dW
    b = b - learning_rate * db

    # track the train process
    if step % 1000 == 0 :
      print(f'Cost after {step} iterations: {cost}')

  print(f'Minimum Cost: {cost_best}')
  return W_best,b_best


In [None]:
def predict(X_test,W,b,threshold=0.5) :
  """Predict 0/1 labels with a trained logistic-regression model.

  Parameters
  ----------
  X_test    : ndarray with one sample per column, such that W.T @ X_test is defined.
  W, b      : trained weights and bias.
  threshold : probability above which a sample is labelled 1 (default 0.5,
              which reproduces the previous fixed behaviour).

  Returns
  -------
  1-D integer array of predictions, one entry per sample.
  """
  # compute A, the predicted probability for each sample
  Z = np.dot(W.T,X_test) + b
  A = sigmoid(Z)

  # convert from probability to 0 or 1
  y_pred = np.where(A > threshold,1,0)

  # flatten the (1, m) row into a 1-D vector
  return y_pred.flatten()

In [None]:
# Gradient-descent hyperparameters
learning_rate = 0.1
iterations = 6000

# Initial parameters: one small constant weight per feature row of X_train,
# plus a small scalar bias
W = np.full((X_train.shape[0],1), 0.0001)
b = 0.00001

# Train on the training split; labels are reshaped to a single row so they
# line up with the (1, m) activations
W_,b_ = fit(X_train,y_train.reshape(1,-1),W,b,learning_rate,iterations)

# Predict labels for the held-out split
y_pred = predict(X_test,W_,b_)

Cost after 0 iterations: 0.6931592839695893
Cost after 1000 iterations: 0.21647740839012258
Cost after 2000 iterations: 0.16881799185453045
Cost after 3000 iterations: 0.14706555351710085
Cost after 4000 iterations: 0.13367374298171164
Cost after 5000 iterations: 0.12428077487337578
Minimum Cost: 0.11720142643255961


In [None]:
# Evaluate the NumPy logistic regression on the test split.
# scikit-learn metrics expect the ground truth first: metric(y_true, y_pred).
f1 = f1_score(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
print(f'F1 Score: {f1}')
print(f'Accuracy Score: {acc}')

F1 Score: 0.9793103448275863
Accuracy Score: 0.9736842105263158


Quite good! You can go further by tuning the decision threshold, or by tracking both the training and the testing metrics during training, to choose suitable hyperparameters.

### Experiment with Neural Network

We will now try a small neural network on the same problem (neural networks are usually the better choice when the dataset is large).

In [None]:
def relu(Z) :
  """Rectified linear unit: element-wise maximum of 0 and Z."""
  floor = 0
  return np.maximum(floor, Z)

In [None]:
def propagation_NN(X,y,W1,b1,W2,b2) :
  """One forward/backward pass of a network with one ReLU hidden layer and a
  sigmoid output.

  Parameters
  ----------
  X      : ndarray, one sample per column (the sample count is read from X.shape[1]).
  y      : ndarray of 0/1 labels, broadcastable against the output activations
           (the notebook passes shape (1, m)).
  W1, b1 : hidden-layer weights and bias, used as W1 @ X + b1.
  W2, b2 : output-layer weights and bias, used as W2 @ A1 + b2.

  Returns
  -------
  A2   : output activations, i.e. the predicted probabilities.
  cost : mean binary cross-entropy over the m samples.
  dW1, db1, dW2, db2 : gradients of the cost with respect to each parameter.
  """

  m  = X.shape[1]

  # forward propagation
  Z1 = np.dot(W1,X) + b1
  A1 = relu(Z1)
  Z2 = np.dot(W2,A1) + b2
  A2 = sigmoid(Z2)

  # cost function
  # Clip only inside the logarithms: a saturated output (exactly 0 or 1) would
  # otherwise give log(0) = -inf and turn the cost into inf/nan.
  # The gradients below deliberately keep using the unclipped A2.
  eps = 1e-15
  A2_safe = np.clip(A2, eps, 1 - eps)
  cost = -1/m * np.sum(y * np.log(A2_safe) + (1 - y) * np.log(1 - A2_safe))

  # backward propagation
  dZ2 = A2 - y
  dW2 = 1/m * np.dot(dZ2,A1.T)
  db2 = 1/m * np.sum(dZ2,axis=1,keepdims=True)
  # ReLU derivative: 1 where the pre-activation was positive, 0 elsewhere
  dZ1 = np.dot(W2.T,dZ2) * np.where(Z1 > 0,1,0)
  dW1 = 1/m * np.dot(dZ1,X.T)
  db1 = 1/m * np.sum(dZ1,axis=1,keepdims=True)

  return A2,cost,dW1,db1,dW2,db2

In [None]:
def fit_NN(X,y,W1,b1,W2,b2,learning_rate,iterations) :
  """Train the one-hidden-layer network with batch gradient descent.

  Parameters
  ----------
  X, y           : training data, forwarded unchanged to `propagation_NN`.
  W1, b1, W2, b2 : initial parameters. They are NOT modified in place.
  learning_rate  : gradient-descent step size.
  iterations     : number of gradient-descent steps.

  Returns
  -------
  dict with keys 'W1', 'b1', 'W2', 'b2' holding the parameters that produced
  the lowest cost seen during training (empty if no finite cost was recorded,
  e.g. when `iterations` is 0).
  """
  best_parameters = {}
  best_cost = np.inf

  # loop
  for step in range(iterations) :
    # cost and gradients evaluated at the *current* parameters
    _,cost,dW1,db1,dW2,db2 = propagation_NN(X,y,W1,b1,W2,b2)

    # Keep the parameters that actually achieved this cost. This must happen
    # before the update: `cost` belongs to the pre-update parameters.
    if cost < best_cost :
      best_cost = cost
      best_parameters = {'W1':W1,'b1':b1,'W2':W2,'b2':b2}

    # Update parameters by gradient descent. New arrays are created instead of
    # using `-=`, so the arrays stored in best_parameters (and the caller's
    # arrays) are never overwritten by later steps.
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1

    # track the train process
    if step % 1000 == 0 :
      print(f'Cost after {step} iterations: {cost}')
  print(f'Minimum Cost: {best_cost}')
  return best_parameters

In [None]:
def predict_NN(X_test,parameters,threshold=0.5) :
  """Predict 0/1 labels with the trained one-hidden-layer network.

  Parameters
  ----------
  X_test     : ndarray with one sample per column, such that W1 @ X_test is defined.
  parameters : dict with keys 'W1', 'b1', 'W2', 'b2'.
  threshold  : probability above which a sample is labelled 1 (default 0.5,
               which reproduces the previous fixed behaviour).

  Returns
  -------
  1-D integer array of predictions (the flattened output activations after
  thresholding).
  """
  # get parameters
  W1 = parameters['W1']
  b1 = parameters['b1']
  W2 = parameters['W2']
  b2 = parameters['b2']

  # forward
  Z1 = np.dot(W1,X_test) + b1
  A1 = relu(Z1)
  Z2 = np.dot(W2,A1) + b2
  A2 = sigmoid(Z2)

  # convert from probability to 0 or 1, then flatten to a 1-D vector
  y_pred = np.where(A2 > threshold,1,0)
  return y_pred.flatten()

In [None]:
# initialize hyperparameters
n0 = X_train.shape[0] # the number of input features (rows of X_train)
n1 = 3 # the number of the neurons of hidden layer 1
n2 = 1 # the number of the neurons of output layer
learning_rate_NN = 0.1
iterations_NN = 2000

# Seed NumPy's global RNG so the random initialization below, and therefore
# the whole training run, is reproducible on Restart & Run All.
np.random.seed(28)

# initialize parameters
# Small random weights rather than zeros: all-zero weights would make every
# hidden neuron compute the same thing and receive the same gradient.
W1 = np.random.randn(n1,n0) * 0.0001
b1 = np.zeros((n1,1))
W2 = np.random.randn(n2,n1) * 0.0001
b2 = np.zeros((n2,1))

# fit data
parameters = fit_NN(X_train,y_train.reshape(1,-1),W1,b1,W2,b2,learning_rate_NN,iterations_NN)

# predict
y_pred_nn = predict_NN(X_test,parameters)

Cost after 0 iterations: 0.6931471806412166
Cost after 1000 iterations: 0.4257947183278817
Minimum Cost: 0.07951259045178963


In [None]:
# Evaluate the neural network on the test split.
# scikit-learn metrics expect the ground truth first: metric(y_true, y_pred).
f1_nn = f1_score(y_test,y_pred_nn)
acc_nn = accuracy_score(y_test,y_pred_nn)
print(f'F1 Score: {f1_nn}')
print(f'Accuracy Score: {acc_nn}')

F1 Score: 0.9793103448275863
Accuracy Score: 0.9736842105263158


Well, the results are very similar to those of logistic regression.