In [182]:
#import required modules
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [183]:
#read file from database
df = pd.read_csv('wdbc.data')
X = df.iloc[:,2:]
y = df[df.columns[1]]

y = [1 if res == "M" else 0 for res in y]

y = pd.DataFrame(y)


In [184]:
#Feature scaling
X = (X-X.mean())/X.std()    

X_train = X.iloc[0:454,:]
X_test = X.iloc[455:,:]
y_train = y.iloc[0:454,:]
y_test = y.iloc[455:,:]

In [185]:
#Helper function for traning and prediction
def sigmoid(z):
    return 1.0/(1 + np.exp(-z))

def loss(y, y_hat):
    loss = -np.mean(y*(np.log(y_hat)) - (1-y)*np.log(1-y_hat))
    return loss

def gradients(X, y, y_hat):
    
    # m-> number of training examples.
    m = X.shape[0]
    
    # Gradient of loss w.r.t weights.
    dw = (1/m)*np.dot(X.T, (y_hat - y))
    
    #return dw, db
    return dw

def train(X, y, batch_size, epochs, learning_rate):
        
    # m , n = no. of training examples and features
    m, n = X.shape
    
    # Initialize weights and bias to zeros.
    w = np.zeros((n,1))
    
    # Training loop.
    for epoch in range(epochs):
        for i in range((m-1)//batch_size + 1):
            # Defining batches for SGD
            start_i = i*batch_size
            end_i = start_i + batch_size
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            
            # Calculate hypothesis
            y_hat = sigmoid(np.dot(xb, w))
            
            # Get the gradients of loss w.r.t parameters
            dw = gradients(xb, yb,  y_hat)
            
            # Update the weights.
            w = w - learning_rate*dw
            
    return w


def predict(X,w):
    
    preds = sigmoid(np.dot(X, w))
    
    pred_class = []

    pred_class = [1 if i > 0.5 else 0 for i in preds]
    
    return np.array(pred_class)              

In [188]:
#Training and prediction and evaluation
w = train(X_train, y_train, 100, 1000, 0.01)

y_pred = predict(X_test, w)

Recall = recall_score(y_test,y_pred)
Precision = precision_score(y_test, y_pred)
Accuracy = accuracy_score(y_test, y_pred)
print("Recall: %f" % Recall)
print("Precision: %f" % Precision)
print("Accuracy: %f" % Accuracy)

Recall: 1.000000
Precision: 0.928571
Accuracy: 0.982301


In [193]:
#Using cross validation
for batch_size in [100,500,1000]:
        print("For batch size: %f  - " % batch_size)
        w = train(X_train, y_train, batch_size, 1000, 0.01)
        y_pred = predict(X_test, w) 
        Recall = recall_score(y_test,y_pred)
        Precision = precision_score(y_test, y_pred)
        Accuracy = accuracy_score(y_test, y_pred)
        print("Recall: %f" % Recall)
        print("Precision: %f" % Precision)
        print("Accuracy: %f" % Accuracy)       

For batch size: 100.000000  - 
Recall: 1.000000
Precision: 0.928571
Accuracy: 0.982301
For batch size: 500.000000  - 
Recall: 1.000000
Precision: 0.962963
Accuracy: 0.991150
For batch size: 1000.000000  - 
Recall: 1.000000
Precision: 0.962963
Accuracy: 0.991150
