In [1]:
import pandas as pd
import numpy as np

In [2]:
# Converting data to CSV

f = open("breast-cancer-wisconsin.data")
data = f.read()
data = data.split("\n")
mydict = {"index":[],"1":[],"2":[],"3":[],"4":[],"5":[],"6":[],"7":[],"8":[],"9":[],"label":[]}
df = pd.DataFrame(mydict)
for k in data[:-1]:
    row = []
    temp = k.split(",")
    for i in range(0,len(temp)):  
        row.append(temp[i])
    df.loc[len(df.index)] = row

df = df.drop("index", axis=1)
df.to_csv("breast-cancer-wisconsin.csv")

In [3]:
data = pd.read_csv("breast-cancer-wisconsin.csv")

# Removing "?" from the dataset
data = data[data["1"] != "?"]
data = data[data["2"] != "?"]
data = data[data["3"] != "?"]
data = data[data["4"] != "?"]
data = data[data["5"] != "?"]
data = data[data["6"] != "?"]
data = data[data["7"] != "?"]
data = data[data["8"] != "?"]
data = data[data["9"] != "?"]

data = data.sample(frac=1)

# Preparing the dataset
X = data[["1","2","3","4","5","6","7","8","9"]]
X = X.astype(int)
y = data["label"]
y = y.replace(2,1)
y = y.replace(4,0)

In [4]:
import copy
import numpy as np
from sklearn.metrics import accuracy_score

# Logistic regression class
class LogisticRegression():
    def __init__(self):
        self.losses = []
        self.train_accuracies = []

    def fit(self, x, y, epochs):
        x = self._transform_x(x)
        y = self._transform_y(y)

        self.weights = np.zeros(x.shape[1])
        self.bias = 0

        for i in range(epochs):
            x_dot_weights = np.matmul(self.weights, x.transpose()) + self.bias
            pred = self._sigmoid(x_dot_weights)
            loss = self.compute_loss(y, pred)
            error_w, error_b = self.compute_gradients(x, y, pred)
            self.update_model_parameters(error_w, error_b)

            pred_to_class = [1 if p > 0.5 else 0 for p in pred]
            self.train_accuracies.append(accuracy_score(y, pred_to_class))
            self.losses.append(loss)

    def compute_loss(self, y_true, y_pred):
        # binary cross entropy
        y_zero_loss = y_true * np.log(y_pred + 1e-9)
        y_one_loss = (1-y_true) * np.log(1 - y_pred + 1e-9)
        return -np.mean(y_zero_loss + y_one_loss)

    def compute_gradients(self, x, y_true, y_pred):
        # derivative of binary cross entropy
        difference =  y_pred - y_true
        gradient_b = np.mean(difference)
        gradients_w = np.matmul(x.transpose(), difference)
        gradients_w = np.array([np.mean(grad) for grad in gradients_w])

        return gradients_w, gradient_b

    def update_model_parameters(self, error_w, error_b):
        self.weights = self.weights - 0.1 * error_w
        self.bias = self.bias - 0.1 * error_b

    def predict(self, x):
        x_dot_weights = np.matmul(x, self.weights.transpose()) + self.bias
        probabilities = self._sigmoid(x_dot_weights)
        return [1 if p > 0.5 else 0 for p in probabilities]

    def _sigmoid(self, x):
        return np.array([self._sigmoid_function(value) for value in x])

    def _sigmoid_function(self, x):
        if x >= 0:
            z = np.exp(-x)
            return 1 / (1 + z)
        else:
            z = np.exp(x)
            return z / (1 + z)

    def _transform_x(self, x):
        x = copy.deepcopy(x)
        return x.values

    def _transform_y(self, y):
        y = copy.deepcopy(y)
        return y.values.reshape(y.shape[0], 1)

In [5]:
# Cross validation

X1 = X[  0:136]
X2 = X[136:272]
X3 = X[272:408]
X4 = X[408:544]
X5 = X[544:683]

y1 = y[  0:136]
y2 = y[136:272]
y3 = y[272:408]
y4 = y[408:544]
y5 = y[544:683]

In [6]:
# Fold 1
x_train = X2
X2 = X2.append(X3)
X2 = X2.append(X4)
X2 = X2.append(X5)

y_train = y2
y2 = y2.append(y3)
y2 = y2.append(y4)
y2 = y2.append(y5)

lr = LogisticRegression()
lr.fit(x_train, y_train, epochs=150)

pred = lr.predict(X1)
accuracy = accuracy_score(y1, pred)
print("Accuracy at Fold 1 is ",accuracy)

Accuracy at Fold 1 is  0.6764705882352942


In [7]:
# Fold 2
x_train = X1
X1 = X1.append(X3)
X1 = X1.append(X4)
X1 = X1.append(X5)

y_train = y1
y1 = y1.append(y3)
y1 = y1.append(y4)
y1 = y1.append(y5)

lr = LogisticRegression()
lr.fit(x_train, y_train, epochs=150)

pred = lr.predict(X2)
accuracy = accuracy_score(y2, pred)
print("Accuracy at Fold 2 is ",accuracy)

Accuracy at Fold 2 is  0.659963436928702


In [8]:
# Fold 4
x_train = X1
X1 = X1.append(X2)
X1 = X1.append(X4)
X1 = X1.append(X5)

y_train = y1
y1 = y1.append(y2)
y1 = y1.append(y4)
y1 = y1.append(y5)

lr = LogisticRegression()
lr.fit(x_train, y_train, epochs=150)

pred = lr.predict(X3)
accuracy = accuracy_score(y3, pred)
print("Accuracy at Fold 3 is ",accuracy)

Accuracy at Fold 3 is  0.6617647058823529


In [9]:
# Fold 4
x_train = X1
X1 = X1.append(X3)
X1 = X1.append(X2)
X1 = X1.append(X5)

y_train = y1
y1 = y1.append(y3)
y1 = y1.append(y2)
y1 = y1.append(y5)

lr = LogisticRegression()
lr.fit(x_train, y_train, epochs=150)

pred = lr.predict(X4)
accuracy = accuracy_score(y4, pred)
print("Accuracy at Fold 4 is ",accuracy)

Accuracy at Fold 4 is  0.6691176470588235


In [10]:
# Fold 5
x_train = X1
X1 = X1.append(X3)
X1 = X1.append(X4)
X1 = X1.append(X2)

y_train = y1
y1 = y1.append(y3)
y1 = y1.append(y4)
y1 = y1.append(y2)

lr = LogisticRegression()
lr.fit(x_train, y_train, epochs=150)

pred = lr.predict(X5)
accuracy = accuracy_score(y5, pred)
print("Accuracy at Fold 5 is ",accuracy)

Accuracy at Fold 5 is  0.7266187050359713


The peak accuracy of the logistic regression was 73% and a low of 65%