In [7]:
import sys
sys.path.append("../Python")
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import PyQt5
import math
from MyDescribe import MyDescribe
from MyHistogram import MyHistogram


In [54]:
# Load the dataset.
# Column names are grouped by variant ("Mean", "SE", "Worst") plus the label column.
gaugeColumnList = {
    "Mean": [
        "Mean Radius", "Mean Texture", "Mean Perimeter", "Mean Area", "Mean Smoothness",
        "Mean Compactness", "Mean Concavity", "Mean Concave points", "Mean Symmetry",
        "Mean Fractal dimension",
    ],
    "SE": [
        "Radius SE", "Texture SE", "Perimeter SE", "Area SE", "Smoothness SE",
        "Compactness SE", "Concavity SE", "Concave points SE", "Symmetry SE",
        "Fractal dimension SE",
    ],
    "Worst": [
        "Worst Radius", "Worst Texture", "Worst Perimeter", "Worst Area", "Worst Smoothness",
        "Worst Compactness", "Worst Concavity", "Worst Concave points", "Worst Symmetry",
        "Worst Fractal dimension",
    ],
    "Result": ["Diagnosis"],
}

# Model inputs: every "Mean", "SE" and "Worst" column, in that order.
featureNames = gaugeColumnList["Mean"] + gaugeColumnList["SE"] + gaugeColumnList["Worst"]

# Full column layout assigned to the file: identifier, label, then the features.
columnNames = ["ID number", "Diagnosis"] + featureNames

# header=None: the first line of the file is read as data, and the names above are applied.
dataset_training = pd.read_csv("../Data/data.csv", header=None, names=columnNames)


In [9]:
# Attach a colour name to every row, looked up from its "Diagnosis" code.
colorList = ["red", "yellow"]
colors = {"M": colorList[0], "B": colorList[1]}
# Row-wise lookup: a diagnosis code missing from `colors` raises KeyError.
dataset_training["Color"] = dataset_training.apply(
    lambda row: colors[row["Diagnosis"]],
    axis=1,
)

In [84]:
def sigmoid(z):
    """Compute the logistic function 1 / (1 + exp(-z)) element-wise.

    The result is clipped to [EPSILON, 1 - EPSILON] so that both log(s) and
    log(1 - s) stay finite when the value is used in a cross-entropy cost.

    Parameters
    ----------
    z : scalar or array-like of numbers
        Input value(s).

    Returns
    -------
    numpy scalar or ndarray
        Values strictly between 0 and 1, with the same shape as ``z``.
    """
    EPSILON = 0.0000001
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1], so it can never overflow, unlike exp(-z) for
    # large negative z.
    e = np.exp(-np.abs(z))
    # Two algebraically equivalent forms of the logistic function; each is
    # used on the half-line where it is numerically accurate.
    s = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Symmetric guard against exact 0 / 1. An epsilon inside the denominator
    # would bias every output (for example sigmoid(0) would not be 0.5).
    return np.clip(s, EPSILON, 1.0 - EPSILON)

In [85]:
def initialize_with_zeros(dim):
    """Return zero-valued starting parameters.

    Parameters
    ----------
    dim : int
        Number of rows of the weight column vector.

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is a float ndarray of zeros with shape
        ``(dim, 1)`` and ``b`` is the Python float ``0.0``.
    """
    weights = np.zeros((dim, 1), dtype=float)
    bias = 0.0
    return weights, bias

In [86]:
def propagate(w, b, X, Y):
    """Run one forward/backward pass of logistic regression.

    Parameters
    ----------
    w : ndarray
        Weight column vector; ``w.T.dot(X)`` must be defined.
    b : float
        Bias added to every linear activation.
    X : ndarray
        Input matrix; ``X.shape[1]`` is used as the number of samples ``m``.
    Y : ndarray
        Target labels, broadcastable against the activations.

    Returns
    -------
    tuple
        ``(grads, cost)`` where ``grads`` is ``{"dw": ..., "db": ...}`` (the
        gradients of the cost with respect to ``w`` and ``b``) and ``cost``
        is the mean cross-entropy over the ``m`` samples.
    """
    m = X.shape[1]
    Y_hat = sigmoid(w.T.dot(X) + b)

    # Clip for the logarithms only: an activation that saturates at exactly
    # 0 or 1 would give log(0) = -inf, and 0 * -inf turns the whole cost into
    # NaN. The bounds are the tightest representable ones, so activations
    # that are not saturated are left unchanged.
    lower = np.finfo(float).tiny
    upper = 1.0 - np.finfo(float).eps
    Y_safe = np.clip(Y_hat, lower, upper)
    cost = -1/m * np.sum(Y * np.log(Y_safe) + (1 - Y) * np.log(1 - Y_safe))

    # Gradients use the unclipped activations.
    residual = Y_hat - Y
    dw = 1/m * X.dot(residual.T)
    db = 1/m * np.sum(residual)
    grads = {"dw": dw,
             "db": db}
    return grads, cost

In [109]:
def optimize(w, b, X, Y, num_iterations=100, learning_rate=0.009, print_cost=False, cost_interval=5000):
    """Fit ``w`` and ``b`` by batch gradient descent.

    Parameters
    ----------
    w, b :
        Starting parameters. They are deep-copied, so the caller's objects
        are never modified.
    X, Y :
        Inputs and targets, forwarded unchanged to ``propagate``.
    num_iterations : int
        Number of gradient steps to take.
    learning_rate : float
        Step size applied to each gradient.
    print_cost : bool
        When true, print every recorded cost.
    cost_interval : int
        Record (and optionally print) the cost every ``cost_interval``
        iterations, starting with iteration 0.

    Returns
    -------
    tuple
        ``(params, grads, costs)``: ``params`` is ``{"w": ..., "b": ...}``
        after the final step; ``grads`` holds the gradients computed in the
        last iteration (``None`` entries if no iteration ran); ``costs`` is
        the list of recorded costs.

    Raises
    ------
    ValueError
        If ``cost_interval`` is smaller than 1.
    """
    if cost_interval < 1:
        raise ValueError("cost_interval must be a positive integer")

    w = copy.deepcopy(w)
    b = copy.deepcopy(b)
    costs = []
    # Defined up front so that a run with zero iterations still returns a
    # well-formed result instead of failing on unbound names.
    dw = None
    db = None
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        dw = grads["dw"]
        db = grads["db"]
        w -= learning_rate * dw
        b -= learning_rate * db
        if i % cost_interval == 0:
            costs.append(cost)
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))
    params = {"w": w,
              "b": b}
    grads = {"dw": dw,
             "db": db}
    return params, grads, costs

In [110]:
def predict(w, b, X):
    """Classify every column of ``X`` as 0 or 1.

    Parameters
    ----------
    w : ndarray
        Weights; reshaped to ``(X.shape[0], 1)`` before use.
    b : float
        Bias.
    X : ndarray
        Input matrix with one sample per column.

    Returns
    -------
    ndarray
        Float array of shape ``(1, X.shape[1])`` holding 1 where the sigmoid
        activation is strictly greater than 0.5 and 0 elsewhere.
    """
    sample_count = X.shape[1]
    column_weights = w.reshape(X.shape[0], 1)
    activations = sigmoid(column_weights.T.dot(X) + b)

    Y_prediction = np.zeros((1, sample_count))
    # Threshold the whole row at once; the boolean mask is stored as 1.0 / 0.0.
    Y_prediction[0, :] = activations[0, :] > 0.5
    return Y_prediction

In [111]:
def model(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False):
    """Train a logistic-regression classifier and predict on both data sets.

    Parameters
    ----------
    X_train, Y_train :
        Training inputs and labels; ``X_train.shape[0]`` sets the number of
        weights.
    X_test, Y_test :
        Inputs and labels used for the reported test accuracy.
    num_iterations, learning_rate, print_cost :
        Forwarded to ``optimize``. ``print_cost`` additionally enables the
        accuracy report printed here.

    Returns
    -------
    dict
        Keys ``"costs"``, ``"Y_prediction_test"``, ``"Y_prediction_train"``,
        ``"w"``, ``"b"``, ``"learning_rate"`` and ``"num_iterations"``.
    """
    initial_w, initial_b = initialize_with_zeros(X_train.shape[0])
    params, _, costs = optimize(
        initial_w, initial_b, X_train, Y_train, num_iterations, learning_rate, print_cost
    )
    w, b = params["w"], params["b"]

    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    if print_cost:
        # Accuracy is 100 minus the mean absolute prediction error, in percent.
        train_accuracy = 100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100
        test_accuracy = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100
        print("train accuracy: {} %".format(train_accuracy))
        print("test accuracy: {} %".format(test_accuracy))

    return {
        "costs": costs,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }

In [112]:
def normalizeData(data):
    """Min-max scale ``data`` along axis 0 into the range [0, 1].

    Parameters
    ----------
    data : array-like
        Values to scale; the minimum and maximum are taken along axis 0
        (per column for a 2-D input).

    Returns
    -------
    Same kind of object as ``data - min`` produces
        ``(data - min) / (max - min)``. A column whose values are all equal
        has a zero range; it is mapped to 0 rather than to NaN (0 / 0).
    """
    lowest = np.min(data, axis=0)
    span = np.max(data, axis=0) - lowest
    # Replace zero ranges by 1 so constant columns divide cleanly to 0.
    safe_span = np.where(span == 0, 1, span)
    return (data - lowest) / safe_span

In [113]:
# Feature matrix: one row per record, one column per entry of featureNames.
x = dataset_training[featureNames].to_numpy()
# Label column, kept two-dimensional because it is selected with a list of names.
result = dataset_training[gaugeColumnList["Result"]].to_numpy()
# Binary target: 1 where the diagnosis is "M", 0 otherwise.
y = (result == "M").astype(int)

In [119]:
# Scale every feature column, then show the shapes of the inputs and targets.
normalized = normalizeData(x)
print(normalized.shape, y.shape, sep="\n")

(569, 30)
(569, 1)


In [117]:
# Inputs are transposed so that each column is one sample.
# NOTE(review): the same arrays are passed as both the training and the test
# set, so the reported "test accuracy" is not a held-out estimate.
logistic_regression_model = model(
    normalized.T,
    y.T,
    normalized.T,
    y.T,
    num_iterations=100000,
    learning_rate=1,
    print_cost=True,
)

Cost after iteration 0: 0.693147
Cost after iteration 5000: 0.066001
Cost after iteration 10000: 0.057705
Cost after iteration 15000: 0.053952
Cost after iteration 20000: 0.051733
Cost after iteration 25000: 0.050225
Cost after iteration 30000: 0.049105
Cost after iteration 35000: 0.048221
Cost after iteration 40000: 0.047494
Cost after iteration 45000: 0.046878
Cost after iteration 50000: 0.046345
Cost after iteration 55000: 0.045874
Cost after iteration 60000: 0.045453
Cost after iteration 65000: 0.045072
Cost after iteration 70000: 0.044726
Cost after iteration 75000: 0.044407
Cost after iteration 80000: 0.044113
Cost after iteration 85000: 0.043839
Cost after iteration 90000: 0.043584
Cost after iteration 95000: 0.043344
train accuracy: 98.94551845342707 %
test accuracy: 98.94551845342707 %
