In [1]:
import numpy as np
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


In [2]:
df = pd.read_csv('breast-cancer-wisconsin.data', header=None)
df.loc[:, 6].replace(['?'], 1, inplace=True)
df.loc[:, 6] = pd.to_numeric(df.loc[:, 6])
df.drop(axis = "rows", labels = df.index[df.duplicated()], inplace=True)
df.apply(pd.to_numeric, errors='ignore')
rows = df.shape[0]


In [3]:
X = df.loc[:,[1,2,3,4,5,6,7,8,9,10]]
Y = df.iloc[:,10]
Y = Y.replace(4,0)
Y = Y.replace(2,1)
# print(Y)
np.random.seed(9)
arr_rand = np.random.rand(X.shape[0])
split = arr_rand < np.percentile(arr_rand, 70)
X_train = X[split]
Y_train = Y[split]
X_test =  X[~split]
Y_test = Y[~split]


X_train.shape


(483, 10)

# LOGISTIC REGRESSION MULTIVARIATE GRADIENT DECENT

In [4]:
def logistic(z):
    return (1 + np.exp(-z))**(-1)


In [5]:
n_iter = 5000
lr = 0.01

In [6]:
length = X_train.shape[0]
X_input = X_train.iloc[:,0:9]
X_input.insert(9, "faltu", [1]*len(X_train), True)
X_input_test = X_test.iloc[:,0:9]
X_input_test.insert(9, "faltu", [1]*len(X_test), True)

beta = np.random.uniform(low = -10, high =10, size = 10)
for _ in range(n_iter):
    p = logistic(X_input@beta)
    gradient = X_input.T@(Y_train-p)
    beta += lr*gradient 

modi = np.array(logistic(X_input@beta))
count1 = 0 
count2 = 0

for i in range(len(modi)):
    if(modi[i]>=0.5):
        modi[i] = 1
        count1 = count1+1
    else:
        modi[i] = 0
        count2 = count2+1

from sklearn import metrics
print("Training data accuracy:-")
print(metrics.accuracy_score(Y_train, modi)*100)

modv = np.array(logistic(X_input_test@beta))
for i in range(len(modv)):
    if(modv[i]>=0.5):
        modv[i] = 1
        count1 = count1+1
    else:
        modv[i] = 0
        count2 = count2+1
print("Test data accuracy:-")
print(metrics.accuracy_score(Y_test, modv)*100)


  result = getattr(ufunc, method)(*inputs, **kwargs)


Training data accuracy:-
95.23809523809523
Test data accuracy:-
96.61835748792271


# LOGISTIC UNIVARIATE GRADIENT DECENT

In [7]:
Xn_iter = 5000
lr = 0.01

In [8]:
X_uni = X_train.iloc[:,2]
X_dummied = np.zeros((X_train.shape[0], 2))
X_dummied[:,1] = 1
X_dummied[:,0] = X_uni 

X_uni = X_test.iloc[:,2]
X_dummied_test = np.zeros((X_test.shape[0], 2))
X_dummied_test[:,1] = 1
X_dummied_test[:,0] = X_uni

beta = np.random.uniform(low = -10, high =10, size = 2)
for _ in range(n_iter):
    p = logistic(X_dummied@beta)
    gradient = X_dummied.T@(Y_train-p)
    beta += lr*gradient 
modi = np.array(logistic(X_dummied@beta))
count1 = 0 
count2 = 0

for i in range(len(modi)):
    if(modi[i]>=0.5):
        modi[i] = 1
        count1 = count1+1
    else:
        modi[i] = 0
        count2 = count2+1
from sklearn import metrics
print("Training data accuracy:-")
print(metrics.accuracy_score(Y_train, modi)*100)

modv = np.array(logistic(X_dummied_test@beta))
for i in range(len(modv)):
    if(modv[i]>=0.5):
        modv[i] = 1
        count1 = count1+1
    else:
        modv[i] = 0
        count2 = count2+1
print("Test data accuracy:-")
print(metrics.accuracy_score(Y_test, modv)*100)



Training data accuracy:-
89.23395445134575
Test data accuracy:-
94.20289855072464


# NAIVE BAYES UNIIVARIATE

In [9]:
b_p = 0.655
m_p = 0.345

In [10]:
r_b = X_train.loc[X_train.iloc[:,9] == 2] 
r_m = X_train.loc[X_train.iloc[:, 9] == 4]

In [11]:
Y_pred = []
Y_pred_tes = []

benign_column = r_b.iloc[:, 2]
malignant_column = r_m.iloc[:, 2]
b_mean = np.mean(benign_column)
b_standard = np.std(benign_column)
m_mean = np.mean(malignant_column)
m_standard = np.std(malignant_column)


for i in range(X_train.shape[0]):
    pB = b_p
    pM = m_p
    value = X_train.iat[i,2]
    pB*= 1/(b_standard * np.sqrt(2 * np.pi)) * np.exp( - (value - b_mean)*2 / (2 * b_standard*2))
    pM *= 1/(m_standard * np.sqrt(2 * np.pi)) * np.exp( - (value - m_mean)*2 / (2 * m_standard*2))
    if(pB>=pM):
        Y_pred.append(1)
    else:
        Y_pred.append(0)
print("Training data accuracy:-:", metrics.accuracy_score(Y_train, Y_pred)*100)


for i in range(X_test.shape[0]):
    pB = b_p
    pM = m_p
    value = X_test.iat[i,2]
    pB*= 1/(b_standard * np.sqrt(2 * np.pi)) * np.exp( - (value - b_mean)*2 / (2 * b_standard*2))
    pM *= 1/(m_standard * np.sqrt(2 * np.pi)) * np.exp( - (value - m_mean)*2 / (2 * m_standard*2))
    if(pB>=pM):
        Y_pred_tes.append(1)
    else:
        Y_pred_tes.append(0)
print("Testing data accuracy:-:", metrics.accuracy_score(Y_test, Y_pred_tes)*100)


Training data accuracy:-: 91.7184265010352
Testing data accuracy:-: 93.23671497584542


# MULTIvariate Naive 

In [12]:
def pro(mu, sigma, x):
    return 1/(sigma * np.sqrt(2 * np.pi)) * np.exp( - (x - mu)*2 / (2 * sigma*2))

In [13]:
b_muu = []
m_muu = []
b_standard = []
sigma_malignant = []
for c in r_b:
    MEAN = np.mean(r_b[c].values)
    b_muu.append(MEAN)

    SIGMA = np.std(r_b[c].values)
    b_standard.append(SIGMA)


for c in r_m:
    MEAN = np.mean(r_m[c].values)
    m_muu.append(MEAN)

    SIGMA = np.std(r_m[c].values)
    sigma_malignant.append(SIGMA)

sigma_malignant.pop()


print()




In [14]:

Y_pred = []

for i in range(X_train.shape[0]):
    pM = m_p
    pB = b_p
    for j in range(r_b.shape[1]-1):
        value = X_train.iat[i, j]
        pB *= pro(b_muu[j], b_standard[j], value)
        pM *= pro(m_muu[j], sigma_malignant[j], value)
    if(pB>=pM):
        Y_pred.append(1)
    else:
        Y_pred.append(0)

print("Training data accuracy:", metrics.accuracy_score(Y_train, Y_pred)*100)

Y_pred_tes = []
for i in range(X_test.shape[0]):
    pM = m_p
    pB = b_p
    for j in range(r_b.shape[1]-1):
        value = X_test.iat[i, j]
        pB *= pro(b_muu[j], b_standard[j], value)
        pM *= pro(m_muu[j], sigma_malignant[j], value)
    if(pB>=pM):
        Y_pred_tes.append(1)
    else:
        Y_pred_tes.append(0)


print("Test data accracy:", metrics.accuracy_score(Y_test, Y_pred_tes)*100)

Training data accuracy: 96.06625258799171
Test data accracy: 98.55072463768117


# MULTIVARIATE NEWTON

In [15]:
X_input = X_train.iloc[:,0:9]
X_input.insert(9, "faltu", [1]*len(X_train), True)
beta = np.random.uniform(low = -10, high =10, size = 10)
y_hat = logistic(X_input@beta)
y_hat = np.reshape(y_hat,-1)
S = np.diag(y_hat*(1-y_hat))
h = X_input.T@S
H = np.dot(h,X_input)
H = np.linalg.inv(H)

In [16]:
n_iter = 3000

for _ in range(n_iter):
    p = logistic(X_input@beta)
    gradient = X_input.T@(Y_train-p)
    beta += H@gradient

modi = np.array(logistic(X_input@beta))

for i in range(len(modi)):
    if(modi[i]>=0.5):
        modi[i] = 1
        count1 = count1+1
    else:
        modi[i] = 0
        count2 = count2+1

print("Training data accuracy:-")
print(metrics.accuracy_score(Y_train, modi)*100)

X_input_test = X_test.iloc[:,0:9]
X_input_test.insert(9, "faltu", [1]*len(X_test), True)
odv = np.array(logistic(X_input_test@beta))
for i in range(len(modv)):
    if(modv[i]>=0.5):
        modv[i] = 1
        count1 = count1+1
    else:
        modv[i] = 0
        count2 = count2+1
print("Test data accuracy:-")
print(metrics.accuracy_score(Y_test, modv)*100)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Training data accuracy:-
89.02691511387164
Test data accuracy:-
94.20289855072464


  result = getattr(ufunc, method)(*inputs, **kwargs)


# logistic univariate using newton

In [17]:
X_uni = X_train.iloc[:,2]
X_dummied = np.zeros((X_train.shape[0], 2))
X_dummied[:,1] = 1
X_dummied[:,0] = X_uni 

X_uni = X_test.iloc[:,2]
X_dummied_test = np.zeros((X_test.shape[0], 2))
X_dummied_test[:,1] = 1
X_dummied_test[:,0] = X_uni

beta = np.random.uniform(low = -10, high =10, size = 2)
y_hat = logistic(X_dummied@beta)
y_hat = np.reshape(y_hat,-1)
S = np.diag(y_hat*(1-y_hat))
h = X_dummied.T@S
H = np.dot(h,X_dummied)
H = np.linalg.inv(H)
for _ in range(n_iter):
    p = logistic(X_dummied@beta)
    gradient = X_dummied.T@(Y_train-p)
    beta += H@gradient 
modi = np.array(logistic(X_dummied@beta))
count1 = 0 
count2 = 0

for i in range(len(modi)):
    if(modi[i]>=0.5):
        modi[i] = 1
        count1 = count1+1
    else:
        modi[i] = 0
        count2 = count2+1
from sklearn import metrics
print("Training data accuracy:-")
print(metrics.accuracy_score(Y_train, modi)*100)

modv = np.array(logistic(X_dummied_test@beta))
for i in range(len(modv)):
    if(modv[i]>=0.5):
        modv[i] = 1
        count1 = count1+1
    else:
        modv[i] = 0
        count2 = count2+1
print("Test data accuracy:-")
print(metrics.accuracy_score(Y_test, modv)*100)

  return (1 + np.exp(-z))**(-1)


Training data accuracy:-
82.81573498964804
Test data accuracy:-
87.43961352657004
