In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
df_x = pd.read_csv('spambase.data', header=None, engine='python')
df_x = df_x.sample(frac=1, random_state=0)
df_y = pd.DataFrame(data=df_x[df_x.columns[-1]])
df_x.drop(df_x.columns[[-1,]], axis=1, inplace=True)

In [3]:
df_x_train, df_x_test, df_y_train, df_y_test = tts(df_x, df_y, test_size=0.33)

In [4]:
mean = df_x_train.mean(axis=0)
std = df_x_train.std(axis=0)
df_x_s_train = df_x_train.subtract(mean).divide(std)
x = np.hstack([np.ones((df_x_s_train.shape[0], 1)), df_x_s_train.values])
y = df_y_train.values

In [5]:
eta = 0.01
i = 0
term = 2 ** -23
thetas = np.random.uniform(-1, 1, (x.shape[1], 1))
def sigmoid(x, thetas):
    return 1/(1 + np.exp(-1 * (x @ thetas)))
def cost(x, y, thetas):
    hypothesis = sigmoid(x, thetas)
    return -1/x.shape[0] * (y.T @ np.log(hypothesis) + np.subtract(1, y).T @ np.subtract(1, np.log(hypothesis)))
prev_cost = 0
while (i < 1500):
    curr_cost = cost(x, y, thetas)
    gradient = x.T @ np.subtract(sigmoid(x, thetas), y)
    thetas -= (eta/x.shape[0] * gradient)
    if np.abs(curr_cost - prev_cost) < term:
        break;
    prev_cost = curr_cost
    i += 1

In [6]:
x_s_test = df_x_test.subtract(mean).divide(std)
x_test = np.hstack([np.ones((df_x_test.shape[0], 1)), x_s_test.values])
y_test_p = np.array([1 if i > .5 else 0 for i in (x_test @ thetas)])
y_test_p = y_test_p.reshape(y_test_p.shape[0], 1)
print(confusion_matrix(df_y_test, y_test_p))
tp, tn, fp, fn = 0, 0, 0, 0
for i in range(y_test_p.shape[0]):
    if y_test_p[i] == df_y_test.values[i]:
        if y_test_p[i] == 1:
            tp += 1
        else:
            tn += 1
    else:
        if y_test_p[i] == 1:
            fp += 1
        else:
            fn += 1
accuracy = (tp + tn)/y_test_p.shape[0]
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print(tp, tn, fp, fn)
print(accuracy, precision, recall, f1)

[[861  62]
 [105 491]]
491 861 62 105
0.8900592495062541 0.8878842676311031 0.8238255033557047 0.8546562228024369


In [7]:
lgr = LogisticRegression(penalty='none', solver='lbfgs', max_iter=1500)
lgr.fit(df_x_s_train, y.ravel())
df_x_s_test = df_x_test.subtract(mean).divide(std)
y_pred = lgr.predict(df_x_s_test)
print(confusion_matrix(df_y_test, y_pred))

[[872  51]
 [ 73 523]]
