In [109]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import warnings

# Suppress every warning for the rest of the kernel session.
# NOTE(review): the filter is process-wide, so NumPy RuntimeWarnings raised by
# the cells below (e.g. overflow in exp, divide-by-zero in log) are hidden too.
warnings.filterwarnings('ignore')

In [8]:
EMAILS_CSV = 'data/emails.csv'
N_FEATURES = 3000  # number of feature columns; the label column follows them

# Raw table, header row and all columns included.
trainset = pd.read_csv(EMAILS_CSV)

# Numeric matrix: the header row and column 0 are skipped (column 0 is
# presumably a non-numeric identifier -- confirm), leaving the N_FEATURES
# feature columns followed by one label column.
traindata = np.loadtxt(
    EMAILS_CSV,
    delimiter=',',
    skiprows=1,
    usecols=range(1, N_FEATURES + 2),
)
EX_train, Ey_train = traindata[:, :N_FEATURES], traindata[:, N_FEATURES]

print(EX_train.shape, Ey_train.shape)

(5000, 3000) (5000,)


In [None]:
#Logistic Regression implementation

In [156]:
def sig(x):
    """
    Numerically stable logistic sigmoid, 1 / (1 + exp(-x)).

    Parameters
    ----------
    x : scalar or array-like of real numbers.

    Returns
    -------
    A NumPy scalar for scalar input, otherwise an ndarray with the same shape
    as ``x``; every value lies in [0, 1].
    """
    z = np.asarray(x)
    # exp() is only ever applied to a non-positive argument, so it cannot
    # overflow: for z >= 0 use 1 / (1 + e^-z), for z < 0 the algebraically
    # equivalent e^z / (1 + e^z).
    e = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # np.where yields a 0-d array for scalar input; unwrap it to a scalar.
    return out[()] if out.ndim == 0 else out

def calc_gradient(X, y, y_hat, m):
    """
    Gradient of the mean cross-entropy loss with respect to the weights.

    Computes (1/m) * X^T (y_hat - y).

    Parameters
    ----------
    X     : (m, n) feature matrix.
    y     : true labels, either (m,) or (m, 1).
    y_hat : predicted probabilities, either (m,) or (m, 1).
    m     : number of samples used to average the gradient.

    Returns
    -------
    Gradient shaped (n, 1) when ``y_hat`` is a column, (n,) when it is 1-D.
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    # A 1-D y against a column y_hat (or vice versa) would broadcast the
    # residual to (m, m) and silently yield an (n, m) "gradient". When both
    # hold the same number of elements, lay y out like y_hat first.
    if y.shape != y_hat.shape and y.size == y_hat.size:
        y = y.reshape(y_hat.shape)
    residual = y_hat - y
    grad = (1/m)*np.dot(X.T, residual)
    return grad


def cost_func(y, y_hat, m, eps=1e-15):
    """
    Mean binary cross-entropy loss.

    Computes -(1/m) * sum(y * log(y_hat) + (1 - y) * log(1 - y_hat)).

    Parameters
    ----------
    y     : true labels, either (m,) or (m, 1).
    y_hat : predicted probabilities, either (m,) or (m, 1).
    m     : number of samples used to average the loss.
    eps   : probabilities are clipped to [eps, 1 - eps] before the logarithm.

    Returns
    -------
    The loss as a NumPy float.
    """
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    # Mixing a 1-D y with a column y_hat would broadcast every term to (m, m)
    # and inflate the sum; lay y out like y_hat when the sizes agree.
    if y.shape != y_hat.shape and y.size == y_hat.size:
        y = y.reshape(y_hat.shape)
    # A saturated prediction (exactly 0 or 1) makes log() return -inf and
    # 0 * -inf evaluates to nan, so keep probabilities strictly inside (0, 1).
    p = np.clip(y_hat, eps, 1 - eps)
    cost = (-1/m) * np.sum((y * np.log(p)) + (1 - y) * np.log(1 - p))
    return cost

def fit_log_reg(X, y, itr, lr):
    """
    Train logistic regression with full-batch gradient descent.

    Parameters
    ----------
    X   : (m, n) feature matrix.
    y   : m binary labels, given as (m,) or (m, 1).
    itr : number of gradient-descent iterations.
    lr  : learning rate (step size).

    Returns
    -------
    theta     : (n, 1) weight vector after ``itr`` updates (starts at zeros).
    cost_list : list of ``itr`` loss values, each measured right after the
                corresponding weight update.

    Raises
    ------
    ValueError if ``y`` does not contain exactly m elements.
    """
    m,n = X.shape
    # theta is a column, so X @ theta is (m, 1). A 1-D y would broadcast
    # (y_hat - y) to (m, m) and turn theta into an (n, m) matrix after the
    # first update; force y into the matching column layout once, up front.
    y = np.asarray(y).reshape(m, 1)
    theta = np.zeros((n,1))
    cost_list = []

    for _ in range(itr):
        y_hat = sig(np.dot(X, theta))
        grad = calc_gradient(X, y, y_hat, m)

        theta = theta - (lr * grad)
        # Loss is evaluated with the freshly updated weights.
        cost = cost_func(y, sig(np.dot(X, theta)), m)
        cost_list.append(cost)
    return theta, cost_list
        
def predict(X, theta):
    """
    Predict binary class labels with a trained logistic-regression model.

    Parameters
    ----------
    X     : (m, n) feature matrix.
    theta : weight vector, (n, 1) or (n,).

    Returns
    -------
    1-D integer ndarray of m labels: 1 where the predicted probability is
    >= 0.5, otherwise 0.
    """
    # The helper defined in this file is `sig`; the previous call to
    # `sigmoid` referenced an undefined name and raised NameError.
    probabilities = np.asarray(sig(np.dot(X, theta)))
    # A column theta gives (m, 1) probabilities; flatten to (m,) so the result
    # compares element-wise against 1-D label vectors.
    if probabilities.ndim == 2 and probabilities.shape[1] == 1:
        probabilities = probabilities[:, 0]
    labels = (probabilities >= 0.5).astype(int)
    return labels

In [195]:
# K-fold cross-validation over contiguous, equally sized folds.
# With 5000 rows this yields 5 folds of 1000 rows each; if the row count is
# not a multiple of n_folds, the trailing remainder is only ever trained on.
n_folds = 5
fold_size = len(EX_train) // n_folds

for i in range(0, n_folds):
    print("FOLD: " + str(i))
    start, stop = i * fold_size, (i + 1) * fold_size
    X_test  = EX_train[start:stop]
    y_test  = Ey_train[start:stop]
    # Everything outside the held-out fold is used for training.
    indices = np.arange(start, stop, 1, dtype=int)
    X_train = np.delete(EX_train, indices, axis=0)
    y_train = np.delete(Ey_train, indices, axis=0)
    print(X_test.shape, y_test.shape, X_train.shape, y_train.shape)

    # Train logistic regression: 1000 gradient-descent iterations, lr = 0.01.
    theta,cost_list = fit_log_reg(X_train, y_train, 1000, 0.01)
    # Make predictions
    predictions = predict(X_test, theta)
    accuracy = np.sum(predictions == y_test)/(len(y_test))
    TP = np.sum(np.logical_and(predictions == 1, y_test == 1))
    TN = np.sum(np.logical_and(predictions == 0, y_test == 0))
    FP = np.sum(np.logical_and(predictions == 1, y_test == 0))
    FN = np.sum(np.logical_and(predictions == 0, y_test == 1))
    print ("Accuracy = " + str(accuracy), "TP = " + str(TP), "TN = " + str(TN),
           "FP = " + str(FP), "FN = " + str(FN))
    # Precision/recall are undefined when the denominator is zero (no positive
    # predictions / no positive labels in the fold); report nan explicitly
    # instead of dividing by zero.
    precision = float(TP/(TP+FP)) if (TP+FP) > 0 else float('nan')
    recall = float(TP/(TP+FN)) if (TP+FN) > 0 else float('nan')
    print ("Precision = " + str(precision), "Recall = " + str(recall))

FOLD: 0
(1000, 3000) (1000,) (4000, 3000) (4000,)
Accuracy = 0.915 TP = 229 TN = 686 FP = 29 FN = 56
Precision = 0.8875968992248062 Recall = 0.8035087719298246
FOLD: 1
(1000, 3000) (1000,) (4000, 3000) (4000,)
Accuracy = 0.893 TP = 214 TN = 679 FP = 44 FN = 63
Precision = 0.8294573643410853 Recall = 0.7725631768953068
FOLD: 2
(1000, 3000) (1000,) (4000, 3000) (4000,)
Accuracy = 0.888 TP = 221 TN = 667 FP = 49 FN = 63
Precision = 0.8185185185185185 Recall = 0.778169014084507
FOLD: 3
(1000, 3000) (1000,) (4000, 3000) (4000,)
Accuracy = 0.834 TP = 139 TN = 695 FP = 11 FN = 155
Precision = 0.9266666666666666 Recall = 0.47278911564625853
FOLD: 4
(1000, 3000) (1000,) (4000, 3000) (4000,)
Accuracy = 0.851 TP = 213 TN = 638 FP = 56 FN = 93
Precision = 0.79182156133829 Recall = 0.696078431372549
