In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from collections import Counter

In [47]:
data = pd.read_csv("encoded_data.csv")
data

Unnamed: 0,gender,ssc_percentage,ssc_board,hsc_percentage,hsc_board,degree_percentage,work_experience,emp_test_percentage,specialisation,mba_percent,status,hsc_subject_Arts,hsc_subject_Commerce,hsc_subject_Science,undergrad_degree_Comm&Mgmt,undergrad_degree_Others,undergrad_degree_Sci&Tech
0,1,67.00,0,91.00,0,58.00,0,55.0,0,58.80,1,0,1,0,0,0,1
1,1,79.33,1,78.33,0,77.48,1,86.5,1,66.28,1,0,0,1,0,0,1
2,1,65.00,1,68.00,1,64.00,0,75.0,1,57.80,1,1,0,0,1,0,0
3,1,56.00,1,52.00,1,52.00,0,66.0,0,59.43,0,0,0,1,0,0,1
4,1,85.80,1,73.60,1,73.30,0,96.8,1,55.50,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,0,82.00,0,77.60,0,91.0,1,74.49,1,0,1,0,1,0,0
211,1,58.00,0,60.00,0,72.00,0,74.0,1,53.62,1,0,0,1,0,0,1
212,1,67.00,0,67.00,0,73.00,1,59.0,1,69.72,1,0,1,0,1,0,0
213,0,74.00,0,66.00,0,58.00,0,70.0,0,60.23,1,0,1,0,1,0,0


In [48]:
data.nunique()[data.nunique() > 2].index

Index(['ssc_percentage', 'hsc_percentage', 'degree_percentage',
       'emp_test_percentage', 'mba_percent'],
      dtype='object')

In [75]:
# Utility functions

def standardize_features(X: pd.DataFrame, columns: list=None):
    if not columns:
        columns = X.columns
    for col in columns:
        X[col] = (X[col] - np.mean(X[col])) / np.std(X[col])
    return X

def initialize(X: np.array):
    weights = np.zeros((np.shape(X)[1]+1, 1))
    X = np.c_[np.ones((np.shape(X)[0], 1)), X]
    return weights, X

#  In the below expression, z is the dot product of the mxn matrix containing observations and nx1 matrix of weights.
def sigmoid(z):
    sig = 1/(1+np.e**(-z))
    return sig

def fit(X, y, alpha=0.001, iter=100):
    weights, X = initialize(X)

    def cost(theta):
        z = np.dot(X, theta)
        cost0 = y.T.dot(np.log(sigmoid(z)))
        cost1 = (1 - y).T.dot(np.log(1 - sigmoid(z)))
        cost = -((cost1 + cost0)) / len(y) 
        return cost
    
    cost_list = np.zeros(iter)
    for i in range(iter):
        # weights = weights - alpha * np.dot(X.T, sigmoid(np.dot(X, weights)) - np.reshape(y, (len(y), 1)))
        step1 = sigmoid(np.dot(X, weights))
        step2 = np.reshape(y, (len(y), 1))
        step3 = np.dot(X.T, step1 - step2)
        weights = weights - alpha * step3
        cost_list[i] = cost(weights)
    return weights, cost_list

def predict(weights, X):
    z = np.dot(initialize(X)[1], weights)
    preds = []
    for i in sigmoid(z):
        if i > 0.5:
            preds.append(1)
        else:
            preds.append(0)
    return preds

In [50]:
def metrics(y: pd.Series, ypred: pd.Series):
    # Return accuracy, sensitivity, specificity, precision, f1-score
    tp = np.sum(np.logical_and(ypred == 1, y == 1))
    fp = np.sum(np.logical_and(ypred == 1, y == 0))
    tn = np.sum(np.logical_and(ypred == 0, y == 0))
    fn = np.sum(np.logical_and(ypred == 0, y == 1))

    accuracy = (tp + tn) / (tp + fp + tn + fn)
    if tp + fn == 0:
        sensitivity = 0
    else:
        sensitivity = tp / (tp + fn) # true positive rate, recall
    if tn + fp == 0:
        specificity = 0
    else:
        specificity = tn / (tn + fp) # true negative rate
    if tp + fp == 0:
        precision = 0
    else:
        precision = tp / (tp + fp)
    if precision + sensitivity == 0:
        f1 = 0
    else:
        f1 = 2 * ((precision * sensitivity) / (precision + sensitivity))
    return {
    "accuracy": accuracy,
    "sensitivity": sensitivity,
    "specificity": specificity,
    "precision" : precision,
    "f1": f1
    }

In [51]:
def train_test_split(data: pd.DataFrame, train_percent: float = 0.8):
    # Shuffle data
    data = data.sample(frac=1).reset_index(drop=True)
    n_train = int(len(data) * train_percent)
    train = data.iloc[:n_train, :]
    test = data.iloc[n_train:, :]
    return train, test

train_df, test_df = train_test_split(data, train_percent=0.8)
train_df

Unnamed: 0,gender,ssc_percentage,ssc_board,hsc_percentage,hsc_board,degree_percentage,work_experience,emp_test_percentage,specialisation,mba_percent,status,hsc_subject_Arts,hsc_subject_Commerce,hsc_subject_Science,undergrad_degree_Comm&Mgmt,undergrad_degree_Others,undergrad_degree_Sci&Tech
0,1,53.00,1,63.00,0,60.00,1,70.00,1,53.20,1,0,0,1,1,0,0
1,1,66.50,1,62.50,1,60.90,0,93.40,1,55.03,1,0,1,0,1,0,0
2,1,58.00,1,70.00,1,61.00,0,54.00,1,52.21,0,0,1,0,1,0,0
3,1,52.00,1,63.00,0,65.00,1,86.00,0,56.09,0,0,0,1,0,0,1
4,1,61.00,0,62.00,0,65.00,0,62.00,1,56.81,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,1,82.00,0,90.00,0,83.00,0,80.00,0,73.52,1,0,1,0,1,0,0
168,1,84.00,0,90.90,0,64.50,0,86.04,1,59.42,1,0,0,1,0,0,1
169,0,69.50,1,70.00,1,72.00,0,57.20,0,54.80,1,0,0,1,0,0,1
170,1,77.67,0,64.89,0,70.67,0,89.00,1,60.39,1,0,1,0,1,0,0


In [52]:
test_df

Unnamed: 0,gender,ssc_percentage,ssc_board,hsc_percentage,hsc_board,degree_percentage,work_experience,emp_test_percentage,specialisation,mba_percent,status,hsc_subject_Arts,hsc_subject_Commerce,hsc_subject_Science,undergrad_degree_Comm&Mgmt,undergrad_degree_Others,undergrad_degree_Sci&Tech
172,0,74.0,0,66.0,0,58.0,0,70.0,0,60.23,1,0,1,0,1,0,0
173,0,63.4,0,67.2,0,60.0,0,58.06,0,69.28,0,0,1,0,1,0,0
174,1,68.0,0,56.0,0,68.0,0,73.0,0,68.07,1,0,0,1,0,0,1
175,1,68.2,1,72.8,1,66.6,1,96.0,1,70.85,1,0,1,0,1,0,0
176,1,67.0,0,63.0,0,64.0,0,60.0,1,61.87,0,0,0,1,0,0,1
177,1,56.0,1,52.0,1,52.0,0,66.0,0,59.43,0,0,0,1,0,0,1
178,1,65.2,1,61.4,1,64.8,1,93.4,1,57.34,1,0,1,0,1,0,0
179,1,76.76,0,76.5,0,67.5,1,73.35,1,64.15,1,0,1,0,1,0,0
180,1,85.0,1,60.0,0,73.43,1,60.0,1,61.29,1,0,0,1,0,0,1
181,1,62.0,1,65.0,0,60.0,0,84.0,1,64.15,0,0,1,0,1,0,0


In [53]:
# Standardize continuous features

train_df_std = standardize_features(train_df, columns=['ssc_percentage', 'hsc_percentage', 'degree_percentage', 'emp_test_percentage', 'mba_percent'])
test_df_std = standardize_features(test_df, columns=['ssc_percentage', 'hsc_percentage', 'degree_percentage', 'emp_test_percentage', 'mba_percent'])

(172, 17)

In [55]:
X_train = train_df_std.drop("status", axis=1)
y_train = train_df_std["status"]
X_test = test_df_std.drop("status", axis=1)
y_test = test_df_std["status"]

In [76]:
weights, cost_list = fit(X_train.to_numpy(), y_train.to_numpy())
y_train_pred = predict(weights, X_train.to_numpy())
y_test_pred = predict(weights, X_test.to_numpy())


In [80]:
y_train_pred
y_test_pred

[1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1]

In [84]:
type(y_train_pred)

list

In [85]:
np.sum(np.logical_and(y_train == 1, pd.Series(y_train_pred) == 1))

112

In [87]:
print(metrics(y_train, pd.Series(y_train_pred)))
print(metrics(y_test, pd.Series(y_test_pred)))

{'accuracy': 0.872093023255814, 'sensitivity': 0.9411764705882353, 'specificity': 0.7169811320754716, 'precision': 0.8818897637795275, 'f1': 0.9105691056910569}
{'accuracy': 0.5, 'sensitivity': 0.5, 'specificity': 0.5, 'precision': 0.6744186046511628, 'f1': 0.5742574257425743}
