In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine epsilon of Python's float type. The metric and cost helpers below add it to
# their denominators so that an all-zero count or empty input divides by a tiny number
# instead of raising ZeroDivisionError.
eps = np.finfo(float).eps

In [None]:
# Load the admissions data. `dataSet` keeps every column (it feeds the exploratory plots
# at the end of the notebook); `randomDataSet` is the modelling frame derived from it.
dataSet = pd.read_csv("./../input_data/AdmissionDataset/data.csv")

# Name of the regression target column.
Class = "Chance of Admit"

# Replace the CSV header with canonical names.
# NOTE(review): assumes the file has exactly these nine columns in this order - confirm.
columns = ['Serial No.' , 'GRE Score' , 'TOEFL Score' , 'University Rating' , 'SOP' , 'LOR' , 'CGPA' , 'Research' , Class]
dataSet.columns = columns

# Drop the row identifier; from here on `columns` lists the features followed by the target.
columns = columns[1:]

# Take an explicit copy: the next cells overwrite columns of this frame in place, and doing
# that on a bare column subset of `dataSet` triggers pandas' SettingWithCopyWarning.
# Rows stay in file order; to shuffle, chain `.sample(frac=1).reset_index(drop=True)` here.
randomDataSet = dataSet[columns].copy()


In [None]:
# randomDataSet

In [None]:
# Min-max scale every feature column (all columns except the target) into [0, 1].
for attr in columns[:-1]:
    col_min = randomDataSet[attr].min()
    col_range = randomDataSet[attr].max() - col_min
    if col_range == 0:
        # A constant column would otherwise become 0/0 = NaN and poison every later
        # cost/gradient computation; map it to 0 instead.
        randomDataSet[attr] = 0.0
    else:
        randomDataSet[attr] = (randomDataSet[attr] - col_min)/col_range
# randomDataSet.head()


In [None]:
# 80/20 split in row order: the first 80% of rows train the model, the rest validate it.
# Positional slicing replaces np.split(DataFrame, ...), which relies on the deprecated
# DataFrame.swapaxes and emits a FutureWarning on recent pandas versions.
split_at = int(0.8*len(randomDataSet))
trainingSet = randomDataSet.iloc[:split_at]
validationSet = randomDataSet.iloc[split_at:]


In [None]:
# validationSet

In [None]:
# Gradient-descent hyperparameters; both are read as globals further down.
alpha = 0.01  # learning rate: scales each weight update in gradientDecent
iterate = 1000  # number of passes the training loop performs

In [None]:
def hypothesis_hx(theta, row):
    """Evaluate the linear hypothesis h(x) for one sample.

    Parameters
    ----------
    theta : sequence of float
        Model weights. The first ``len(theta) - 1`` entries multiply the
        features; the last entry is the bias (intercept).
    row : sequence of float or pandas.Series
        Sample values. Only the first ``len(theta) - 1`` positions are read,
        so trailing entries (e.g. the target column) are ignored.

    Returns
    -------
    float
        ``sum(theta[i] * row[i] for i < n - 1) + theta[n - 1]``.
    """
    n = len(theta)
    # Convert to plain arrays so that access is purely positional. Indexing a
    # string-labelled Series with integer keys (row[i]) is deprecated in pandas.
    weights = np.asarray(theta, dtype=float)
    features = np.asarray(row, dtype=float)[:n - 1]
    return np.dot(weights[:n - 1], features) + weights[n - 1]

In [None]:
def error(df, theta):
    """Return the per-row residuals ``h(x) - y`` for every row of ``df``.

    ``df`` is iterated row by row; the prediction comes from ``hypothesis_hx``
    and the true value is read from the column named by the global ``Class``.
    The result is a list in row order.
    """
    return [
        hypothesis_hx(theta, sample) - sample[Class]
        for _, sample in df.iterrows()
    ]

In [None]:
def costFunction(df, theta):
    """Compute the halved mean squared error of ``theta`` on ``df``.

    Returns a tuple ``(cost, residuals)`` where ``residuals`` is the list
    produced by ``error(df, theta)`` and ``cost`` is the sum of squared
    residuals divided by ``2*m + eps`` (``m`` = number of residuals).
    """
    residuals = error(df, theta)
    squared_total = sum(residual**2 for residual in residuals)
    return squared_total/(2*len(residuals) + eps), residuals

In [None]:
def sumError(df, error_list, j):
    """Return the averaged gradient term for weight ``j``.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples, one per row. Column ``j`` holds the feature paired with
        weight ``j``; the last column position corresponds to the bias weight.
    error_list : sequence of float
        Residuals ``h(x) - y`` for the first ``len(error_list)`` rows of ``df``.
    j : int
        Position of the weight whose gradient term is wanted.

    Returns
    -------
    float
        ``sum(error_i * x_ij) / (m + eps)``, or ``sum(error_i) / (m + eps)``
        when ``j`` is the last position (bias term).
    """
    m = len(error_list)
    errors = np.asarray(error_list, dtype=float)
    # The bias slot is the last column position of the frame itself; deriving it
    # from df avoids depending on a global `theta` existing when this is called.
    bias_index = df.shape[1] - 1
    if j == bias_index:
        total = errors.sum()
    else:
        # One positional column read replaces m separate df.iloc[i][j] look-ups.
        total = np.dot(errors, df.iloc[:m, j].to_numpy(dtype=float))
    return total/(m + eps)

In [None]:
def gradientDecent(df, theta, error_list):
    """Apply one gradient-descent step to ``theta`` and return it.

    Each weight is moved against its averaged gradient term (from
    ``sumError``) scaled by the global learning rate ``alpha``. ``theta`` is
    updated in place; the same object is returned.
    """
    for j, weight in enumerate(theta):
        # error_list was computed before any weight changed, so updating the
        # weights one by one still amounts to a simultaneous update.
        theta[j] = weight - alpha*sumError(df, error_list, j)
    return theta

In [None]:
def plotter(label_x, label_y, title, x_axis, y_axis, mark='', colr = 'blue'):
    """Draw one line chart in a fresh figure and show it.

    Parameters
    ----------
    label_x, label_y : str
        Axis labels.
    title : str
        Figure title.
    x_axis, y_axis : sequence
        Coordinates of the points to connect.
    mark : str, optional
        Matplotlib marker style; the default ``''`` draws no markers.
    colr : str, optional
        Line colour.
    """
    fig = plt.figure(num=None, figsize=(6, 4), dpi=175, facecolor='w', edgecolor='k')
    ax = fig.gca()
    # The series label is set but no legend is drawn.
    ax.plot(x_axis, y_axis, marker=mark, color=colr, label='Error rate')
    ax.set_xlabel(label_x)
    ax.set_ylabel(label_y)
    ax.set_title(title)
    ax.grid(True)
    plt.show()

In [None]:
# Fit the model with batch gradient descent.
# One weight per column of the training frame; the slot at the target column's
# position is used as the bias term by hypothesis_hx.
theta = np.zeros(trainingSet.shape[1])
print(theta)
iteration_list, cost_list = [], []
# Only needed by the early-stopping check described inside the loop, which is disabled.
before = 10
for i in range(iterate):
    # The cost recorded for step i is measured before that step's weight update.
    cost, error_list = costFunction(trainingSet, theta)
    theta = gradientDecent(trainingSet, theta, error_list)
    iteration_list.append(i)
    cost_list.append(cost)
    print(i, cost)
    # Disabled early stopping: break when `before - cost < 0.0001`, else set `before = cost`.
plotter('Iterations', 'Cost', 'Iterations-vs-Cost', iteration_list, cost_list)

In [None]:
def accuracy(true_positive , true_negative , false_negative, false_positive):
    """Return the percentage of correct predictions from confusion-matrix counts.

    ``eps`` in the denominator makes an all-zero input return 0.0 instead of
    raising ZeroDivisionError.
    """
    correct = true_positive + true_negative
    total = correct + false_positive + false_negative + eps
    return (correct*100)/total

In [None]:
def recall(true_positive , false_negative):
    """Return recall as a percentage: TP * 100 / (TP + FN + eps).

    ``eps`` keeps the division defined when both counts are zero (result 0.0).
    """
    actual_positives = true_positive + false_negative + eps
    return true_positive*100/actual_positives

In [None]:
def precision(true_positive , false_positive):
    """Return precision as a percentage: TP * 100 / (TP + FP + eps).

    ``eps`` keeps the division defined when both counts are zero (result 0.0).
    """
    predicted_positives = true_positive + false_positive + eps
    return true_positive*100/predicted_positives

In [None]:
def f1score(recall , prescision):
    """Return the harmonic mean of recall and precision.

    Both inputs are coerced to float and offset by ``eps`` before inversion,
    so zero inputs do not raise ZeroDivisionError.
    """
    inverse_recall = 1/(float(recall)+eps)
    inverse_precision = 1/(float(prescision)+eps)
    return 2/(inverse_recall + inverse_precision)

In [None]:
def valdidation(validationSet,theta):
    """Predict every row of ``validationSet`` and collect predictions and targets.

    For each row the prediction (from ``hypothesis_hx``) and the true value
    (column named by the global ``Class``) are printed on separate lines,
    followed by a dashed separator. Returns ``(predicted, actual)`` as two
    lists in row order.
    """
    predicted, actual = [], []
    for _, sample in validationSet.iterrows():
        estimate = hypothesis_hx(theta, sample)
        target = sample[Class]
        predicted.append(estimate)
        actual.append(target)
        print(estimate, target, "--------------", sep="\n")
    return predicted, actual


In [None]:
def threshold_changer(predicted, actual):
    """Sweep a decision threshold from 0.0 to 1.0 and score each setting.

    At every threshold both the predicted and the actual value are binarised
    (``value >= threshold`` counts as positive), a confusion matrix is
    accumulated, and recall and precision are computed from it.

    Parameters
    ----------
    predicted : sequence of float
        Model outputs.
    actual : sequence of float
        True target values, aligned with ``predicted``.

    Returns
    -------
    tuple of three lists
        ``(threshold_list, recall_list, precision_list)``, one entry per
        threshold 0.0, 0.1, ..., 1.0. Recall and precision are the values
        returned by the ``recall`` and ``precision`` helpers.
    """
    threshold_list = []
    precision_list = []
    recall_list = []
    # Build thresholds as k/10 rather than np.arange(0.0, 1.1, 0.1): accumulated float
    # steps yield values such as 0.7000000000000001, so a value of exactly 0.7 would be
    # classed as negative at the "0.7" threshold.
    for j in (k/10 for k in range(11)):
        true_positive = 0
        true_negative = 0
        false_negative = 0
        false_positive = 0
        for i in range(0, len(predicted)):
            predicted_positive = predicted[i] >= j
            actual_positive = actual[i] >= j
            if predicted_positive and actual_positive:
                true_positive += 1
            elif predicted_positive:
                false_positive += 1
            elif actual_positive:
                false_negative += 1
            else:
                true_negative += 1
        threshold_list.append(j)
        recall_list.append(recall(true_positive , false_negative))
        precision_list.append(precision(true_positive , false_positive))
    return threshold_list, recall_list, precision_list

In [None]:
# Score the held-out rows with the trained weights; valdidation also prints each
# prediction next to its true value.
predicted,actual = valdidation(validationSet,theta)


In [None]:
# Sweep the decision threshold over the validation predictions and collect
# recall / precision for each threshold.
threshold_list, recall_list, precision_list = threshold_changer(predicted, actual)
print(threshold_list)
print(recall_list)
print(precision_list)

# One chart per metric, each plotted against the threshold.
plotter('Threshold','recall','recall-vs-Threshold',threshold_list,recall_list)
plotter('Threshold','Precision','Precision-vs-Threshold',threshold_list,precision_list)


In [None]:
# Pairwise scatter plots of every column in the full data frame (this is `dataSet`,
# not the min-max scaled modelling frame).
sns.pairplot(dataSet)


In [None]:
# Annotated correlation matrix of every column in the full data frame.
# `linewidths` is seaborn's documented name for the cell-border width; the previous
# `linewidth` spelling only took effect through matplotlib's alias handling.
sns.heatmap(dataSet.corr(), linewidths=0.2, vmax=1.0, square=True, linecolor='red', annot=True)
