In [56]:
"""
An implementation for HW2.
This source code includes both Task 1 and Task 2.
This uses the logistic regression method with cross entropy cost and GD for optimization.
By 32190984 Isu Kim  @ github.com/gooday2die
"""

'\nAn implementation for HW2.\nThis source code includes both Task 1 and Task 2.\nThis uses the logistic regression method with cross entropy cost and GD for optimization.\nBy 32190984 Isu Kim  @ github.com/gooday2die\n'

In [2]:
import pandas as pd
import numpy as np

In [3]:
def sigmoid(x):
    """
    A function that implements the sigmoid function 1 / (1 + e^(-x)).

    For very negative x, e^(-x) overflows to +inf. The quotient 1 / (1 + inf)
    is still the correct limit (0.0), so the overflow is harmless here. It is
    silenced locally so that it does not surface as a RuntimeWarning; the
    returned values are exactly those of 1 / (1 + np.exp(-x)).

    @param x: The x value to calculate sigmoid function.
    @return: The calculated sigmoid value.
    """
    with np.errstate(over="ignore"):  # exp overflow -> inf -> result 0.0, which is correct.
        return 1 / (1 + np.exp(-x))

In [4]:
def linear_model(x, theta):
    """
    A function that calculates the linear combination of x and theta.
    The result is used as the exponent for sigmoid.

    Both parameters are indexed at positions 0, 1 and 2.
    @param x: The sequence [x_0, x_1, x_2]. Callers in this file pass
              (1, x_1, x_2), so that theta_0 acts as the bias term.
    @param theta: The sequence [theta_0, theta_1, theta_2].
    @return: theta_0 * x_0 + theta_1 * x_1 + theta_2 * x_2.
    """
    bias_term = theta[0] * x[0]
    first_term = theta[1] * x[1]
    second_term = theta[2] * x[2]
    return bias_term + first_term + second_term

In [5]:
def hypothesis(x, theta):
    """
    The model h_theta(x): sigmoid applied to the linear model.

    Both parameters are passed unchanged to linear_model.
    @param x: The sequence [x_0, x_1, x_2] for one sample.
    @param theta: The sequence [theta_0, theta_1, theta_2].
    @return: sigmoid(linear_model(x, theta)).
    """
    exponent = linear_model(x, theta)  # Linear score fed into the sigmoid.
    return sigmoid(exponent)

In [6]:
def get_gradient(x, y, theta, j):
    """
    A function that calculates the gradient of the cost for one specific theta_j.

    The value is the mean over all samples of (h_theta(x_i) - y_i) * x_i[j].

    @param x: All samples. x[i] is one sample in [x_0, x_1, x_2] format.
    @param y: All labels. y[i] is the label of x[i].
    @param theta: All theta values in [theta_0, theta_1, theta_2] format.
    @param j: The index of the theta to get the gradient for.
              For example, to update theta_1, use j = 1.
    @return: The calculated gradient value for the given j.
    """
    sample_count = len(x)
    weighted_error_sum = 0
    for idx in range(sample_count):
        sample = x[idx]
        error = hypothesis(sample, theta) - y[idx]  # Prediction minus label.
        weighted_error_sum = weighted_error_sum + error * sample[j]
    return weighted_error_sum / sample_count

In [7]:
def do_gradient_descent(max_iter, learning_rate, data_x, data_y, round_point):
    """
    A function that does gradient descent.
    This will use logistic regression with cost using cross entropy.

    Training starts from theta = [0, 0, 0]. Every iteration updates all three
    theta values simultaneously and stops early once the rounded theta values
    no longer change between two consecutive iterations.

    @param max_iter: The maximum iteration count for training. Use iterations like 1000000.
    @param learning_rate: The learning rate for gradient descent.
                          Very small values (like 0.01) take a long time to converge.
    @param data_x: The x data. Must support data_x["x1"] and data_x["x2"].
    @param data_y: The y data, in the same row order as data_x.
    @param round_point: The number of decimal places theta values are rounded to
                        before they are compared for the convergence check.
                        Lower values stop training earlier.
    @return: The list [theta_0, theta_1, theta_2].
    @raise ValueError: If x1, x2 and y do not contain the same number of rows.
    """
    # Read the columns as plain arrays so rows are paired with labels by position.
    # Indexing a column with [i] looks rows up by index label, which fails or
    # mis-pairs rows with list(data_y) when the index is not 0..N-1.
    x1_values = np.asarray(data_x["x1"])
    x2_values = np.asarray(data_x["x2"])
    y = list(data_y)
    if not (len(x1_values) == len(x2_values) == len(y)):
        raise ValueError("data_x and data_y must contain the same number of rows.")
    x = [(1, x1, x2) for x1, x2 in zip(x1_values, x2_values)]  # Store (1, x_1, x_2)

    theta = [0] * 3  # Initial value for thetas.

    for i in range(max_iter):  # Iterate for max_iter count
        if i % 100 == 0:  # In every 100 iterations, print theta. Just for outputs.
            print("[+] Iter : " + str(i) + " / Theta : " + str(theta))

        before_theta = theta

        # Every gradient is computed from before_theta, so theta_0 to theta_2
        # are updated simultaneously.
        theta = [
            before_theta[j] - learning_rate * get_gradient(x, y, before_theta, j)
            for j in range(3)
        ]

        # Check if convergence happened. If so, stop training.
        rounded_before = [round(value, round_point) for value in before_theta]
        rounded_after = [round(value, round_point) for value in theta]
        if rounded_before == rounded_after:
            print("Convergence!!! Stop training.")
            print("[+] Iter : " + str(i) + " / Theta : " + str(theta))
            break

    return theta

In [8]:
def predict(x, theta):
    """
    A function that predicts the label of one sample using theta values.

    The label is 1 when the linear model output is non-negative and 0 otherwise.
    @param x: The x value in [X_0, X_1, X_2] format.
    @param theta: The theta values to use.
    @return: Predicted label, either 1 or 0.
    """
    return 1 if linear_model(x, theta) >= 0 else 0

In [34]:
def calculate_accr(x_in, y_in, theta):
    """
    A function that calculates accuracy with theta.
    The accuracy is calculated by
    (total correct predictions) / (data count)

    Besides returning the accuracy, this prints one "[+] correct/total" line.

    @param x_in: The x values. Must support x_in["x1"] and x_in["x2"].
    @param y_in: The y values, in the same row order as x_in.
    @param theta: The theta values in [theta_0, theta_1, theta_2] format
    @return: Accuracy.
    @raise ValueError: If x1, x2 and y do not contain the same number of rows.
    """
    # Read the columns as plain arrays so rows are paired with labels by position.
    # Indexing a column with [i] looks rows up by index label, which fails or
    # mis-pairs rows with list(y_in) when the index is not 0..N-1.
    x1_values = np.asarray(x_in["x1"])
    x2_values = np.asarray(x_in["x2"])
    y = list(y_in)  # Store y values.
    if not (len(x1_values) == len(x2_values) == len(y)):
        raise ValueError("x_in and y_in must contain the same number of rows.")

    correct = 0
    for x1, x2, label in zip(x1_values, x2_values, y):
        # Each sample is (1, x_1, x_2); count it when the prediction matches the label.
        if predict((1, x1, x2), theta) == label:
            correct += 1
    print("[+] " + str(correct) + "/" + str(len(y)))
    return correct / len(y)  # Divide all corrects by all data.

In [42]:
def calculate_cost(x_in, y_in, theta):
    """
    A function that calculates the cross entropy cost with theta.
    The cost is calculated by

    -1/m sum i from 1 to m (y_i * ln(h_theta(x_i)) + (1 - y_i) * ln(1 - h_theta(x_i)))

    The hypothesis value is clipped to [eps, 1 - eps] (eps = machine epsilon)
    before taking the logarithm. Without it, a saturated hypothesis of exactly
    0.0 or 1.0 produces 0 * ln(0), which turns the whole cost into nan or inf.

    @param x_in: The x values. Must support x_in["x1"] and x_in["x2"].
    @param y_in: The y values, in the same row order as x_in.
    @param theta: The theta values in [theta_0, theta_1, theta_2] format
    @return: Cost.
    @raise ValueError: If x1, x2 and y do not contain the same number of rows.
    """
    # Read the columns as plain arrays so rows are paired with labels by position.
    # Indexing a column with [i] looks rows up by index label, which fails or
    # mis-pairs rows with list(y_in) when the index is not 0..N-1.
    x1_values = np.asarray(x_in["x1"])
    x2_values = np.asarray(x_in["x2"])
    y = list(y_in)  # Store y values.
    if not (len(x1_values) == len(x2_values) == len(y)):
        raise ValueError("x_in and y_in must contain the same number of rows.")

    eps = np.finfo(float).eps
    total = 0
    for x1, x2, label in zip(x1_values, x2_values, y):
        # Evaluate the hypothesis once per sample and keep it away from 0 and 1.
        h = np.clip(hypothesis((1, x1, x2), theta), eps, 1 - eps)
        total += (label * np.log(h)) + (1 - label) * np.log(1 - h)

    return -1 * total / len(y)

In [61]:
# Load the training set and split it into features and labels.
df_train = pd.read_csv("hw2_train.csv")
data_x = df_train.loc[:, ["x1", "x2"]]  # Feature columns x1 and x2.
data_y = df_train.loc[:, "y"]  # Label column y.

In [76]:
# Task 1 of homework 2.
# Train on hw2_train.csv and store the learned theta values in result.
# Be aware that training takes a long time to converge and exit.
# If you cannot wait for convergence, there are two options:
#
# 1. Use a lower rounding point (round_point):
#    Theta values are compared after rounding to that many decimal places, so
#    training is treated as converged as soon as those digits stop changing.
result = do_gradient_descent(
    max_iter=1000000,
    learning_rate=0.1,
    data_x=data_x,
    data_y=data_y,
    round_point=20,
)

[+] Iter : 0 / Theta : [0, 0, 0]
[+] Iter : 100 / Theta : [5.256908552735544e-01, -1.63893185177645, -0.4668425672979544]
[+] Iter : 200 / Theta : [8.527054821064965e-01, -2.0550728766618906, -0.5695286482503399]
[+] Iter : 300 / Theta : [1.0714828334921622e+00, -2.307553934017127, -0.6406898518980715]
[+] Iter : 400 / Theta : [1.2323390374015146e+00, -2.489813499785945, -0.6970598693275062]
[+] Iter : 500 / Theta : [1.357478497191946e+00, -2.6322290918990583, -0.7437116888570025]
[+] Iter : 600 / Theta : [1.4585901978632776e+00, -2.7486087193146416, -0.7832029975465276]
[+] Iter : 700 / Theta : [1.542533463823606e+00, -2.8464700374029035, -0.8171436735393784]
[+] Iter : 800 / Theta : [1.613650078165492e+00, -2.930402630172282, -0.8466544878581355]
[+] Iter : 900 / Theta : [1.6748488936806396e+00, -3.0034385161904593, -0.8725574430700208]
[+] Iter : 1000 / Theta : [1.7281675715917897e+00, -3.0676989160559422, -0.8954749448551055]
[+] Iter : 1100 / Theta : [1.7750858720878446e+00, -3.12

[+] Iter : 9100 / Theta : [2.286155448566839e+00, -3.7718542456505877, -1.1500559575425802]
[+] Iter : 9200 / Theta : [2.2863366330052703e+00, -3.7720902019065985, -1.1501424958158555]
[+] Iter : 9300 / Theta : [2.2865072293257804e+00, -3.7723123722973693, -1.1502239792897504]
[+] Iter : 9400 / Theta : [2.2866678583436766e+00, -3.7725215647887835, -1.1503007040769013]
[+] Iter : 9500 / Theta : [2.2868191042397967e+00, -3.7727185397074567, -1.1503729488463934]
[+] Iter : 9600 / Theta : [2.2869615167488755e+00, -3.772904012582059, -1.1504409758623535]
[+] Iter : 9700 / Theta : [2.287095613214163e+00, -3.7730786568114825, -1.1505050319594592]
[+] Iter : 9800 / Theta : [2.2872218805168534e+00, -3.773243106170857, -1.150565349459334]
[+] Iter : 9900 / Theta : [2.287340776888206e+00, -3.77339795716562, -1.1506221470315323]
[+] Iter : 10000 / Theta : [2.2874527336117856e+00, -3.7735437712431694, -1.150675630502574]
[+] Iter : 10100 / Theta : [2.2875581566227092e+00, -3.77368107687103, -1.1507

[+] Iter : 18000 / Theta : [2.2892463514271113e+00, -3.775879977216063, -1.1515326037660125]
[+] Iter : 18100 / Theta : [2.2892472174443825e+00, -3.7758811052906625, -1.1515330176016727]
[+] Iter : 18200 / Theta : [2.289248033031379e+00, -3.7758821676748227, -1.151533407338739]
[+] Iter : 18300 / Theta : [2.2892488011248218e+00, -3.775883168193914, -1.1515337743805447]
[+] Iter : 18400 / Theta : [2.2892495244904167e+00, -3.775884110450529, -1.1515341200487044]
[+] Iter : 18500 / Theta : [2.2892502057328046e+00, -3.775884997837468, -1.1515344455878658]
[+] Iter : 18600 / Theta : [2.289250847304949e+00, -3.7758858335499568, -1.1515347521701977]
[+] Iter : 18700 / Theta : [2.289251451516964e+00, -3.7758866205971424, -1.1515350408996097]
[+] Iter : 18800 / Theta : [2.289252020544436e+00, -3.775887361812939, -1.1515353128157242]
[+] Iter : 18900 / Theta : [2.289252556436255e+00, -3.7758880598662348, -1.151535568897621]
[+] Iter : 19000 / Theta : [2.2892530611220048e+00, -3.775888717270491, 

[+] Iter : 26900 / Theta : [2.2892611519388537e+00, -3.775899256381593, -1.1515396763577082]
[+] Iter : 27000 / Theta : [2.289261156093673e+00, -3.775899261793669, -1.1515396783431375]
[+] Iter : 27100 / Theta : [2.2892611600065504e+00, -3.7758992668905926, -1.1515396802129525]
[+] Iter : 27200 / Theta : [2.2892611636915765e+00, -3.7758992716907156, -1.151539681973886]
[+] Iter : 27300 / Theta : [2.2892611671620187e+00, -3.775899276211318, -1.151539683632278]
[+] Iter : 27400 / Theta : [2.2892611704303714e+00, -3.775899280468681, -1.1515396851940995]
[+] Iter : 27500 / Theta : [2.2892611735084047e+00, -3.7758992844781307, -1.1515396866649732]
[+] Iter : 27600 / Theta : [2.2892611764071975e+00, -3.7758992882541045, -1.1515396880501956]
[+] Iter : 27700 / Theta : [2.289261179137189e+00, -3.775899291810199, -1.1515396893547551]
[+] Iter : 27800 / Theta : [2.2892611817082105e+00, -3.775899295159215, -1.1515396905833482]
[+] Iter : 27900 / Theta : [2.289261184129516e+00, -3.775899298313216,

[+] Iter : 35800 / Theta : [2.2892612229466556e+00, -3.775899348876498, -1.15153971028963]
[+] Iter : 35900 / Theta : [2.289261222966589e+00, -3.7758993489024637, -1.1515397102991554]
[+] Iter : 36000 / Theta : [2.2892612229853615e+00, -3.7758993489269153, -1.1515397103081262]
[+] Iter : 36100 / Theta : [2.2892612230030407e+00, -3.775899348949945, -1.1515397103165748]
[+] Iter : 36200 / Theta : [2.2892612230196914e+00, -3.7758993489716337, -1.1515397103245313]
[+] Iter : 36300 / Theta : [2.289261223035372e+00, -3.7758993489920596, -1.1515397103320248]
[+] Iter : 36400 / Theta : [2.2892612230501395e+00, -3.7758993490112958, -1.1515397103390819]
[+] Iter : 36500 / Theta : [2.289261223064047e+00, -3.775899349029411, -1.1515397103457279]
[+] Iter : 36600 / Theta : [2.2892612230771445e+00, -3.775899349046472, -1.1515397103519869]
[+] Iter : 36700 / Theta : [2.289261223089479e+00, -3.7758993490625397, -1.151539710357881]
[+] Iter : 36800 / Theta : [2.2892612231010956e+00, -3.775899349077673,

[+] Iter : 44700 / Theta : [2.2892612232872755e+00, -3.7758993493202135, -1.1515397104524012]
[+] Iter : 44800 / Theta : [2.2892612232873644e+00, -3.7758993493203468, -1.1515397104524456]
[+] Iter : 44900 / Theta : [2.289261223287453e+00, -3.7758993493204693, -1.15153971045249]
[+] Iter : 45000 / Theta : [2.289261223287542e+00, -3.775899349320558, -1.1515397104525344]
[+] Iter : 45100 / Theta : [2.289261223287631e+00, -3.775899349320647, -1.1515397104525789]
[+] Iter : 45200 / Theta : [2.2892612232877196e+00, -3.775899349320736, -1.1515397104526206]
[+] Iter : 45300 / Theta : [2.2892612232877867e+00, -3.7758993493208246, -1.1515397104526477]
[+] Iter : 45400 / Theta : [2.289261223287838e+00, -3.7758993493209134, -1.15153971045267]
[+] Iter : 45500 / Theta : [2.2892612232878893e+00, -3.7758993493210022, -1.151539710452692]
[+] Iter : 45600 / Theta : [2.289261223287941e+00, -3.775899349321091, -1.1515397104527143]
[+] Iter : 45700 / Theta : [2.289261223287992e+00, -3.77589934932118, -1.1

[2.289261223288459, -3.775899349321668, -1.1515397104529768]

In [57]:
# 2. Skip training and use the pretrained theta values from my training result.
result = [
    2.289261223288459,
    -3.775899349321668,
    -1.1515397104529768,
]

In [62]:
print("[+] Train data")
# Cost of the trained model on the train data.
cost = calculate_cost(x_in=data_x, y_in=data_y, theta=result)
print(f"[+] Cost : {cost!s}")
# Accuracy of the trained model on the train data.
# calculate_accr prints its own "correct/total" line before returning.
accr = calculate_accr(x_in=data_x, y_in=data_y, theta=result)
print(f"[+] Accuracy : {accr!s}")

[+] Train data
[+] Cost : 1.0636787925061122e-01
[+] 1437/1500
[+] Accuracy : 0.958


In [63]:
# Task 2 of homework 2.
# Load hw2_test.csv so that cost and accuracy can be validated on the test data.
# NOTE: this rebinds data_x and data_y, which held the training data until now.
df_test = pd.read_csv("hw2_test.csv")
data_x = df_test.loc[:, ["x1", "x2"]]  # Feature columns x1 and x2.
data_y = df_test.loc[:, "y"]  # Label column y.

In [64]:
print("[+] Test data")
# Cost of the trained model on the test data.
cost = calculate_cost(x_in=data_x, y_in=data_y, theta=result)
print(f"[+] Cost : {cost!s}")
# Accuracy of the trained model on the test data.
# calculate_accr prints its own "correct/total" line before returning.
accr = calculate_accr(x_in=data_x, y_in=data_y, theta=result)
print(f"[+] Accuracy : {accr!s}")

[+] Test data
[+] Cost : 8.807312045946367e-02
[+] 483/500
[+] Accuracy : 0.966
