In [1]:
import numpy as np
import pickle
from __future__ import division 
import plotly.plotly as py
import plotly.graph_objs as go

In [2]:
def compute_cost(data, label, w):
    """
        Cost function to evaluate the linear regression model
        @Arguments: 
            data: The dataset in m x n format. Where m is total samples and n is total features per sample. 
            label: +1/-1 for the data point. 
            w: The current weights [size: n x 1]
        @returns: 
            cost: The performance of the model on the current dataset. 
    """
    m = len(label)
    h = np.vstack(np.array([np.sum(j*w) for j in data]))
    temp1 = np.power(np.vstack(np.array([h[i] - label[i] for i in xrange(0, len(label))])), 2)
    cost = (1/(2*m)) * np.sum(temp1)
    
    return cost

In [3]:
def gradient_descent(data, label, w, alpha, numiters):
    """
        Function to optimize cost for linear regression and update weights.
        @Arguments: 
            data: The dataset in m x n format. Where m is total samples and n is total features per sample. 
            label: +1/-1 for the data point. 
            w: The current weights [size: n x 1]
            alpha: The learning rate for taking steps in gradient descent.
            numiters: The number of iterations
        @returns: 
            cost_history: The values of cost function over the iterations
            w: The updated weight
    """
    m = len(label)
    cost_history = []
    numiters = 0
    while True: 
        h = data.dot(w)
        diff_hy = np.subtract(h, np.vstack(label))
        data_transpose = np.array(data).transpose()
        sigma = data_transpose.dot(diff_hy) 
        temp = np.array(w) - ((alpha/m)*(sigma))
        
        w = np.vstack(np.array(temp))
        new_cost = compute_cost(data, label, w)
        cost_history.append(new_cost)
        neg_example_mistakes, pos_example_mistakes = eval(data, label, w)
        numerrors = len(neg_example_mistakes) + len(pos_example_mistakes)
        numiters += 1
        if new_cost < 0.5: 
            plot_linear_reg(data, w, neg_example_mistakes, pos_example_mistakes, numiters)
            break
    
    return cost_history, w, numiters

In [4]:
def eval(data, label, w):
    pred = np.sign(data.dot(w))
    neg_example_mistakes = []
    pos_example_mistakes = []
    
    for i in xrange(0, data.shape[0]):
        if label[i] != pred[i] and np.int(label[i]) == 1:
            pos_example_mistakes.append(i)
        elif label[i] != pred[i] and np.int(label[i]) == -1:
            neg_example_mistakes.append(i)
    
    return neg_example_mistakes, pos_example_mistakes 

In [5]:
def plot_linear_reg(data, w, neg_example_mistakes, pos_example_mistakes, numiters):
    green_dots = []
    red_dots = []
    
    for i in xrange(0, data.shape[0]):
        if i in neg_example_mistakes: 
            red_dots.append(data[i])
        else: 
            green_dots.append(data[i])
    
    green_x = []
    green_y = []
    red_x = []
    red_y = []
    
    for i in green_dots:
        green_x.append(i[0])
        green_y.append(i[1])
        
    for i in red_dots:
        red_x.append(i[0])
        red_y.append(i[1])
        
    
    approx_curve_slope, approx_curve_bias  = -w[0]/w[2], -w[1]/w[2]
    
    approx_y = np.linspace(0,10)
        
    approx_x = (((approx_y - approx_curve_bias)/(approx_curve_slope))) + 2.5
                
    green_trace = go.Scatter(
        x = green_x,
        y = green_y,
        mode = 'markers', 
        name = 'Correctly classified points'
    )
    
    red_trace = go.Scatter(
        x = red_x,
        y = red_y,
        mode = 'markers', 
        name = 'Incorrectly classified points'
    )
    
    original_trace = go.Scatter(
        x = [5,5],
        y = [0,10],
        mode = 'lines',
        name = 'decision curve'
    )
    
    approx_trace = go.Scatter(
        x = approx_x, 
        y = approx_y,
        mode = 'lines', 
        name = 'Approximated curve'
    )
    

    data = [green_trace, red_trace, original_trace, approx_trace]
    layout = go.Layout(title='Evaluating Iteration: {}'.format(numiters), width=800, height=640)
    fig = go.Figure(data=data, layout=layout)

    py.image.save_as(fig, filename='linear{}.png'.format(numiters))

In [6]:
#Fetching the dataset from PLA. 
with open('dataset.pickle', 'r') as f: 
    dataset = pickle.load(f)

In [7]:
#Constructing the data and label. 
data = []
label = []

for i in dataset: 
    data.append(np.array(i))
    if i[0] <= 5:
        label.append(-1)
    else:
        label.append(1)

In [8]:
data_no_bias = np.vstack(np.array(data))
data = np.ones((data_no_bias.shape[0], data_no_bias.shape[1]+1))
data[:,:-1] = data_no_bias
label = np.array(label)

In [9]:
#Initializing weights. 

w = np.vstack(np.array([np.random.random() for i in xrange(data.shape[1])]))
alpha = 0.01
numiters = 35000

print "Initial Cost is: {}".format(compute_cost(data, label, w))


cost_history, w, iterations = gradient_descent(data, label, w, alpha, numiters)

print "Cost after training {} iterations: {}".format(iterations, compute_cost(data, label, w))
print "Final weights:"
print w

pred = data.dot(w)

for i in xrange(0, len(data)):
    print "Datapoint : {}, predicted value: {}".format(data[i], pred[i])



Initial Cost is: 183.90733096
