In [21]:
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
np.random.seed(42)

### Squared L2 Norm Function:

$\sqrt{\Sigma w_i^2} ^ 2$

In [23]:
def logistic(z):
    return 1/(1 + np.exp(-z))


def log_prob(z, y_i):
    '''
    Returns the log_prob for one point
    '''
    fz = logistic(z)
    return y_i * np.log(fz) + (1 - y_i) * np.log(1 - fz)


def neg_log_likelihood(X, w, y):
    '''Compute the negative log likelihood'''
    L = 0
    for _x,_y in zip(X, y):
        z = w.dot(_x)
        L += log_prob(z=z, y_i=_y)
    return -1 * L


def fast_logistic(X, w):
    '''Compute the logistic function over many data points'''
    return 1/(1 + np.exp(-1 * X.dot(w)))


def grad(_X, w, _y, lambda_=.5):
    '''
    Return the gradient
    
    - https://web.stanford.edu/~jurafsky/slp3/5.pdf
    '''
    grad = np.zeros_like(w)
    
    N,D= _X.shape
    
    b = _X * (fast_logistic(_X, w) - _y).reshape((N, 1))

    return np.sum(b, axis=0) + (lambda_ * 2 * w)


def squared_l2_norm(w):
    '''
    Return the L2 norm of the weights, squared. 
    
    Remember that we square the norm of the weights,
    to make the math easier when computing the gradients
    
    $\sqrt{\Sigma w_i^2} ^ 2
    '''
    norm = 0
    for weight in w:
        norm += weight** 2
    l2 = math.sqrt(norm)
    squared_l2 = l2 ** 2
    return squared_l2

#tolerance is any small number close to 0 that you specify (hyperparam)
def grad_descent(_X, _y, eta = .0001, lambda_ = 0, tolerance=1e-4, verbose=True, batch_size=None, iters=None):
    '''
    Perform gradient descent
    '''
    w = np.random.uniform(low=-5, high=2, size=dim_)
    
    losses = []
    for i in range(1000):
        if i > iters and iters is not None:
            break
        #this basically is the L(w) + lambda*squared_L2_norm
        #This is our regularization that penalizes big weights
        this_ll = neg_log_likelihood(_X, w, _y)
        loss = this_ll + lambda_ * squared_l2_norm(w)
        losses.append(loss)
        if verbose:
            print("iter: {}, loss: {}, accuracy: {}".format(i, loss, accuracy(_X, w, _y)))
        
        #the tolerance is our threshold as we approach 0
        #this code is checking how far away you are from 0
        #if you are close enough, we will call it a day (to not waste time getting to exactly 0)
        if (squared_l2_norm(grad(_X, w, _y, lambda_=lambda_))) < tolerance:
            break
        
        #This is to help us be more efficient computing the gradient
        #Stochastic gradient descent: getting gradients randomly to get to the minimum
        #Batch_size --> if it's none, we will compute the whole gradient
        #Otherwise, we will select random instances and will compute the gradient based on
        #that chosen batch
        #small batch size will make our loss more volatile, but generally go down
        #if the batch size is really big, it will really smoothly decrease
        #(bc its a big sample of the gradient), so less noisy
        #but it will take a lot longer to compute!
        if batch_size is None:
            w -= eta * grad(_X, w, _y, lambda_=lambda_)
        else:
            _N,F = _X.shape
            idx = np.random.randint(_N, size=batch_size)
            w -= eta * grad(_X[idx], w, _y[idx], lambda_=lambda_)/batch_size
        
    return w, losses

def prediction(X, w, threshold=.5):
    '''
    - Return a Boolean array of length N.
    - The array should be True if the weights dotted with the features for a given instance is greater than .5
    '''
    N, D = X.shape
    return X.dot(w) > threshold

def accuracy(X, w, y):
    '''
    Return a value between 0 and 1, showing the fraction of data points which have been classified correctly
    '''
    return np.mean(prediction(X, w) == y)

def init_data(N, dim_):
    '''
    Initialize data. Note how we generate y below. We know how the data is generated.
    '''
    #Generating random data, and random labels according to some random process
    w = np.random.uniform(low=-1, high=1, size=dim_)
    X = (np.random.rand(dim_ * N) > .5).astype(int)
    X = X.reshape(N, dim_)

    z_ = X.dot(w) + np.random.uniform(low=-1, high=1, size=X.dot(w).size)

    y =  1/(1 + np.exp(-1 * z_)) > .5
    
    return X, y

np.random.seed(42)

N = 10000 #Number of instances in our dataset
dim_ = 10 #Number of dimensions (features)

#Instantiating the weight vector as a uniform distribution
#low sample is -5, high sample is 2, and the number of items is our dim #
#all random numbers
w = np.random.uniform(low=-5, high=2, size=dim_)

#X is N points with dim_ features
#y is N points with 1 label
X, y = init_data(N, dim_)

# splitting the data in half
split = int(N/2)

X_train = X[0:split]
X_test = X[split:]
y_train = y[0:split]
y_test = y[split:]

#regularization penalty, we multiply the norm of the gradient
#by this number to penalize larger weights
lambda_ = .1

w, losses = grad_descent(X_train, y_train,
                        eta=1, tolerance=.0001,
                        iters=100, verbose=True,
                        lambda_=lambda_, batch_size=10)


iter: 0, loss: 5793.811507862259, accuracy: 0.8196
iter: 1, loss: 5531.492067116258, accuracy: 0.8196
iter: 2, loss: 5217.3737400898735, accuracy: 0.8194
iter: 3, loss: 4603.570963825092, accuracy: 0.8212
iter: 4, loss: 4229.555574459421, accuracy: 0.8192
iter: 5, loss: 3953.6292325296454, accuracy: 0.8212
iter: 6, loss: 3959.666941502819, accuracy: 0.823
iter: 7, loss: 3713.612427443597, accuracy: 0.8218
iter: 8, loss: 3172.2861609198026, accuracy: 0.814
iter: 9, loss: 3034.5649577972554, accuracy: 0.8072
iter: 10, loss: 2831.6970514607424, accuracy: 0.8054
iter: 11, loss: 2628.1126175321942, accuracy: 0.8064
iter: 12, loss: 2504.702827297553, accuracy: 0.8114
iter: 13, loss: 2435.3492857116044, accuracy: 0.813
iter: 14, loss: 2388.010077070792, accuracy: 0.8124
iter: 15, loss: 2283.8172159756323, accuracy: 0.8166
iter: 16, loss: 2239.3687537582864, accuracy: 0.8178
iter: 17, loss: 2238.32187272549, accuracy: 0.8272
iter: 18, loss: 2131.870205159041, accuracy: 0.831
iter: 19, loss: 21

In [27]:
#make batch size really small (1 instead of 10)
#verbose is False (why?) oh, it doesn't print out the solution I see
w, losses_small = grad_descent(X_train, y_train,
                        eta=1, tolerance=.0001,
                        iters=100, verbose=False,
                        lambda_=lambda_, batch_size=1)


In [34]:
#making a dataframe for the iteration and loss at that iteration
index = [i for i in range(len(losses))]
kind = ["1" for i in range(len(losses))]
kind = kind + ["10" for i in range(len(losses))]
df = pd.DataFrame({"iter": index + index,
                   "batch_size":kind,
                   "loss":losses_small + losses})

df.head()

Unnamed: 0,iter,batch_size,loss
0,0,1,8995.631731
1,1,1,7219.30803
2,2,1,3317.651849
3,3,1,3736.098317
4,4,1,3181.592848


In [36]:
import altair as alt

#charting loss by iteration
#When the batch size is small, the loss is bouncing around a lot
    #this is because the gradient for any one point is kind of random
#When the batch size is large, it is more stable
    #And lower loss achieved more broadly
alt.Chart(df).mark_line().encode(
    x='iter',
    y='loss',
    color="batch_size"
)

In [6]:
#Taking a sneak peak
import numpy as np
np.random.seed(42)

N = 10000 #Number of instances in our dataset
dim_ = 10 #Number of dimensions (features)

#Instantiating the weight vector as a uniform distribution
#low sample is -5, high sample is 2, and the number of items is our dim #
#all random numbers
w = np.random.uniform(low=-5, high=2, size=dim_)

X, y = init_data(N, dim_)


In [9]:
#X is 10k samples with 10 features
X.shape

(10000, 10)

In [10]:
#We would expect the shape of y to be 10k (1 label for each instance)
y.shape

(10000,)

In [11]:
#We would expect the weight vector to be 10 (1 weight for each feature)
w.shape

(10,)

In [14]:
#Since z in the dot product of X and w, its shape will be the number
#of instances (this is our score vector), one score per instance
z = X.dot(w)
z.shape

(10000,)

In [20]:
X[0:3].dot(w) #since this is 3 instances, it gives us 3 scalars

array([-8.47247171, -7.98054957, -8.84796539])

In [17]:
#a dot product on a single row of x
#the dot product of the weights and one instance gives us a scalar
x_0 = [1,0,0,1]
w = [.2,.1,.3,-3]
dot = 1 * .2- + 0 * .1 + 0 * -3 + 1 * -3
dot

-2.8

### Questions: normalization
- Complete the L2 norm function

- What does the variable `lambda` do in the code above? 

- What happens if you set `lambda` to a huge number? What happens if you set `lambda` to a small number?  What should you see in terms of accuracy and the norm of the weights? Try systematically varying lambda

### Questions: Stochastic gradient descent
- Print the loss and vary the batch size:
    - How do you think that varying eta will vary the amount of noise in the loss?
    - How do you think that varying batch size will vary the amount of noise in the loss?
    
- Test your answers to the previous two questions by making a plot. Your plot should show the loss each iteration, for different batch sizes. You should try batch sizes of 1, 10 and 100. What do you observe in your plot?