In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
def clean(X):
    """Replace sentinel and missing entries of ``X`` with their column means.

    Entries whose absolute value equals 999 are treated as missing. Column
    statistics are computed while ignoring missing entries, then every
    missing entry is replaced by the mean of its column.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). Not modified.

    Returns:
        x: floating-point copy of ``X`` with missing entries imputed.
        mean_x: per-column mean computed without the missing entries.
        std_x: per-column standard deviation computed without them.
    """
    x = np.array(X, copy=True)
    # NaN cannot be stored in an integer array, so promote non-float input.
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)

    x[np.abs(x) == 999] = np.nan
    mean_x = np.nanmean(x, axis=0)
    std_x = np.nanstd(x, axis=0)

    # Vectorized imputation: pick, for each NaN, the mean of its own column.
    missing = np.isnan(x)
    x[missing] = np.broadcast_to(mean_x, x.shape)[missing]

    return x, mean_x, std_x

In [3]:
def standardize(x, mean_x, std_x):
    """Center each column of ``x`` by ``mean_x`` and scale it by ``std_x``.

    ``mean_x`` and ``std_x`` are 1-D arrays with one entry per column; they
    are broadcast across the rows. The input ``x`` is left untouched.
    """
    centered = np.copy(x) - mean_x[np.newaxis, :]
    return centered / std_x[np.newaxis, :]

In [4]:
def change_y(y):
    """Return a copy of ``y`` in which every -1 label is replaced by 0.

    The remapping is done on a copy, so the caller's array keeps its
    original -1/+1 encoding and re-running the cell is harmless.

    Args:
        y: array-like of labels.

    Returns:
        A new numpy array with the same shape as ``y`` where entries equal
        to -1.0 have been set to 0; all other entries are unchanged.
    """
    remapped = np.array(y, copy=True)
    remapped[remapped == -1.0] = 0
    return remapped

In [5]:
# NOTE(review): the wildcard import hides which helpers come from
# proj1_helpers; this notebook calls load_csv_data, predict_labels and
# create_csv_submission without defining them, so they presumably come from here.
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
# Per the heading above: y = class labels, X = feature matrix, ids = event ids.
y, X, ids = load_csv_data(DATA_TRAIN_PATH)

In [6]:
# Impute the 999-magnitude sentinel entries and keep the per-column mean/std.
x, mean_x, std_x = clean(X)

In [7]:
# Center and scale every feature column with the statistics computed by clean().
standarized_x = standardize(x, mean_x, std_x)

In [8]:
# Map -1 labels to 0 so that they match the sigmoid's (0, 1) output range.
yf = change_y(y)

In [51]:
def sample_data(yf, standarized_x, seed, size_samples):
    """Draw a seeded random subset of the labels and features.

    The labels are turned into a column (an axis is inserted at position 1),
    both arrays are shuffled with the same permutation generated after
    seeding numpy's global RNG with ``seed``, and the first ``size_samples``
    rows of each are returned. The inputs themselves are not modified.
    """
    features = np.copy(standarized_x)
    labels = np.expand_dims(np.copy(yf), axis=1)

    # Seed the global generator so the same subset is drawn on every run.
    np.random.seed(seed)
    order = np.random.permutation(labels.shape[0])

    shuffled_labels = labels[order]
    shuffled_features = features[order]
    return shuffled_labels[:size_samples, :], shuffled_features[:size_samples, :]


In [54]:
def de_standardize(x, mean_x, std_x):
    """Undo standardization: rescale ``x`` by ``std_x``, then add ``mean_x``."""
    return x * std_x + mean_x


## Do your crazy machine learning thing here :) ...

In [55]:
'''
def calculate_mse(e):
    
    return 1/2*np.mean(e**2)

def compute_mae_loss(y, tx, w):
   
    e = y - tx.dot(w)
    return calculate_mse(e)
    
def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    """
    Generate a minibatch iterator for a dataset.
    Takes as input two iterables (here the output desired values 'y' and the input data 'tx')
    Outputs an iterator which gives mini-batches of `batch_size` matching elements from `y` and `tx`.
    Data can be randomly shuffled to avoid ordering in the original data messing with the randomness of the minibatches.
    Example of use :
    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
        <DO-SOMETHING>
    """
    data_size = len(y)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if start_index != end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]
            
def compute_stoch_gradient(y, tx, w):
    
    err = y - tx.dot(w)
    grad = -tx.T.dot(err) / len(err)
    return grad, err


def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    
    w = initial_w
    for n_iter in range(max_iters):
        for y_batch, tx_batch in batch_iter(y, tx, batch_size=1, num_batches=1):
            grad, _ = compute_stoch_gradient(y_batch, tx_batch, w)
            w = w - gamma * grad
            loss = compute_loss(y, tx, w)

        print("SGD({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
              bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    
    return (w, loss)

'''

'\ndef calculate_mse(e):\n    \n    return 1/2*np.mean(e**2)\n\ndef compute_mae_loss(y, tx, w):\n   \n    e = y - tx.dot(w)\n    return calculate_mse(e)\n    \ndef batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):\n    """\n    Generate a minibatch iterator for a dataset.\n    Takes as input two iterables (here the output desired values \'y\' and the input data \'tx\')\n    Outputs an iterator which gives mini-batches of `batch_size` matching elements from `y` and `tx`.\n    Data can be randomly shuffled to avoid ordering in the original data messing with the randomness of the minibatches.\n    Example of use :\n    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):\n        <DO-SOMETHING>\n    """\n    data_size = len(y)\n\n    if shuffle:\n        shuffle_indices = np.random.permutation(np.arange(data_size))\n        shuffled_y = y[shuffle_indices]\n        shuffled_tx = tx[shuffle_indices]\n    else:\n        shuffled_y = y\n        shuffled_tx = tx\n    for batch_num

In [56]:
def sigmoid(t):
    """Numerically stable logistic function ``1 / (1 + exp(-t))``.

    Evaluating ``exp(-t)`` directly overflows for large negative ``t``.
    Here the value is computed as ``exp(-log(1 + exp(-t)))`` using
    ``np.logaddexp``, which evaluates ``log(exp(0) + exp(-t))`` without
    overflow, so the result stays in [0, 1] for any finite input.

    Args:
        t: scalar or array-like of real values.

    Returns:
        The element-wise sigmoid of ``t``, with the same shape as ``t``.
    """
    return np.exp(-np.logaddexp(0, -np.asarray(t, dtype=float)))


In [57]:
def compute_gradient(tx, y, w):
    """Gradient of the summed logistic loss with respect to ``w``.

    Computed as ``tx.T @ (sigmoid(tx @ w) - y)``; it is not divided by the
    number of samples.
    """
    residual = sigmoid(tx @ w) - y
    return tx.T @ residual


In [58]:
def compute_sigmoid_loss(tx, y, w):
    """Summed negative log-likelihood of logistic regression.

    With logits ``z = tx @ w`` the per-sample loss
    ``-(y*log(s(z)) + (1-y)*log(1-s(z)))`` is algebraically equal to
    ``log(1 + exp(z)) - y*z``. The latter form is used because it never
    evaluates ``log(0)``: saturated predictions produce a large finite loss
    instead of ``inf``/``nan``.

    Args:
        tx: feature matrix of shape (n_samples, n_features).
        y: labels in {0, 1}, shaped to broadcast against ``tx @ w``.
        w: weight vector, shaped so that ``tx @ w`` is defined.

    Returns:
        The loss summed (not averaged) over all samples.
    """
    logits = tx @ w
    # np.logaddexp(0, z) == log(1 + exp(z)) computed without overflow.
    return (np.logaddexp(0, logits) - y * logits).sum()


In [69]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """Perform a single gradient-descent update for logistic regression.

    Returns a ``(loss, w)`` pair: the loss evaluated at the incoming
    weights (i.e. before the update) and the weights after moving by
    ``gamma`` times the gradient.
    """
    loss_before_step = compute_sigmoid_loss(tx, y, w)
    step = gamma * compute_gradient(tx, y, w)
    return loss_before_step, w - step



In [85]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Fit logistic regression with an intercept by full-batch gradient descent.

    A column of ones is prepended to ``tx`` and a bias weight initialised
    to 1 is prepended to ``initial_w``, so the returned weight vector has
    one more row than ``initial_w`` (row 0 is the intercept).

    Args:
        y: labels in {0, 1}; a 1-D vector is reshaped to a column so the
            residual ``sigmoid(tX @ w) - y`` does not broadcast to (n, n).
        tx: feature matrix of shape (n_samples, n_features), without bias.
        initial_w: initial feature weights of shape (n_features, 1).
        max_iters: maximum number of gradient steps.
        gamma: step size.

    Returns:
        ``(w, loss)`` where ``w`` has shape (n_features + 1, 1) and ``loss``
        is the summed loss reported by the last step taken, i.e. evaluated
        just before that step's update. If no step is taken, it is the
        loss at the initial weights.
    """
    # Stop once two consecutive losses differ by less than this amount.
    threshold = 1e-8

    y = np.asarray(y)
    if y.ndim == 1:
        y = y[:, np.newaxis]

    w = np.concatenate(([[1]], np.copy(initial_w)), axis=0)
    tX = np.c_[np.ones((y.shape[0], 1)), tx]

    loss = None
    previous_loss = None
    for _ in range(max_iters):
        loss, w = learning_by_gradient_descent(y, tX, w, gamma)
        if previous_loss is not None and np.abs(loss - previous_loss) < threshold:
            break
        previous_loss = loss

    if loss is None:
        # No iteration ran (max_iters <= 0): still return a well-defined loss.
        loss = compute_sigmoid_loss(tX, y, w)
    return (w, loss)


In [88]:
# Train on a reproducible subset: seed 23, first 100 rows of the shuffled data.
# NOTE(review): this rebinds `y` (previously the full label vector) to the
# sampled column of labels; cells above that use `y` need a fresh reload.
y, tX = sample_data(yf, standarized_x, 23, 100)
# Start from small uniform weights (0.001); at most 1000 steps with gamma = 0.001.
weights, loss = logistic_regression(y, tX, 0.001* np.ones((tX.shape[1],1)), 1000, 0.001)
print(loss)

34.47534944428531


## Generate predictions and save output in csv format for submission:

In [92]:
DATA_TEST_PATH = '../data/test.csv'  # path to the unlabeled test set
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
# Impute the sentinel entries; `mean` and `std` are the test-set statistics
# and are kept only for inspection.
tX_test, mean, std = clean(tX_test)
# Scale with the TRAINING mean/std: the weights were learned on features
# standardized with those statistics, so the test set must go through the
# exact same transformation rather than one derived from the test set itself.
tX_test = standardize(tX_test, mean_x, std_x)

In [93]:
OUTPUT_PATH = '../data/sample-submission.csv'  # destination of the submission file
# The model was trained with a leading column of ones, and weights[0] is the
# matching intercept. Prepend the same column to the test features so the
# intercept takes part in the prediction instead of being discarded.
tX_test_with_bias = np.c_[np.ones((tX_test.shape[0], 1)), tX_test]
# NOTE(review): assumes predict_labels thresholds the linear score
# data @ weights at 0 — confirm against proj1_helpers.
y_pred = predict_labels(np.squeeze(weights), tX_test_with_bias)
print(y_pred)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

[ 1. -1. -1. ...  1.  1. -1.]
