In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
def clean(X):
    """Replace missing-value sentinels with the column mean.

    Every entry whose absolute value equals 999 is treated as missing.
    Column statistics are computed while ignoring the missing entries, and
    each missing entry (including any NaN already present in ``X``) is then
    filled with the mean of its column.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Raw feature matrix. It is not modified.

    Returns
    -------
    x : ndarray
        Float copy of ``X`` with the missing entries imputed.
    mean_x : ndarray
        Per-column mean computed without the missing entries.
    std_x : ndarray
        Per-column standard deviation computed without the missing entries.
    """
    # Always work on a float copy: NaN cannot be stored in an integer array.
    x = np.array(X, dtype=float)
    x[np.abs(x) == 999] = np.nan
    mean_x = np.nanmean(x, axis=0)
    std_x = np.nanstd(x, axis=0)
    # Vectorised imputation: mean_x broadcasts along the rows.
    x = np.where(np.isnan(x), mean_x, x)
    return x, mean_x, std_x

In [3]:
def standardize(x, mean_x, std_x):
    """Centre every column on ``mean_x`` and scale it by ``std_x``.

    ``mean_x`` and ``std_x`` are 1-D arrays holding one value per column of
    ``x``. A new array is returned; ``x`` itself is left untouched.
    """
    centred = np.asarray(x) - mean_x[np.newaxis, :]
    return centred / std_x[np.newaxis, :]

In [4]:
def change_y(y):
    """Recode class labels from {-1, 1} to {0, 1}.

    Every entry equal to -1 becomes 0; all other entries are kept as they
    are. The result is a new array, so the caller's ``y`` is not modified.
    """
    recoded = np.array(y, copy=True)
    recoded[recoded == -1.0] = 0
    return recoded

In [5]:
# NOTE(review): wildcard import -- load_csv_data, predict_labels and
# create_csv_submission used in this notebook presumably come from here; confirm.
from proj1_helpers import *
# Location of the training CSV, relative to the notebook.
DATA_TRAIN_PATH = '../data/train.csv'
# Presumably returns (class labels, feature matrix, event ids) -- confirm in proj1_helpers.
y, X, ids = load_csv_data(DATA_TRAIN_PATH)

In [6]:
# Impute the +/-999 sentinels with column means; keep the column mean/std for standardisation.
x, mean_x, std_x = clean(X)

In [7]:
# Centre and scale every feature column with the statistics returned by clean().
standarized_x = standardize(x, mean_x, std_x)

In [8]:
# Recode the -1 labels as 0, giving the {0, 1} targets used by the logistic loss below.
yf = change_y(y)

In [51]:
def sample_data(yf, standarized_x, seed, size_samples):
    """Draw a reproducible random subset of the rows.

    NumPy's global random generator is seeded with ``seed``, a permutation of
    the row indices is drawn, and the first ``size_samples`` permuted rows are
    returned. Only the selected rows are copied, not the whole data set.

    Parameters
    ----------
    yf : array-like, shape (n_samples,)
        Labels; returned as a column vector.
    standarized_x : array-like, shape (n_samples, n_features)
        Feature matrix aligned with ``yf``.
    seed : int
        Seed passed to ``np.random.seed`` (this reseeds the global generator).
    size_samples : int
        Number of rows to keep.

    Returns
    -------
    y : ndarray, shape (size_samples, 1)
    tX : ndarray, shape (size_samples, n_features)
        Fresh arrays that do not share memory with the inputs.
    """
    labels = np.expand_dims(np.asarray(yf), axis=1)
    features = np.asarray(standarized_x)

    # Seeding the *global* generator is deliberate: later calls that rely on
    # np.random stay reproducible after this function has run.
    np.random.seed(seed)
    picked = np.random.permutation(labels.shape[0])[:size_samples]
    # Fancy indexing copies just the selected rows.
    return labels[picked], features[picked]


In [54]:
def de_standardize(x, mean_x, std_x):
    """Undo a standardization: rescale by ``std_x``, then shift by ``mean_x``."""
    return x * std_x + mean_x


## Do your crazy machine learning thing here :) ...

In [None]:
#====================================================Least-Squares-SGD==========================================================

In [103]:
def calculate_mse(e):
    """Return half of the mean of the squared entries of the error ``e``."""
    squared_errors = e ** 2
    return np.mean(squared_errors) / 2

In [104]:
def compute_mae_loss(y, tx, w):
    """Loss of the linear model ``tx.dot(w)`` measured against ``y``.

    Despite the name, this is not a mean *absolute* error: the residual is
    passed to ``calculate_mse`` and that function's result is returned.
    """
    residual = y - tx.dot(w)
    loss = calculate_mse(residual)
    return loss

In [105]:
def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    """Yield up to ``num_batches`` mini-batches of ``(y, tx)``.

    Parameters
    ----------
    y : ndarray
        Targets, indexed along the first axis.
    tx : ndarray
        Features, aligned with ``y`` along the first axis.
    batch_size : int
        Maximum number of rows per batch; the last batch may be shorter.
    num_batches : int, optional
        Upper bound on the number of batches produced.
    shuffle : bool, optional
        When true, rows are permuted (with NumPy's global random generator)
        before the batches are cut.

    Yields
    ------
    (y_batch, tx_batch) : tuple of ndarray
        Consecutive, non-empty slices of the (possibly shuffled) data.
    """
    data_size = len(y)

    if shuffle:
        order = np.random.permutation(np.arange(data_size))
        y, tx = y[order], tx[order]

    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min(start_index + batch_size, data_size)
        # Strict '<' (not '!='): once start_index runs past the data, the
        # slice is empty and must not be yielded.
        if start_index < end_index:
            yield y[start_index:end_index], tx[start_index:end_index]
            


In [106]:
def compute_stoch_gradient(y, tx, w):
    """Gradient of the half mean-squared error on the given batch.

    Returns the pair ``(gradient, residual)`` where ``residual`` is
    ``y - tx.dot(w)`` and the gradient is ``-tx.T.dot(residual)`` divided by
    the number of rows in the batch.
    """
    residual = y - tx.dot(w)
    projected = tx.T.dot(residual)
    return -projected / len(residual), residual

In [113]:
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Fit a linear model by stochastic gradient descent on single samples.

    Each iteration draws one shuffled sample through ``batch_iter`` and moves
    the weights against the gradient returned by ``compute_stoch_gradient``.

    Parameters
    ----------
    y : ndarray
        Targets.
    tx : ndarray
        Feature matrix aligned with ``y``.
    initial_w : ndarray
        Starting weights; never modified in place.
    max_iters : int
        Number of SGD steps. With 0 the initial weights are returned.
    gamma : float
        Step size.

    Returns
    -------
    (w, loss) : tuple
        Final weights and the value of ``compute_mae_loss`` on the full data
        at those weights.
    """
    w = initial_w
    for n_iter in range(max_iters):
        for y_batch, tx_batch in batch_iter(y, tx, batch_size=1, num_batches=1):
            grad, _ = compute_stoch_gradient(y_batch, tx_batch, w)
            w = w - gamma * grad
    # Only the final loss is returned, so evaluate it once after the loop.
    # This also keeps `loss` defined when no update was performed.
    loss = compute_mae_loss(y, tx, w)
    return (w, loss)

In [114]:
# Draw a 100-row subsample with seed 23.
# NOTE(review): this rebinds `y`, which until now held the label vector returned by load_csv_data.
y, tX = sample_data(yf, standarized_x, 23, 100)
# 1000 SGD steps with step size 0.001, starting from all weights equal to 0.001.
weights, loss = least_squares_SGD(y, tX, 0.001* np.ones((tX.shape[1],1)), 1000, 0.001)

SGD(0/999): loss=0.17450733867101603, w=[[0.0010092 ]
 [0.00089424]
 [0.00096883]
 [0.00108569]
 [0.00110334]
 [0.00114076]
 [0.00089608]
 [0.00097431]
 [0.00100225]
 [0.00102694]
 [0.00095006]
 [0.0012548 ]
 [0.00108662]
 [0.00106432]
 [0.00095368]
 [0.00102163]
 [0.00098757]
 [0.00087568]
 [0.00097404]
 [0.00100937]
 [0.00105335]
 [0.00102936]
 [0.00106155]
 [0.00099316]
 [0.00093433]
 [0.00088197]
 [0.00095906]
 [0.00100835]
 [0.00103027]
 [0.00101989]], 
SGD(1/999): loss=0.17431929708585148, w=[[0.00101832]
 [0.00078875]
 [0.00093762]
 [0.00117101]
 [0.00120646]
 [0.00128105]
 [0.00079242]
 [0.00094872]
 [0.00100437]
 [0.00105349]
 [0.00090019]
 [0.0015091 ]
 [0.00117307]
 [0.00112847]
 [0.00090751]
 [0.00104327]
 [0.00097508]
 [0.00075152]
 [0.00094809]
 [0.00101861]
 [0.00110658]
 [0.00105834]
 [0.00112269]
 [0.00098618]
 [0.00086876]
 [0.00076411]
 [0.00091802]
 [0.00101667]
 [0.00106049]
 [0.00103938]], 
SGD(2/999): loss=0.17413213834046534, w=[[0.00102736]
 [0.00068354]
 [0.00

SGD(150/999): loss=0.15357422257675504, w=[[ 0.00194329]
 [-0.01233186]
 [-0.0038056 ]
 [ 0.01083083]
 [ 0.01436188]
 [ 0.01758   ]
 [-0.012093  ]
 [-0.00197523]
 [ 0.00035968]
 [ 0.00208161]
 [-0.00575703]
 [ 0.03471859]
 [ 0.01243525]
 [ 0.00941374]
 [-0.00444677]
 [ 0.00431086]
 [-0.00108242]
 [-0.01611902]
 [-0.00284072]
 [ 0.00142543]
 [ 0.00779254]
 [ 0.00244762]
 [ 0.00670462]
 [-0.00103102]
 [-0.00788541]
 [-0.01512119]
 [-0.00564081]
 [ 0.00202783]
 [ 0.00509464]
 [ 0.00082172]], 
SGD(151/999): loss=0.15347108695882766, w=[[ 0.00194846]
 [-0.01240496]
 [-0.00383637]
 [ 0.01088053]
 [ 0.01443681]
 [ 0.01766297]
 [-0.01216453]
 [-0.00199045]
 [ 0.00035129]
 [ 0.00207557]
 [-0.00579623]
 [ 0.03491497]
 [ 0.01250139]
 [ 0.00946321]
 [-0.0044737 ]
 [ 0.00433292]
 [-0.00109519]
 [-0.01622182]
 [-0.00286555]
 [ 0.00142415]
 [ 0.00782982]
 [ 0.00244378]
 [ 0.00672414]
 [-0.00104772]
 [-0.007938  ]
 [-0.01521798]
 [-0.00568471]
 [ 0.00203307]
 [ 0.00511928]
 [ 0.00080616]], 
SGD(152/99

 [-0.00231782]], 
SGD(297/999): loss=0.14153836951513205, w=[[ 0.00278524]
 [-0.02146575]
 [-0.00800759]
 [ 0.0168785 ]
 [ 0.02383745]
 [ 0.02697319]
 [-0.02096588]
 [-0.00386755]
 [-0.00107298]
 [ 0.00043373]
 [-0.01075525]
 [ 0.0607472 ]
 [ 0.02111595]
 [ 0.01617231]
 [-0.00739494]
 [ 0.00752386]
 [-0.00252976]
 [-0.0299014 ]
 [-0.00638   ]
 [ 0.00106575]
 [ 0.01236915]
 [ 0.00106203]
 [ 0.00802073]
 [-0.00341309]
 [-0.01488068]
 [-0.02824066]
 [-0.01161618]
 [ 0.00259165]
 [ 0.00849605]
 [-0.00234317]], 
SGD(298/999): loss=0.14147389916614858, w=[[ 0.00279174]
 [-0.02151815]
 [-0.00803393]
 [ 0.01691279]
 [ 0.02389242]
 [ 0.02702036]
 [-0.02101626]
 [-0.00387864]
 [-0.00108342]
 [ 0.0004193 ]
 [-0.01078425]
 [ 0.06090697]
 [ 0.02116866]
 [ 0.01621535]
 [-0.0074088 ]
 [ 0.00754537]
 [-0.00253641]
 [-0.02998656]
 [-0.00640323]
 [ 0.00106284]
 [ 0.0123946 ]
 [ 0.00104893]
 [ 0.00802112]
 [-0.00342792]
 [-0.01492362]
 [-0.02832302]
 [-0.01165284]
 [ 0.00259416]
 [ 0.00851793]
 [-0.00236

SGD(451/999): loss=0.13346156214286783, w=[[ 0.00390066]
 [-0.02833983]
 [-0.01176191]
 [ 0.02143938]
 [ 0.03110862]
 [ 0.03224966]
 [-0.02749085]
 [-0.00542439]
 [-0.00267085]
 [-0.00192591]
 [-0.01453074]
 [ 0.08315123]
 [ 0.02844358]
 [ 0.02243992]
 [-0.00875015]
 [ 0.01074196]
 [-0.00307545]
 [-0.04182166]
 [-0.00980037]
 [ 0.00067219]
 [ 0.01553783]
 [-0.00116241]
 [ 0.00718788]
 [-0.00534384]
 [-0.02087037]
 [-0.0400222 ]
 [-0.01654228]
 [ 0.00279877]
 [ 0.01172678]
 [-0.00643901]], 
SGD(452/999): loss=0.1334191764895517, w=[[ 0.00390853]
 [-0.02837755]
 [-0.01178458]
 [ 0.02146504]
 [ 0.03114887]
 [ 0.03227265]
 [-0.02752607]
 [-0.00543377]
 [-0.00268097]
 [-0.00194147]
 [-0.01455109]
 [ 0.08328364]
 [ 0.02848656]
 [ 0.02247843]
 [-0.00875439]
 [ 0.01076214]
 [-0.0030761 ]
 [-0.04189177]
 [-0.00982151]
 [ 0.0006702 ]
 [ 0.01555392]
 [-0.00117754]
 [ 0.00717765]
 [-0.00535392]
 [-0.02090549]
 [-0.04009329]
 [-0.01656955]
 [ 0.00279904]
 [ 0.01174697]
 [-0.00646604]], 
SGD(453/999

SGD(602/999): loss=0.12808648706485198, w=[[ 0.00513761]
 [-0.03324566]
 [-0.01500799]
 [ 0.02488641]
 [ 0.03638854]
 [ 0.03449496]
 [-0.03199488]
 [-0.00678755]
 [-0.00413236]
 [-0.00420723]
 [-0.01709858]
 [ 0.10156167]
 [ 0.03440239]
 [ 0.02796991]
 [-0.00886345]
 [ 0.01366348]
 [-0.00283869]
 [-0.05148468]
 [-0.01283114]
 [ 0.00047796]
 [ 0.01742567]
 [-0.0034346 ]
 [ 0.00515662]
 [-0.00650878]
 [-0.0257056 ]
 [-0.05008992]
 [-0.02001497]
 [ 0.00270965]
 [ 0.01468476]
 [-0.01044987]], 
SGD(603/999): loss=0.12805670568138783, w=[[ 0.00514599]
 [-0.03327344]
 [-0.01502852]
 [ 0.0249067 ]
 [ 0.03641875]
 [ 0.03450272]
 [-0.03201986]
 [-0.00679632]
 [-0.00414157]
 [-0.00422171]
 [-0.01711251]
 [ 0.10167383]
 [ 0.03443867]
 [ 0.0280047 ]
 [-0.00886104]
 [ 0.01368194]
 [-0.00283515]
 [-0.05154292]
 [-0.01285013]
 [ 0.00047741]
 [ 0.01743487]
 [-0.00344938]
 [ 0.00514043]
 [-0.0065142 ]
 [-0.02573471]
 [-0.05015246]
 [-0.02003386]
 [ 0.00270827]
 [ 0.0147038 ]
 [-0.01047572]], 
SGD(604/99

SGD(744/999): loss=0.1244177449531279, w=[[ 0.00632698]
 [-0.0366962 ]
 [-0.01783842]
 [ 0.0274984 ]
 [ 0.04018312]
 [ 0.0348946 ]
 [-0.03503683]
 [-0.00801661]
 [-0.005378  ]
 [-0.00616317]
 [-0.01874351]
 [ 0.11638578]
 [ 0.0392154 ]
 [ 0.03268906]
 [-0.00818765]
 [ 0.01616073]
 [-0.00214244]
 [-0.05909672]
 [-0.01538879]
 [ 0.00049207]
 [ 0.01837035]
 [-0.00547138]
 [ 0.0026011 ]
 [-0.00700821]
 [-0.0295094 ]
 [-0.05849861]
 [-0.02221744]
 [ 0.00242721]
 [ 0.01732754]
 [-0.01399466]], 
SGD(745/999): loss=0.12439537619231103, w=[[ 0.00633527]
 [-0.03671733]
 [-0.01785786]
 [ 0.02751504]
 [ 0.04020668]
 [ 0.03489306]
 [-0.03505502]
 [-0.00802518]
 [-0.00538634]
 [-0.00617621]
 [-0.01875294]
 [ 0.11648285]
 [ 0.03924711]
 [ 0.03272078]
 [-0.00818075]
 [ 0.01617742]
 [-0.00213634]
 [-0.05914594]
 [-0.01540584]
 [ 0.0004928 ]
 [ 0.01837464]
 [-0.00548525]
 [ 0.00258155]
 [-0.00700992]
 [-0.02953398]
 [-0.05855467]
 [-0.02222975]
 [ 0.00242466]
 [ 0.01734574]
 [-0.01401867]], 
SGD(746/999

SGD(891/999): loss=0.12152371133402536, w=[[ 0.00751954]
 [-0.03941818]
 [-0.02064774]
 [ 0.02971989]
 [ 0.0432684 ]
 [ 0.03418078]
 [-0.03732344]
 [-0.0092694 ]
 [-0.00654356]
 [-0.00796992]
 [-0.01986316]
 [ 0.12969619]
 [ 0.04360886]
 [ 0.03714431]
 [-0.00691604]
 [ 0.01847781]
 [-0.00111692]
 [-0.06575864]
 [-0.01775711]
 [ 0.00068296]
 [ 0.01870892]
 [-0.00743271]
 [-0.00043592]
 [-0.00702817]
 [-0.03283659]
 [-0.06632502]
 [-0.02360999]
 [ 0.00198156]
 [ 0.01994764]
 [-0.01737711]], 
SGD(892/999): loss=0.12150640212603163, w=[[ 0.00752743]
 [-0.03943432]
 [-0.02066657]
 [ 0.02973358]
 [ 0.04328707]
 [ 0.03417304]
 [-0.03733661]
 [-0.00927788]
 [-0.00655109]
 [-0.00798148]
 [-0.01986913]
 [ 0.12978056]
 [ 0.04363708]
 [ 0.03717326]
 [-0.0069058 ]
 [ 0.01849264]
 [-0.0011092 ]
 [-0.06580028]
 [-0.01777231]
 [ 0.00068479]
 [ 0.01870938]
 [-0.00744553]
 [-0.00045751]
 [-0.00702683]
 [-0.03285738]
 [-0.06637557]
 [-0.02361681]
 [ 0.00197809]
 [ 0.0199651 ]
 [-0.01739911]], 
SGD(893/99

In [None]:
#==================================================Logistic-Regression==========================================================

In [56]:
def sigmoid(t):
    """Logistic function ``1 / (1 + exp(-t))``, evaluated without overflow.

    Parameters
    ----------
    t : float or array-like
        Input value(s).

    Returns
    -------
    float or ndarray
        Element-wise sigmoid with the same shape as ``t`` (a scalar for
        scalar input).
    """
    t = np.asarray(t, dtype=float)
    # exp(-|t|) always lies in (0, 1], so it cannot overflow. The two
    # branches below are the same function written for each sign of t.
    decay = np.exp(-np.abs(t))
    result = np.where(t >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]


In [57]:
def compute_gradient(tx, y, w):
    """Gradient of the summed logistic loss with respect to ``w``.

    Computed as ``tx.T @ (sigmoid(tx @ w) - y)``. Note the argument order
    ``(tx, y, w)``.
    """
    probabilities = sigmoid(tx @ w)
    residual = probabilities - y
    return tx.T @ residual


In [58]:
def compute_sigmoid_loss(tx, y, w):
    """Summed negative log-likelihood of the logistic model.

    With ``z = tx @ w`` the per-sample loss
    ``-(y*log(sigmoid(z)) + (1-y)*log(1-sigmoid(z)))`` simplifies to
    ``log(1 + exp(z)) - y*z``. That form is evaluated here through
    ``np.logaddexp`` so the result stays finite even when the predicted
    probabilities saturate at 0 or 1.

    Parameters
    ----------
    tx : ndarray, shape (n_samples, n_features)
    y : ndarray
        Targets with the same shape as ``tx @ w``.
    w : ndarray
        Weights.

    Returns
    -------
    float
        Sum of the per-sample losses.
    """
    logits = tx @ w
    # logaddexp(0, z) == log(1 + exp(z)), computed without overflow.
    per_sample = np.logaddexp(0, logits) - y * logits
    return per_sample.sum()


In [69]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Perform a single gradient-descent update for logistic regression.

    Returns ``(loss, w_new)``: the loss is evaluated at the incoming ``w``
    (that is, before the update), and ``w_new`` is ``w`` moved by ``gamma``
    times the gradient in the descent direction.
    """
    loss_before_step = compute_sigmoid_loss(tx, y, w)
    step = gamma * compute_gradient(tx, y, w)
    return loss_before_step, w - step



In [85]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Fit a logistic-regression model with full-batch gradient descent.

    An intercept is added internally: a column of ones is prepended to ``tx``
    and a weight initialised to 1 is prepended to ``initial_w``. The returned
    weight vector therefore has one more row than ``initial_w``, with the
    intercept in row 0.

    Parameters
    ----------
    y : ndarray, shape (n_samples, 1)
        Targets.
    tx : ndarray, shape (n_samples, n_features)
        Features without an intercept column.
    initial_w : ndarray, shape (n_features, 1)
        Starting weights for the features; not modified.
    max_iters : int
        Maximum number of gradient steps.
    gamma : float
        Step size.

    Returns
    -------
    (w, loss) : tuple
        Final weights (intercept first) and the loss reported by the last
        call to ``learning_by_gradient_descent``, i.e. the loss before the
        final update. If no step was taken, the loss at the initial weights.
    """
    threshold = 1e-8  # stop once the loss changes by less than this between steps
    losses = []
    w = np.concatenate(([[1]], np.copy(initial_w)), axis=0)
    # Design matrix with a leading column of ones for the intercept.
    tX = np.c_[np.ones((y.shape[0], 1)), tx]

    for _ in range(max_iters):
        loss, w = learning_by_gradient_descent(y, tX, w, gamma)
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    if not losses:
        # max_iters <= 0: no step was taken, so report the loss at the start.
        loss = compute_sigmoid_loss(tX, y, w)
    return (w, loss)


In [88]:
# Draw a 100-row subsample with seed 23 (rebinds `y` and `tX`).
y, tX = sample_data(yf, standarized_x, 23, 100)
# Up to 1000 gradient steps with step size 0.001; logistic_regression prepends an
# intercept weight, so `weights` has one more row than tX has columns.
weights, loss = logistic_regression(y, tX, 0.001* np.ones((tX.shape[1],1)), 1000, 0.001)

34.47534944428531


## Generate predictions and save the output in CSV format for submission:

In [92]:
DATA_TEST_PATH = '../data/test.csv'  # location of the test CSV, relative to the notebook
_, tX_test_raw, ids_test = load_csv_data(DATA_TEST_PATH)

# Preprocess the test features with the *training* statistics (mean_x, std_x):
# the weights were fitted on features imputed and scaled with those values, so
# the test set must go through exactly the same transformation.
tX_test = np.array(tX_test_raw, dtype=float)
tX_test[np.abs(tX_test) == 999] = np.nan              # same missing-value sentinel as clean()
tX_test = np.where(np.isnan(tX_test), mean_x, tX_test)  # impute with the training column means
tX_test = standardize(tX_test, mean_x, std_x)

In [93]:
OUTPUT_PATH = '../data/sample-submission.csv'  # destination of the submission file
# logistic_regression() returns the intercept in weights[0]. Keep it, and give
# the test matrix the matching leading column of ones, instead of dropping the
# intercept at prediction time.
# NOTE(review): assumes predict_labels thresholds the product of the data and
# the weights -- confirm in proj1_helpers.
tX_test_with_intercept = np.c_[np.ones((tX_test.shape[0], 1)), tX_test]
y_pred = predict_labels(np.squeeze(weights), tX_test_with_intercept)
print(y_pred)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

[ 1. -1. -1. ...  1.  1. -1.]
