In [41]:
import csv
import numpy as np

"""Data loading"""
def load_results(path_dataset):
    """load data features."""
    to_int = dict(s = 1,b = 0)
    def convert(s):
        return to_int.get(s.decode("utf-8") , 0)
    
    data = data = np.genfromtxt(path_dataset, delimiter=",", skip_header=1, usecols=[1],
                                converters={1: convert})
    
    return data

def load_data_features(path_dataset):
    """load data features."""
    data = data = np.genfromtxt(path_dataset, delimiter=",", skip_header=1, 
                                usecols=tuple(range(2,32)))
    
    ids = np.genfromtxt(path_dataset, delimiter=",", skip_header=1, usecols=[0])
    
    return data, ids

In [285]:
def build_polynomial(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    Extended = np.empty((x.shape[0],0))
    
    for j in range(0, degree+1):
        for i in range(x.shape[1]):
            Extended = np.c_[Extended, x[:,i]**j]
          
    
    return Extended

In [3]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)

In [4]:
def correlation_filter(X, y):
    abs_corr = np.zeros(X.shape[1])
    for index, x in enumerate(X.T):
        abs_corr[index] = np.abs(np.corrcoef(y,x.T)[0,1])
    quality = np.where(abs_corr > 0.05)
    return X[:,quality[0]], quality[0]

In [5]:
def learning_by_penalized_gradient(y, tx, w, gamma, lambda_):
    """
    Do one step of gradient descent, using the penalized logistic regression.
    Return the loss and updated w.
    """
    loss = calculate_loss(y, tx, w) + lambda_ * np.squeeze(w.T.dot(w))
    gradient = calculate_gradient(y, tx, w) + 2 * lambda_ * w
    # ***************************************************
    w -= gradient*gamma
    return loss, w

In [138]:
def standardize_data(data, mean = None, sigma = None, min_value = None, max_value = None):
    """Standardizes the data and maps data to [0,1] range to prevent overflows"""
    if mean is None:
        mean = np.nanmean(data, axis = 0)
    
    if sigma is None:
        sigma = np.nanstd(data, axis = 0)
    
    output = (data - mean)/sigma
    
    if min_value is None:
        min_value = np.min(data, axis = 0)
    
    if max_value is None:
        max_value = np.max(data, axis = 0)
        
    output = (output - min_value)/(max_value - min_value)
    
    return output, mean, sigma, min_value, max_value

In [178]:
def create_subsets(data, y):
    """Creates four subsets based on the number of jets, which is 0, 1, 2 or 3"""
    data_subsets = []
    y_subsets=[]
    for i in range(4):
        mask = data[:,22] == i
        data_subsets.append(data[mask])
        y_subsets.append(y[mask])
        
    return data_subsets, y_subsets

In [209]:
def remove_zero_variance(data, mask = None):
    """removes zero variance columns based on the subset"""
    if mask is None:
        variance = np.var(data, axis = 0)
        mask = np.squeeze(~np.logical_or([variance ==0],[np.isnan(variance)]))
        
    return data[:, mask[:]], mask

In [275]:
def replace_or_remove_nan(data, delete = None):        
    """deletes columns with nan only and replaces nan by median value"""
    
    if delete is None:
        delete = []
        for j in range(data.shape[1]):
            if np.all(np.isnan(data[:,j])):
                delete.append(j)
                
    data = np.delete(data, delete, axis = 1)

    for j in range(data.shape[1]):
        replace = np.median(data[:,j])
        data[np.isnan(data[:,j]),j] = replace

    
    return data, delete

In [276]:
def process_data(X_train, X_test, y_train, y_test):
    """Processes the test and training data, splits with respect to jet number,
    removes zero variance, standardizes and normalizes the data and 
    replaces -999 by median value of column"""
    
    train_subsets, y_train = create_subsets(X_train, y_train)
    test_subsets, y_test = create_subsets(X_test, y_test)
    
    for i in range(4):
        train_subsets[i], mask = remove_zero_variance(train_subsets[i])
        train_subsets[i][train_subsets[i] == -999] = np.nan
        train_subsets[i], mean, sigma, min_value, max_value =  standardize_data(train_subsets[i],
                                                                               mean = None, sigma = None, min_value = None, max_value = None)
        
        train_subsets[i], delete = replace_or_remove_nan(train_subsets[i], delete = None)
        
        test_subsets[i], _ = remove_zero_variance(test_subsets[i], mask)
        test_subsets[i][test_subsets[i] == -999] =np.nan
        test_subsets[i], _, _, _, _ =  standardize_data(test_subsets[i],
                                                mean = mean, sigma = sigma, min_value = min_value, max_value = max_value)
        test_subsets[i], _ = replace_or_remove_nan(test_subsets[i], delete = delete)
        
        
    return train_subsets, test_subsets, y_train, y_test
        


In [324]:
def add_features(data, degree = None, sqrt = True, log = True):
    """Adds following features to data set:
    -sqrt of features
    -polynomial extension of 0 up to degree
    -log of features
    """ 
    output = np.empty((data.shape[0],0))
    # add sqrt
    if sqrt:
        output = np.c_[output, np.sqrt(np.abs(data))]
        
    #polynomial
    if degree is not None:
        output = np.c_[output, build_polynomial(data, degree)]
        
    #log
    if log:
        output = np.c_[output, np.log(np.abs(data))]
        
    return output

In [277]:
#%% load data and process data
path =  "data"
X, _ = load_data_features(path +"/train.csv")
y = load_results(path +"/train.csv")
X_test, y_test = X[:int(0.8*len(X)),:], y[:int(0.8*len(X))]
X_train, y_train = X[int(0.8*len(X)):,:], y[int(0.8*len(X)):]
X_train, X_test, y_train, y_test = process_data(X_train, X_test, y_train, y_test)

In [326]:
# logistic regression
ff = add_features(X_train[1], degree = 15, sqrt = True, log = True)
xx = add_features(X_test[1], degree = 15, sqrt = True, log = True)
w = np.zeros((ff.shape[1], 1))
threshold = 1e-8
losses = []
max_iter = 100000
lambda_ = 0.5
gamma = 0.00001
# start the logistic regression
for iter in range(max_iter):
     # get loss and update w.
    loss, w = learning_by_penalized_gradient(y_train[1], ff, w, gamma, lambda_)
        # log info
    if iter % 1000 == 0:
        print("Current iteration={i}, loss={l}".format(i=iter, l=loss))
    # converge criterion
    losses.append(loss)
    if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
        break

Current iteration=0, loss=10716.748558637313
Current iteration=1000, loss=568885.5464600606
Current iteration=2000, loss=636965.8299172868
Current iteration=3000, loss=151355.72751964434


KeyboardInterrupt: 

In [306]:
y = xx @ w
y[y<0.5] = 0
y[y>=0.5] = 1
sum(y)

array([738.])

In [325]:
ff = add_features(X_train[1], degree = 15, sqrt = True, log = True)

In [321]:
def sigmoid(t):
    """apply the sigmoid function on t."""
    return 1/(1+np.exp(-t))

def calculate_loss(y, tx, w):
    """compute the loss: negative log likelihood."""
    inter_y = y.reshape(len(y),1)
    z = tx @ w
    a = np.sum(np.log(1 + np.exp(z)))
    b = inter_y.T @ z
    loss = a - b
    return np.squeeze(loss)

def calculate_gradient(y, tx, w):
    """compute the gradient of loss."""
    inter_y = y.reshape(len(y),1)
    gradient = tx.T @ (sigmoid(tx @ w) - inter_y)
    return gradient

In [62]:
def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    """
    Generate a minibatch iterator for a dataset.
    Takes as input two iterables (here the output desired values 'y' and the input data 'tx')
    Outputs an iterator which gives mini-batches of `batch_size` matching elements from `y` and `tx`.
    Data can be randomly shuffled to avoid ordering in the original data messing with the randomness of the minibatches.
    Example of use :
    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
        <DO-SOMETHING>
    """
    data_size = len(y)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if start_index != end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]
            
def stochastic_gradient_descent(
        y, tx, initial_w, batch_size, max_iters, gamma, lambda_):
    """Stochastic gradient descent algorithm."""
    # ***************************************************
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        loss = calculate_loss(y, tx, w) + lambda_ * np.squeeze(w.T.dot(w))
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
            gradient = calculate_gradient(y, tx, w) + 2 * lambda_ * w
            # ***************************************************
            # update w by gradient
            w -= gamma * gradient   
            
        # store w and loss
        ws.append(w)
        losses.append(loss)
        if n_iter % 100 == 0:
            print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
                  bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
            
    # ***************************************************
    return losses, ws

In [64]:
w = np.zeros((X_new.shape[1], 1))
threshold = 1e-8
losses = []
max_iter = 300
lambda_ = 0.08
gamma = 0.00001
batch_size = 5000
max_iters = 1000

losses, ws = stochastic_gradient_descent(y, X_new, w, batch_size, max_iter, gamma, lambda_)

Gradient Descent(0/299): loss=173286.79513998624, w0=[0.0120085], w1=[-0.41697064]
Gradient Descent(100/299): loss=151768.76755397083, w0=[0.01630981], w1=[-0.67932002]
Gradient Descent(200/299): loss=151025.752212893, w0=[0.01256946], w1=[-0.6797545]


In [51]:
a= y.reshape(250000,1)

In [105]:
1/(1+np.exp(-227.91778431))

1.0

In [68]:
np.var(X, axis = 0)

array([1.65116124e+05, 1.24925594e+03, 1.66697530e+03, 4.05202959e+03,
       2.06551758e+05, 4.32925819e+05, 2.05226188e+05, 6.12947368e-01,
       4.96106539e+02, 1.33878515e+04, 7.13587788e-01, 1.42463906e+00,
       2.05749162e+05, 5.02299351e+02, 1.47398106e+00, 3.30061476e+00,
       4.86858853e+02, 1.60017344e+00, 3.30006328e+00, 1.08205651e+03,
       3.28413798e+00, 1.60020609e+04, 9.55358361e-01, 2.84048199e+05,
       2.39451000e+05, 2.39446692e+05, 2.30279570e+05, 2.05556795e+05,
       2.05560779e+05, 9.60703157e+03])