In [1]:
import csv
import numpy as np
from tqdm import tqdm

In [2]:
def load_results(path_dataset):
    """load data features."""
    to_int = dict(s = 1,b = 0)
    def convert(s):
        return to_int.get(s.decode("utf-8") , 0)
    
    data = data = np.genfromtxt(path_dataset, delimiter=",", skip_header=1, usecols=[1],
                                converters={1: convert})
    
    return data

def load_data_features(path_dataset):
    """load data features."""
    data = data = np.genfromtxt(path_dataset, delimiter=",", skip_header=1, 
                                usecols=tuple(range(2,32)))
    
    ids = np.genfromtxt(path_dataset, delimiter=",", skip_header=1, usecols=[0])
    
    return data, ids

In [3]:
def build_polynomial(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    Extended = np.empty((x.shape[0],0))
    
    for j in range(0, degree+1):
        for i in range(x.shape[1]):
            Extended = np.c_[Extended, x[:,i]**j]
    
    return Extended

In [4]:
def correlation_filter(X, y, threshold = 0.01):
    """Removes features which are correlated with y with less than threshold"""
    abs_corr = np.zeros(X.shape[1])
    for index, x in enumerate(X.T):
        abs_corr[index] = np.abs(np.corrcoef(y,x.T)[0,1])
        
    quality = np.where(abs_corr > threshold)
    
    return X[:,quality[0]], quality[0]

In [5]:
def normalize_data(data, mean = None, sigma = None):
    """Standardizes the data"""
    if mean is None:
        mean = np.nanmean(data[data != -999], axis = 0)
    
    if sigma is None:
        sigma = np.nanstd(data[data != -999], axis = 0)
    
    output = (data - mean)/sigma
    
    return output, mean, sigma

def standardize_data(data, min_value = None, max_value = None):
    """maps data to [0,1] range"""
    if min_value is None:
        min_value = np.min(data, axis = 0)
    
    if max_value is None:
        max_value = np.max(data, axis = 0)
        
    output = (data - min_value)/(max_value - min_value)
    
    return output, min_value, max_value

In [6]:
def create_subsets(data, y, ids):
    """Creates four subsets based on the number of jets,
    which is 0, 1 and 2 or 3. 2 and 3 are put in one group,
    since they keep same features and have similar correlation patterns
    """
    data_subsets = []
    y_subsets = []
    ids_subsets = []
    for i in range(3):
        if i ==2:
            mask = data[:,22] >= i
        else:
            mask = data[:,22] == i
        data_subsets.append(data[mask])
        if y is not None:
            y_subsets.append(y[mask])
            
        ids_subsets.append(ids[mask])
        
    return data_subsets, y_subsets, ids_subsets

In [7]:
def remove_zero_variance(data, mask = None):
    """removes zero variance columns based on the subset"""
    if mask is None:
        variance = np.var(data, axis = 0)
        mask = variance == 0
    return data[:, ~mask[:]], mask

In [8]:
def replace_missing(data, median = None):        
    """replaces nan by median value"""
    if median is None:
        median =[]
        for j in range(data.shape[1]):
            mask = data[:,j] != -999
            replace = np.median(data[mask,j])
            data[~mask,j] = replace
            median.append(replace)
    else:
        for j in range(data.shape[1]):
            mask = data[:,j] != -999
            data[~mask,j] = median[j]

    return data, median

In [9]:
def process_data(X_train, X_test, y_train, y_test, ids_train, ids_test):
    """
    Processes the test and training data by:
    -splitting data with respect to jet number, creating three groups
    -removing zero variance in each subgroup
    -removing columns which are lowly correlated to y
    -normalizing the data with mean and standard devation 
    -replacing -999 by median value of column
    """
      
    train_subsets, y_train, ids_train = create_subsets(X_train, y_train, ids_train)
    test_subsets, y_test, ids_test = create_subsets(X_test, y_test, ids_test)
    
    for i in range(3):
        # change training sets
        train_subsets[i], mask = remove_zero_variance(train_subsets[i])
        print("For subgroup",i,"The following columns were removed due to zero variance:",[i for i, x in enumerate(mask) if x])
        
        train_subsets[i], mean, sigma = normalize_data(train_subsets[i], mean = None, sigma = None) 
        train_subsets[i], median = replace_missing(train_subsets[i], median = None) 
        train_subsets[i], quality = correlation_filter(train_subsets[i], y_train[i], threshold = 0.01)
        print("For subgroup",i,"The following columns were kept after low correlation:",quality)
        print("Final shape of subset",i,"is:",train_subsets[i].shape)
        
        #change test sets accordingly to training sets
        test_subsets[i], _ = remove_zero_variance(test_subsets[i], mask)
        test_subsets[i], _, _ =  normalize_data(test_subsets[i], mean, sigma)
        test_subsets[i], _ = replace_missing(test_subsets[i], median)
        test_subsets[i] = test_subsets[i][:, quality]
        
    return train_subsets, test_subsets, y_train, y_test, ids_train, ids_test
        


In [10]:
def add_cross_terms(data):
    """Adds cross terms between columns"""
    enriched_data = data
    for x1 in data.T:
        for x2 in data.T:
            if np.sum(x1 - x2) != 0:
                enriched_data = np.c_[enriched_data, x1*x2]
                
    return enriched_data      

In [11]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)

In [12]:
def add_log_terms(data):
    """Adds log terms to data"""
    extended = data
    for column in data.T:
        if np.sum(column <= -1) == 0:
            extended = np.c_[extended, np.log(1+ column)]
        
    return extended

In [13]:
def add_features(data, degree = None, sqrt = True, log = True, cross_terms = True):
    """
    Adds following features to data set:
    -log of features by log(1+x)
    -sqrt of features
    -polynomial extension of 0 up to degree
    -cross terms of features
    """ 
    #log
    if log:
        data = add_log_terms(data)
        output = data
    else:
        output = np.empty((data.shape[0],0))
        
    #polynomial
    if degree is not None:
        output = np.c_[output, build_polynomial(data, degree)]
      
    # add sqrt
    if sqrt:
        output = np.c_[output, np.sqrt(np.abs(data))]
        
    
    if cross_terms:
        output = np.c_[output, add_cross_terms(data)]
            
    return output

In [14]:
def compute_mse(y, tx, w):
    """compute the loss by mse."""
    e = y - tx.dot(w)
    mse = e.dot(e) / (2 * len(e))
    return mse


In [15]:
def stitch_solution(X_test, y_result, ids_test_group, ids_test):
    """
    Puts found y values back in right order for the complete data matrix,
    since it was split in four groups.
    X_test: original, preprocessed test data
    y_result: output of created model, list of three vectors containing predictions for group 1,2 and 3
    """
    y_final=[]
    for i in range(X_test.shape[0]):
        if ids_test[i] in ids_test_group[0]:
            index = np.where(ids_test_group[0] == ids_test[i])
            y_final.append(y_result[0][index])
            
        elif ids_test[i] in ids_test_group[1]:
            index = np.where(ids_test_group[1] == ids_test[i])
            y_final.append(y_result[1][index])
            
        elif ids_test[i] in ids_test_group[2]:
            index = np.where(ids_test_group[2] == ids_test[i])
            y_final.append(y_result[2][index])
            
    return y_final

In [50]:
def hyper_optimizing(X_train, y_train, methods = ["Ridge_regression"],
                     lambdas = [0.1], degrees = [1], gamma = 0.0000001,  max_iter = 3000):
    """Finds best lambda and degree to use on the given data, test possibilities are:
    -Ridge regression
    -Penalized Logistic regression"""
    
    # Check method names are correct
    if len([i for i in methods if i in ["Ridge_regression", "Penalized_logistic"]]) < len(methods):
        raise NameError("At least one method is wrong")
        
    all_losses = np.zeros((len(methods), len(degrees), len(lambdas)))
    best_parameters = []
    
    for degree_index, degree in enumerate(degrees):
        X_train_ex = add_features(X_train, degree = degree)
    
        for method_index, method in enumerate(methods):
            
            if method == "Ridge_regression":
                seed = 1
                k_fold = 5
                k_indices = build_k_indices(y_train, k_fold, seed)
                print("Start ridge regression test for degree", str(degree),"...")
                for index, lambda_ in enumerate(lambdas):
                    losses_te = []
                    for k in range(k_fold):
                        loss_te = cross_validation_ridge(y_train, X_train_ex, k_indices, k, lambda_)
                        losses_te.append(loss_te)
                    all_losses[method_index, degree_index, index] = np.mean(losses_te)
                 
                # Show percantage of correct results for this degree
                min_lambda = lambdas[np.argmin(all_losses[method_index, degree_index,:])]
                print("Lowest loss is for lambda:", min_lambda, "is:", min(all_losses[method_index, degree_index,:]))
                
                
            elif method == "Penalized_logistic":
                #less k-fold for reason of speed
                seed = 1
                k_fold = 4
                k_indices = build_k_indices(y_train, k_fold, seed)
                print("Start penalized_logistic test...")
                for index, lambda_ in enumerate(lambdas):
                    losses_te = []
                    for k in range(k_fold):
                        loss_te, _ = cross_validation_logistic(y_train, X_train_ex, k_indices,
                                                    k, lambda_, gamma, max_iter)
                        losses_te.append(loss_te)
                    all_losses[method_index, degree_index, index] = np.mean(losses_te) 
                
                # Show percantage of correct results for this degree
                min_lambda = lambdas[np.argmin(all_losses[method_index, degree_index,:])]
                print("Lowest loss is:", min(all_losses[method_index, degree_index,:]),"for lambda:", min_lambda)
                
            
    min_loss = np.argmin(all_losses)
    min_loss = np.unravel_index(min_loss, (len(methods), len(degrees), len(lambdas)))
    best_parameters.append(methods[min_loss[0]])
    best_parameters.append(degrees[min_loss[1]])
    best_parameters.append(lambdas[min_loss[2]])  
    print(best_parameters)
    return best_parameters, all_losses
    

In [17]:
def quantify_result(y_found, y_real):
    y_found[y_found<0.5]=0
    y_found[y_found>=0.5]=1
    summ = y_found + y_real
    TP = np.sum(summ == 2)
    TN = np.sum(summ == 0)
    diff = y_found - y_real
    FP = np.sum(diff == 1)
    FN = np.sum(diff == -1)
    accuracy = (TP+TN)/(TP +TN +FP + FN)
    F_score = TP/(TP + 0.5 * (FP +FN))
    recall = TP/(TP + FN)
    precision = TP/(TP + FP)
    return precision, recall, F_score, accuracy

In [36]:
def cross_validation_ridge(y, x, k_indices, k, lambda_):
    """return the loss of ridge regression."""
    y_test = y[k_indices[k]]
    x_test = x[k_indices[k], :]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_train = y[tr_indice]
    x_train = x[tr_indice, :]

    w = ridge_regression(y_train, x_train, lambda_)

    # calculate the loss for train and test data:
    loss_te = np.sqrt(2*compute_mse(y_test, x_test, w))
    # Calculate F_score, seems more reliable to compare different degrees
    #y_new = x_test @ w
    #precision, recall, F_score, accuracy = quantify_result(y_new, y_test)
    
    return loss_te

In [19]:
def ridge_regression(y, tx, lambda_):
    """ridge regression"""
    if len(tx.shape) > 1:
        w = np.linalg.solve(tx.T @ tx + (2*tx.shape[0]*lambda_)*np.identity(tx.shape[1]), tx.T @ y)
    else:
        w = 1/(tx.T @ tx + lambda_) * tx.T @ y                        

    return w



In [20]:
def learning_by_penalized_gradient(y, tx, w, gamma, lambda_):
    """
    Do one step of gradient descent, using the penalized logistic regression.
    Return the loss and updated w.
    """
    loss = calculate_loss(y, tx, w) + lambda_ * np.squeeze(w.T.dot(w))
    gradient = calculate_gradient(y, tx, w) + 2 * lambda_ * w
    w -= gradient*gamma
    
    return loss, w

In [21]:
def sigmoid(t):
    """applies the sigmoid function on t."""
    return 1/(1+np.exp(-t))

def calculate_loss(y, tx, w):
    """computes the loss: negative log likelihood."""
    inter_y = y.reshape(len(y),1)
    z = tx @ w
    a = np.sum(np.log(1 + np.exp(z)))
    b = inter_y.T @ z
    loss = a - b
    return np.squeeze(loss)

def calculate_gradient(y, tx, w):
    """computes the gradient of loss."""
    inter_y = y.reshape(len(y),1)
    gradient = tx.T @ (sigmoid(tx @ w) - inter_y)
    return gradient

In [22]:
def show_result_ridge(X_train, y_train, X_test, y_test, lambda_):
    """prints accuracy of ridge regression"""
    w = ridge_regression(y_train, X_train, lambda_)
    y_new = X_test @ w
    precision, recall, F_score, accuracy = quantify_result(y_new, y_test)
    print("Accuracy of the predictions is:",
          str(accuracy), " and F-score is:", str(F_score), "with lambda:",lambda_)
    print("Precision is:",str(precision), "and recall is:",str(recall))

In [23]:
def show_result_logistic(X_train, y_train, X_test, y_test, lambda_, gamma = 0.000001):
    """prints accuracy of logistic regression"""
    w = np.zeros((X_train.shape[1], 1))
    _, w = learning_by_penalized_gradient(y_train, X_train, w, gamma, lambda_)
    y_new = X_test @ w
    precision, recall, F_score, accuracy = quantify_result(y_new, y_test)
    print("Accuracy of the predictions is:",
          str(accuracy), " and F-score is:", str(F_score), "with lambda:",lambda_)
    print("Precision is:",str(precision), "and recall is:",str(recall))

In [38]:
def find_parameters(X_train, y_train, degrees, lambdas, methods):
    """Finding best parameters and losses per set
    input:
        X_train = ndarray of training data
        y_train =  1darray of goal output corresponding to the training data
        lamdas = list of 1darray of floats to use as lambdas
        degrees = list of 1darray of integers to use for polynomial expansion
        gamma = learning rate, only relevant if penalized logistic regresssion is used
    output:
        best_parameter_loss = list containg best combination of method, degree and lambda per subset
        losses_sets = list of arrays containing all losses for methods used.
        Array sizes are equal to the length of degrees, length of lambdas and length of methods.
        List length is equal to number of subsets, 3"""
    
    best_parameter_per_set = []
    losses_sets =[]
    
    for i in range(3):
        print("Testing for set",i)
        parameters, losses = hyper_optimizing(X_train[i], y_train[i],
                                methods, lambdas[i], degrees[i], 10**-10, 5)
        
        best_parameter_per_set.append(parameters)
        losses_sets.append(losses)
    
    return best_parameter_per_set, losses_sets

In [48]:
def cross_validation_logistic(y, x, k_indices, k,lambda_, gamma,max_iter):
    """return the loss of ridge regression."""
    # split according to k_indices
    y_test = y[k_indices[k]]
    x_test = x[k_indices[k], :]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_train = y[tr_indice]
    x_train = x[tr_indice, :]
    
    
    loss, w = logistic_regression_penalized_gradient_descent(y_train, x_train, lambda_, gamma,max_iter)
        
    # calculate the loss for train and test data:
    loss_te = calculate_loss(y_test, x_test, w)
    
    # Calculate F_score
    y_new = x_test @ w
    precision, recall, F_score, accuracy = quantify_result(y_new, y_test)
    print(accuracy, F_score)
    
    return loss_te, w

In [27]:
def logistic_regression_penalized_gradient_descent(y, x, lambda_, gamma,max_iter):
    # init parameters
    threshold = 1e-8
    losses = []

    w = np.random.randn(x.shape[1], 1)/10000
    threshold = 1e-8
    losses = []
    # start the logistic regression
    iter = 0
    while iter <max_iter:
        # get loss and update w.
        loss, w = learning_by_penalized_gradient(y, x, w, gamma, lambda_)
        # log info
        if iter % 999 == 0:
            print("Current iteration={i}, loss={l}".format(i=iter, l=loss))
            
        # check loss actually decreases, if not decrease gamma
        if iter > 0:
            if loss > losses[-1]:
                gamma = gamma/2
            if np.isinf(loss):
                iter = 0
                w = np.random.randn(x.shape[1], 1)
                gamma = gamma/10
                
        iter +=1
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
            
    return loss, w

In [31]:
print(best_parameter_per_set)
print(losses_per_set)

[array([[1.e-13, 3.e+00]])]
[array([[[10940.94732142, 10940.94732142, 10940.94732142, 10940.94732142],
        [11419.25563444, 11419.25563444, 11419.25563444, 11419.25563444],
        [11399.85266053, 11399.85266053, 11399.85266053, 11399.85266053],
        [11396.33972425, 11396.33972425, 11396.33972425, 11396.33972425],
        [11398.17682043, 11398.17682043, 11398.17682043, 11398.17682043],
        [11397.09021809, 11397.09021809, 11397.09021809, 11397.09021809],
        [11397.75450241, 11397.75450241, 11397.75450241, 11397.75450241]]])]


In [51]:
# path =  "C:/Users/jurri/OneDrive/Documenten/University/Exchange/ML/train"
# X, ids = load_data_features(path +"/train.csv")
# y = load_results(path +"/train.csv")

# """Use this cell if the training data will be split in a test and a train set"""

# X_train, y_train, ids_train = X[:int(0.8*len(X)),:], y[:int(0.8*len(X))], ids[:int(0.8*len(X))]
# X_test, y_test, ids_test = X[int(0.8*len(X)):,:], y[int(0.8*len(X)):], ids[int(0.8*len(X)):]

# X_train, X_test_pro, y_train, y_test_pro, ids_tr_group, ids_test_group = process_data(X_train, 
#                                                     X_test, y_train, y_test, ids_train, ids_test)

# #parameters to be changed in this function
#test for sepcific degrees for each subset
degrees = [np.linspace(3,9,7, dtype = int), np.linspace(6, 14, 9, dtype = int), np.linspace(9, 17, 8, dtype = int)]

#test for specific lambdas for each subset
lambdas = [np.logspace(-14,-9,7), np.logspace(-14,-5,10), np.logspace(-14, -9,7)]
lambdas = [np.array([1]),np.array([1]),np.array([1])]
degrees =[np.array([1]), np.array([1]), np.array([1])]
#test for both methods
methods = ["Ridge_regression", "Penalized_logistic"]
methods = ["Penalized_logistic"]
from implementations import least_squares

best_parameter_per_set, losses_per_set = find_parameters(X_train, y_train, degrees, lambdas, methods)

y_result = []
for i in range(3):
    lambda_ = best_parameter_per_set[i][0,0]
    degree = int(best_parameter_per_set[i][0,1])
    X_train_ex = add_features(X_train[i], degree = degree)
    X_test_pro_ex = add_features(X_test_pro[i], degree = degree)
    show_result_logistic(X_train_ex, y_train[i], X_test_pro_ex, y_test_pro[i], lambda_, gamma = 0.000001)
    
    w = ridge_regression(y_train[i], X_train_ex, lambda_)
    y_result.append(X_test_pro_ex @ w)

y_final = stitch_solution(X_test, y_result, ids_test_group, ids_test)
y_test = np.reshape(y_test, (len(y_test),1))
y_final = np.array(y_final)
quantify_result(y_final, y_test)




Testing for set 0
Start penalized_logistic test...
Current iteration=0, loss=41594.18427302348


  del sys.path[0]


0.7435345905657545 0.0
Current iteration=0, loss=41559.66708395982
0.7403831724275924 0.0
Current iteration=0, loss=41682.11814585011
0.751188034615577 0.0
Current iteration=0, loss=41647.71108821051
0.7452853784202891 0.0
Lowest loss is: 13764.991369319641 for lambda: 1
['Penalized_logistic', 1, 1]
Testing for set 1
Start penalized_logistic test...
Current iteration=0, loss=32278.57931492338
0.6489690721649485 0.0
Current iteration=0, loss=32256.97365076226
0.6353092783505154 0.0
Current iteration=0, loss=32278.031001706124
0.6385309278350515 0.0
Current iteration=0, loss=32264.252548287113
0.6462628865979382 0.0
Lowest loss is: 10750.703073719818 for lambda: 1
['Penalized_logistic', 1, 1]
Testing for set 2


KeyboardInterrupt: 

In [121]:
best_parameter_per_set, losses_per_set = find_parameters(X_train, y_train)
y_result = []
for i in range(1):
    lambda_ = best_parameter_per_set[i][0,0]
    degree = int(best_parameter_per_set[i][0,1])
    X_train_ex = add_features(X_train[i], degree = degree)
    X_test_pro_ex = add_features(X_test_pro[i], degree = degree)
    show_result_logistic(X_train_ex, y_train[i], X_test_pro_ex, y_test_pro[i], lambda_, gamma = 0.000001)
    
    #w = ridge_regression(y_train[i], X_train_ex, lambda_)
    #y_result.append(X_test_pro_ex @ w)

# y_final = stitch_solution(X_test, y_result, ids_test_group, ids_test)
# y_test = np.reshape(y_test, (len(y_test),1))
# y_final = np.array(y_final)
# quantify_result(y_final, y_test)

Testing for set 0
Start penalized_logistic test...
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=33418.35594736119
Current iteration=1998, loss=32946.80737836916
Current iteration=2997, loss=32601.647528690773


  del sys.path[0]


0.7435345905657545 0.0
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=33510.68712851258
Current iteration=1998, loss=33029.095641885884
Current iteration=2997, loss=32673.864366103073
0.7403831724275924 0.0
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=33817.637350342
Current iteration=1998, loss=33514.659703567944
Current iteration=2997, loss=33265.225666961254
0.751188034615577 0.0
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=33515.92405980315
Current iteration=1998, loss=33045.91342653857
Current iteration=2997, loss=32700.398558844783
0.7452608388396358 0.00010002537662662905
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=33418.35594736119
Current iteration=1998, loss=32946.80737836916
Current iteration=2997, loss=32601.647528690773
0.7435345905657545 0.0
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=33510.68712851258
Current iteration=1998, loss=33029.0956418

  if __name__ == '__main__':
  This is separate from the ipykernel package so we can avoid doing imports until


Current iteration=999, loss=34080.45465515149
Current iteration=1998, loss=34071.37203405078
Current iteration=2997, loss=34062.62077113262
0.7435345905657545 0.0
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=34183.80854629463
Current iteration=1998, loss=34179.87851219306
Current iteration=2997, loss=34176.02134235284
0.7403831724275924 0.0
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=34165.07958339976
Current iteration=1998, loss=34158.23790524526
Current iteration=2997, loss=34151.62649236898
0.751188034615577 0.0
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=34702.64538160622
Current iteration=1998, loss=34685.278052026246
Current iteration=2997, loss=34668.13448500001
0.7416780600642506 0.01429396807490647
Current iteration=0, loss=41570.1158597216
Current iteration=999, loss=34080.45465515149
Current iteration=1998, loss=34071.37203405078
Current iteration=2997, loss=34062.62077113262
0.7435345905657545 0

KeyboardInterrupt: 