In [31]:
from proj1_helpers import load_csv_data, standardize, build_model_data
from implementations import split_data, least_squares
import numpy as np

In [46]:

train_data = 'all/train.csv'
test_data = 'all/test.csv'

train = load_csv_data(train_data)
test = load_csv_data(test_data)


In [142]:
labels, input_data, ids, features = train
_, test_data, test_ids, _ = test

In [143]:
def extend_and_standardize(input_data, mean=None, std=None):
    if mean is not None and std is not None:
        mean_x = mean
        std_x = std
        tx = (input_data - mean) / std
        num_samples = input_data.shape[0]
        tx = np.c_[np.ones(num_samples), tx]
    else: 
        x, mean_x, std_x = standardize(input_data)
        tx = build_model_data(x)
    return tx, mean_x, std_x

In [144]:
def model_output(tx, ws, pri_jet_num_idx, clean_features, parameters):
    # Conditions
    cond_null = tx[:, pri_jet_num_idx] == 0
    cond_one = tx[:, pri_jet_num_idx] == 1
    cond_plural = tx[:, pri_jet_num_idx] >= 2
    conditions = (cond_null, cond_one, cond_plural)
    
    print(tx.shape)
    N = tx.shape[0]
    model_output = np.zeros(N)
    for pri_jet_num, cond in enumerate(conditions):
        select_features = clean_features[pri_jet_num]
        reduced_dset = tx[cond][:,select_features]
        mean, std = parameters[pri_jet_num]
        extended_dset,_,_ = extend_and_standardize(reduced_dset,mean,std)
        weight = ws[pri_jet_num]
        sub_output = extended_dset.dot(weight)
        model_output[cond] = sub_output
        
    return model_output

In [145]:
def compute_predictions(model_output):
    predictions = model_output
    predictions[predictions > 0.5] = 1
    predictions[predictions <= 0.5] = 0
    return predictions

In [146]:
def compute_accuracy(y, predictions):
    N = y.size
    accuracy = 1 - (np.count_nonzero(predictions-y)/N)
    print("Accuracy: {}".format(accuracy))

In [186]:

training_ratio = 0.8

X, y = input_data, labels

cond_null = X[:, pri_jet_num_idx] == 0
cond_one = X[:, pri_jet_num_idx] == 1
cond_plural = X[:, pri_jet_num_idx] >= 2

conditions = (cond_null, cond_one, cond_plural)

dsets = [X[cond] for cond in conditions]
ybs = [y[cond] for cond in conditions]

# For now, just remove any column with undefined -999 values. Also, before standardization, remove features with 0 variance.
clean_dsets = []
clean_features = []

for dset in dsets:
    no_undefined = np.all(dset != -999, axis = 0)
    no_constant = np.any(dset != dset[0], axis = 0)
    cleaned = no_undefined * no_constant
    clean_dset = dset[:,cleaned]
    print(clean_dset.shape)
    clean_dsets.append(clean_dset)
    clean_features.append(cleaned)
    
# Standardize and extend data, save mean and standard deviation of each dataset.
parameters = []
standardized_dsets = []

for clean_dset in clean_dsets:
    standardized_dset, mean_x, std_x = extend_and_standardize(clean_dset)
    standardized_dsets.append(standardized_dset)
    print(standardized_dset.shape)
    parameters.append((mean_x,std_x))


(99913, 17)
(77544, 21)
(72543, 29)
(99913, 18)
(77544, 22)
(72543, 30)


In [178]:
print([ds.shape for ds in standardized_dsets])

[(99913, 18), (77544, 22), (72543, 30)]


In [179]:
ws_LS = []
# Applies Least Squares to each sub-training-dataset
for jet_num, standardized_dset in enumerate(standardized_dsets):
    w_LS = least_squares(ybs[jet_num],standardized_dset)
    ws_LS.append(w_LS)

output_LS = model_output(x_te, ws_LS, pri_jet_num_idx, clean_features, parameters)
predictions = compute_predictions(output_LS)
compute_accuracy(y_te,predictions)

(50000, 30)
Accuracy: 0.75898


In [180]:
print([w.shape for w in ws_LS])

[(18,), (22,), (30,)]


In [181]:
print(test_data[:,22])

[0. 1. 0. ... 0. 1. 0.]


In [208]:

#print(sum([b for b in clean_features[0]]))
test_X, _, _ = extend_and_standardize(test_data)

for i, sample in enumerate(test_X):
    if i > 5:break
    jet = 2 if test_data[i,22] >= 2. else int(test_data[i,22])
        
    print(jet, clean_sample.shape)
    aug_clean_vector = np.concatenate((np.array([True]), clean_features[i]))
    clean_sample = sample[aug_clean_vector]
    calc = ws_LS[jet] * clean_sample    
        
    print(calc)


0 (30,)
[ 2.55141974e-01 -1.14327786e-01  2.92032830e-01 -2.06856600e+02
 -3.39727728e-01  1.75264955e+02  2.48960030e+00 -1.00065629e-02
 -1.74626589e-02 -1.42156633e+00  2.32335324e-04  1.05152955e-04
 -1.07162079e+00 -1.27866726e-03 -2.58919049e-05  1.11174849e-02
 -2.96344927e-03 -1.46887342e-02]
1 (18,)
[ 3.57345507e-01 -5.80370792e-02 -2.10528322e-02  4.73350789e-03
 -5.21623912e-02 -2.00161890e-02  2.74212958e+01 -3.46264929e-02
 -6.54494250e-02 -1.41918856e+01 -2.87959151e-03 -2.94046854e-03
  1.43933958e+01  4.56156432e-04  5.22096423e-04 -2.26407872e-02
 -1.28844911e-03  5.05765526e-03 -1.21559323e+03 -3.51792604e-04
 -8.46532924e-04 -4.48803489e+02]
0 (22,)


ValueError: operands could not be broadcast together with shapes (18,) (30,) 