# CS-433 Project 1: ML Higgs 
Changling Li, Julian Blackwell, Luca Bataillard

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
from implementations import *

In [35]:
DATA_TRAIN_PATH = '../data/train.csv'
y, tx, ids = load_csv_data(DATA_TRAIN_PATH)

## Exploratory data analysis

### Step 1: Inspecting the labels

We begin our data analysis by inspecting the `y` array of labels. According to the documentation, all labels are $-1$ for background 'b' and $1$ is for signal 's'. We check this below:

In [36]:
((y == 1) | (y == -1)).all()

True

### Step 2: Inspecting the features

We first read appendix B of the original Higgs paper and discover several key points:
* All variables are floating point and continuous, apart from `PRI_jet_num`, which ranges in $\{0, 1, 2, 3\}$
* Undefined values are denoted by the value $-999.0$

We inspect the features of `tx` to confirm these claims. We can see that `PRI_jet_num` corresponds to column 22. We decide to do no further processing to this variable and treat it as continuous. We notice several undefinded variables.

In [37]:
N, D = tx.shape
print(f"Number of rows: {N}")
print(f"Number of columns: {D}")

Number of rows: 250000
Number of columns: 30


In [38]:
for i in range(D):
    col = tx[:5, i]
    print(f"Column {i:2d}: {col}")

Column  0: [ 138.47   160.937 -999.     143.905  175.864]
Column  1: [ 51.655  68.768 162.172  81.417  16.915]
Column  2: [ 97.827 103.235 125.953  80.943 134.805]
Column  3: [27.98  48.146 35.635  0.414 16.405]
Column  4: [ 9.10e-01 -9.99e+02 -9.99e+02 -9.99e+02 -9.99e+02]
Column  5: [ 124.711 -999.    -999.    -999.    -999.   ]
Column  6: [   2.666 -999.    -999.    -999.    -999.   ]
Column  7: [3.064 3.473 3.148 3.31  3.891]
Column  8: [41.928  2.078  9.336  0.414 16.405]
Column  9: [197.76  125.157 197.814  75.968  57.983]
Column 10: [1.582 0.879 3.776 2.354 1.056]
Column 11: [ 1.396  1.414  1.414 -1.285 -1.385]
Column 12: [ 2.00e-01 -9.99e+02 -9.99e+02 -9.99e+02 -9.99e+02]
Column 13: [32.638 42.014 32.154 22.647 28.209]
Column 14: [ 1.017  2.039 -0.705 -1.655 -2.197]
Column 15: [ 0.381 -3.011 -2.093  0.01  -2.231]
Column 16: [ 51.626  36.918 121.409  53.321  29.774]
Column 17: [ 2.273  0.501 -0.953 -0.522  0.798]
Column 18: [-2.414  0.103  1.052 -3.1    1.569]
Column 19: [16.824

### Step 3: Summary statistics

By computing the summary statistics on the data, we notice that undefined values are not present in all columns. However, in columns where they are present, they comprise a large proportion of the data. There are too many such rows to be simply discarded as outliers, so we will treat them as regular datapoints for the time being.

We also notice that each feature follows a very different distribution. To ease model computational cost and improve model performance, we will scale the dataset by subtracting the mean and dividing by the standard deviation. 

In [39]:
display_summary_statistics(tx)

Column |   Mean   |  Median  | Std dev  |   Max    |    Min   | # Undefined | % Undefined 
     0 |  -49.023    406.345    105.012   1192.026   -999.000    38114.000        15.246
     1 |   49.240     35.345     46.524    690.075      0.000        0.000         0.000
     2 |   81.182     40.829     73.752   1349.351      6.329        0.000         0.000
     3 |   57.896     63.656     38.468   2834.999      0.000        0.000         0.000
     4 | -708.421    454.480   -999.000      8.503   -999.000   177457.000        70.983
     5 | -601.237    657.971   -999.000   4974.979   -999.000   177457.000        70.983
     6 | -709.357    453.019   -999.000     16.690   -999.000   177457.000        70.983
     7 |    2.373      0.783      2.492      5.684      0.208        0.000         0.000
     8 |   18.917     22.273     12.316   2834.999      0.000        0.000         0.000
     9 |  158.432    115.706    120.665   1852.462     46.104        0.000         0.000
    10 |    1.438  

## Feature processing

In [40]:
tx = standardize(tx)

In [85]:
initial_w = np.zeros(D)
max_iters = 10
gamma = 0.00001
seed = 30

y[y < 0] = 0

w, loss = logistic_regression(y, tx, initial_w, max_iters, gamma, verbose=True)
w, loss

Gradient Descent (0/9): loss=173286.79513998044, w=[ 0.28375129 -0.41697064 -0.01667664  0.22843324  0.16806352  0.22753121
  0.16676834  0.01452931 -0.01813859  0.18181503 -0.23184037  0.32243466
  0.16770757  0.27911078 -0.00111917 -0.00522363 -0.03790594  0.00179902
  0.00489486  0.0266557   0.00886952  0.16079532  0.15845656  0.18699793
  0.17853236  0.17853091  0.16695815  0.16761137  0.16760042  0.1593424 ]
Gradient Descent (1/9): loss=244525.74171709412, w=[ 0.22963669 -0.42546708 -0.00877652 -0.18412621 -0.45057577 -0.29535328
 -0.45307479  0.35188497 -0.28550267 -0.33282999 -0.37049832  0.03337813
 -0.45126927  0.24781522 -0.0084681  -0.01099491 -0.19003566 -0.01047879
  0.00723302 -0.22734838  0.00598688 -0.34236837 -0.53399718 -0.38674256
 -0.38313004 -0.38313228 -0.45301797 -0.45145421 -0.45147425 -0.40678597]
Gradient Descent (2/9): loss=667944.0811474295, w=[ 6.59132875e-01 -9.49393015e-01 -6.03256421e-02  7.32665774e-01
  6.99674652e-01  8.60982224e-01  6.95874256e-01 -6

(array([ 0.41243049, -0.87810651, -0.26725325,  0.11487256, -0.37025633,
         0.24209908, -0.38140497,  0.61782287, -0.4795342 , -0.24454006,
        -0.48614175,  0.23537171, -0.37331806,  0.72333909, -0.01144714,
        -0.01483475,  0.14528769, -0.00970999,  0.01057138,  0.03276335,
         0.00416709, -0.4006061 , -0.67912641, -0.36437897, -0.3684077 ,
        -0.36839545, -0.3795049 , -0.37415551, -0.37424438, -0.48677999]),
 241287.9660238217)

In [84]:
pred = tx.dot(w)
pred[pred < 0.5] = 0
pred[pred >= 0.5] = 1
np.unique(np.abs(pred - y), return_counts=True)

(array([0., 1.]), array([153882,  96118]))

## Generate predictions and save ouput in csv format for submission:

In [25]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [26]:
OUTPUT_PATH = 'test.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)