In [3]:
# import modules
import pandas as pd
import numpy as np
import sklearn.linear_model as LinearRegression
from mae import angular_dist_score

In [2]:
def raw_event_to_proc_binary(event, aux_incl=False):
    """
    Encode a single event as a binary hit vector over the sensors.

    Parameters
    ----------
    event : pandas.DataFrame
        Rows of one event; the columns ``sensor_id`` and ``auxiliary``
        are read.
    aux_incl : bool, default False
        When falsy, rows flagged ``auxiliary`` are discarded before
        encoding; when truthy, every row is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape (5160,) whose ith entry is 1 if sensor_id i
        occurs in the (filtered) event and 0 otherwise.
    """
    if not aux_incl:
        # element-wise comparison on the column: keep non-auxiliary rows only
        event = event[event.auxiliary == False]

    # array to be returned
    proc = np.zeros((5160,))

    # Mark every sensor that was pinged in one vectorised assignment;
    # repeated sensor ids simply write the same 1 more than once, so no
    # de-duplication or Python-level loop is needed.
    proc[event.sensor_id.values] = 1

    return proc

In [3]:
def raw_batch_to_proc_binary(batch, aux_incl=False):
    """
    Apply raw_event_to_proc_binary to every event of a (sub)batch.

    Parameters
    ----------
    batch : pandas.DataFrame
        Rows of many events; the index holds the event id of each row.
    aux_incl : bool, default False
        Forwarded to raw_event_to_proc_binary.

    Returns
    -------
    pandas.DataFrame
        One row per unique event id (ascending, as returned by
        np.unique) and one integer 0/1 column per sensor, labelled
        0..5159.
    """
    event_ids = np.unique(batch.index)

    # Fill a plain array and build the DataFrame once at the end; assigning
    # rows into a 5160-column DataFrame one by one is far slower.
    proc = np.zeros((len(event_ids), 5160), dtype=np.int64)

    # run the raw_event_to_proc_binary function on each event
    for count, event_id in enumerate(event_ids):
        # The list selector always yields a DataFrame. A scalar selector
        # would return a Series for an event with a single row, which the
        # per-event function cannot filter or index by column.
        proc[count] = raw_event_to_proc_binary(batch.loc[[event_id]], aux_incl=aux_incl)
        if count % 1000 == 0:
            print('Working on', count)

    return pd.DataFrame(proc, index=event_ids, columns=list(range(5160)))

In [44]:
def raw_event_to_proc_chargesum(event, aux_incl=False):
    """
    Encode a single event as the total charge seen by each sensor.

    Parameters
    ----------
    event : pandas.DataFrame
        Rows of one event; the columns ``sensor_id``, ``charge`` and
        ``auxiliary`` are read.
    aux_incl : bool, default False
        When falsy, rows flagged ``auxiliary`` are discarded before
        summing; when truthy, every row is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape (5160,) whose ith entry is the sum of
        ``charge`` over all rows with sensor_id i, and 0 for sensors
        that do not occur in the (filtered) event.
    """
    if not aux_incl:
        # element-wise comparison on the column: keep non-auxiliary rows only
        event = event[event.auxiliary == False]

    # array to be returned
    proc = np.zeros((5160,))

    # Sum the charge column by name rather than relying on it being the
    # first column left after dropping the others.
    charge_by_sensor = event.groupby('sensor_id')['charge'].sum()

    # the group keys are the sensor ids, so they index straight into proc
    proc[charge_by_sensor.index.values] = charge_by_sensor.values

    return proc

In [45]:
def raw_batch_to_proc_chargesum(batch, aux_incl=False):
    """
    Apply raw_event_to_proc_chargesum to every event of a (sub)batch.

    Parameters
    ----------
    batch : pandas.DataFrame
        Rows of many events; the index holds the event id of each row.
    aux_incl : bool, default False
        Forwarded to raw_event_to_proc_chargesum.

    Returns
    -------
    pandas.DataFrame
        One row per unique event id (ascending, as returned by
        np.unique) and one float column per sensor, labelled 0..5159,
        holding the summed charge.
    """
    event_ids = np.unique(batch.index)

    # Float buffer: charge sums are fractional, so an integer-initialised
    # frame would have to be up-cast on assignment. Building the DataFrame
    # once at the end also avoids slow row-by-row DataFrame writes.
    proc = np.zeros((len(event_ids), 5160), dtype=np.float64)

    # run the raw_event_to_proc_chargesum function on each event
    for count, event_id in enumerate(event_ids):
        # The list selector always yields a DataFrame. A scalar selector
        # would return a Series for an event with a single row, which the
        # per-event function cannot filter or group by column.
        proc[count] = raw_event_to_proc_chargesum(batch.loc[[event_id]], aux_incl=aux_incl)
        if count % 1000 == 0:
            print('Working on', count)

    return pd.DataFrame(proc, index=event_ids, columns=list(range(5160)))

In [46]:
### MODEL TRAINING AND TESTING ### 

# Now we load our data and create various linear models 

# Read the inputs for batch 10: the raw batch itself, its metadata,
# and the sensor geometry table.
batch10 = pd.read_parquet('../batches_train/batch_10.parquet')
meta10 = pd.read_parquet('../batches_train/batch10_meta.parquet')
sensor_geom = pd.read_csv('../sensor_geometry.csv')

# unique event ids in ascending order (np.unique already returns them sorted)
event_ids = np.unique(batch10.index)

In [2]:
# Processed binary features for batch10. They can be regenerated with
#     batch10_proc_binary = raw_batch_to_proc_binary(batch10)
# but that loops over every event in the batch, so the result of an
# earlier run was stored as a .parquet file and is reloaded here.
batch10_proc_binary = pd.read_parquet('../batches_train/batch10_proc_binary.parquet')

In [None]:
# Also write the processed binary features out as a CSV file.
batch10_proc_binary.to_csv('../batches_train/batch10_proc_binary.csv')

In [53]:
# Processed charge-sum features for batch10. They can be regenerated with
#     batch10_proc_chargesum = raw_batch_to_proc_chargesum(batch10)
# but that loops over every event in the batch, so the result of an
# earlier run was stored as a .parquet file and is reloaded here.
batch10_proc_chargesum = pd.read_parquet('../batches_train/batch10_proc_chargesum.parquet')

Working on 0


In [6]:
# The regression targets are the azimuth (az) and zenith (ze) columns,
# which we extract from the provided meta data.
# NOTE(review): the later train/test split pairs these rows with the rows of
# batch10_proc_binary by position, so this presumably relies on meta10 listing
# the events in the same order as the processed features -- confirm.
batch10_true_directions = meta10[['azimuth', 'zenith']]

In [9]:
# Debugging aid: keep only the first 1000 rows of the features and of the
# targets so that everything below runs faster.
# Comment this cell out to run on the full dataset.
batch10_proc_binary = batch10_proc_binary.iloc[:1000]
batch10_true_directions = batch10_true_directions.iloc[:1000]

In [13]:
# Split the features and targets into a training part and a held-out test
# part: 25% of the rows go to the test set, shuffled with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    batch10_proc_binary,
    batch10_true_directions,
    test_size=.25,
    shuffle=True,
    random_state=134,
)

In [14]:
# Train/test split of batch10, followed by k-fold cross-validation.
# This cell follows the Erdos lecture notes on k-fold cross-validation, with k = 5.
# Every split uses the same random seed, random_state = 134.

In [15]:
# k-fold cross-validation on the training set:
# k = 5 shuffled folds with random seed 134
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=134)


In [16]:
# show which row positions land in the train and test part of each fold
for fold_train_idx, fold_test_idx in kfold.split(X_train, y_train):
    print("Train index:", fold_train_idx)
    print("Test index:", fold_test_idx)
    print()

Train index: [  0   1   2   3   4   5   6   7   8   9  11  12  13  14  15  16  17  18
  19  22  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  41
  42  43  44  46  47  48  49  51  52  53  55  57  59  60  61  62  63  66
  70  71  72  73  74  75  77  78  79  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  98 100 101 102 104 105 106 107 108 109 110 111 112
 114 115 116 120 121 123 124 125 126 127 128 132 134 136 137 138 139 140
 141 142 145 146 147 148 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 165 166 168 169 170 171 172 173 175 176 177 178 179 180 182 183
 184 186 187 188 189 190 191 192 193 194 195 196 199 200 201 203 204 206
 207 208 209 210 212 213 214 215 217 219 220 221 222 223 225 226 227 229
 230 232 234 235 236 237 238 240 241 243 244 245 246 247 248 250 251 252
 254 255 256 258 259 260 261 262 263 264 265 266 267 268 270 271 272 273
 274 275 277 278 279 281 282 283 284 285 286 288 289 290 291 292 293 294
 295 296 298 299 300 301 303 304 305 3

In [17]:
### CROSS-VALIDATION ###

# Defining model 1: ordinary least squares on the features in X_train,
# predicting azimuth and zenith jointly
from sklearn.linear_model import LinearRegression
model_1 = LinearRegression(copy_X=True)

# cross validation on model 1: one score per fold is collected in maes
maes = []
for train_index, test_index in kfold.split(X_train, y_train):
    # the indices from kfold.split are positional, hence .iloc;
    # X_tt, y_tt are the fold's training rows and X_ho, y_ho its hold-out rows
    X_tt = X_train.iloc[train_index]
    y_tt = y_train.iloc[train_index]
    X_ho = X_train.iloc[test_index]
    y_ho = y_train.iloc[test_index]

    # fit our model (each fit call starts from scratch)
    model_1.fit(X_tt, y_tt)

    # predict; the output columns follow the column order of y_tt
    pred = model_1.predict(X_ho)
    az_pred = pred[:,0]
    ze_pred = pred[:,1]

    # get error according to custom error function
    err = angular_dist_score(y_ho['azimuth'].values,
                             y_ho['zenith'].values,
                             az_pred,
                             ze_pred)
    maes.append(err)

In [24]:
# The cross-validation was run on the Great Lakes Cluster to save compute
# time, so maes is set here by hand to the output of that job (the job
# could be run locally from this notebook and output the same result).
maes = [
    1.5615248170191087,
    1.5610988936989836,
    1.5701054461389465,
    1.5631204382457082,
    1.5665744120854157,
]
avg_mae = np.mean(maes)
print("Average mae of model_1:", avg_mae)

Average mae of model_1: 1.5644848014376325


In [5]:
# TESTING, DELETE LATER
# scratch cell: reloads the raw batch 10 file into a throwaway frame
a = pd.read_parquet('../batches_train/batch_10.parquet')

In [8]:
# scratch: relabel the four data columns with the integers 1-4
a = a.rename(columns={'sensor_id':1, 'time':2, 'charge':3, 'auxiliary':4})

In [9]:
# Display the relabelled frame. Calling rename() with no mapper raises a
# TypeError instead of returning the frame, so show the frame directly.
a

Unnamed: 0_level_0,1,2,3,4
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
29296372,2599,29807,1.125,True
29296372,711,30746,1.125,True
29296372,14,31649,1.225,True
29296372,5070,31890,0.725,True
29296372,5076,32030,0.625,True
...,...,...,...,...
32567683,570,16812,0.675,True
32567683,631,17481,1.125,True
32567683,4134,17767,0.875,True
32567683,4973,18139,0.775,True


# Model 3: sensor binary and num-clusters

In [5]:
# Load the feature table from CSV; the event_id and num_clusters columns
# are used below.
all_features = pd.read_csv('../batches_train/all-feature.csv')

In [9]:
# one-column frame holding num_clusters, indexed by event_id
batch10_num_clusters = all_features.set_index('event_id')[['num_clusters']]

In [10]:
# show the num_clusters value of each event
batch10_num_clusters

Unnamed: 0_level_0,num_clusters
event_id,Unnamed: 1_level_1
29296372,1.0
29296374,4.0
29296414,1.0
29296416,1.0
29296437,2.0
...,...
32567581,2.0
32567639,2.0
32567659,2.0
32567680,1.0


In [None]:
# Cross-validation on model_3

# Defining model 3
from sklearn.linear_model import LinearRegression
model_3 = LinearRegression(copy_X=True)

# NOTE(review): X_train was split from the binary sensor features only; the
# num_clusters feature that model 3 is meant to add does not appear to be
# merged into it anywhere -- TODO join it in before the train/test split.

# cross validation on model 3: one score per fold is collected in maes
maes = []
for train_index, test_index in kfold.split(X_train, y_train):
    # the indices from kfold.split are positional, hence .iloc;
    # X_tt, y_tt are the fold's training rows and X_ho, y_ho its hold-out rows
    X_tt = X_train.iloc[train_index]
    y_tt = y_train.iloc[train_index]
    X_ho = X_train.iloc[test_index]
    y_ho = y_train.iloc[test_index]

    # fit our model (model_3, not the model_1 instance from the earlier cell)
    model_3.fit(X_tt, y_tt)

    # predict; the output columns follow the column order of y_tt
    pred = model_3.predict(X_ho)
    az_pred = pred[:,0]
    ze_pred = pred[:,1]

    # get error according to custom error function
    err = angular_dist_score(y_ho['azimuth'].values,
                             y_ho['zenith'].values,
                             az_pred,
                             ze_pred)
    maes.append(err)