In [27]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

In [28]:
def angular_dist_score(az_true, zen_true, az_pred, zen_pred):
    '''
    calculate the MAE of the angular distance between two directions.
    The two vectors are first converted to cartesian unit vectors,
    and then their scalar product is computed, which is equal to
    the cosine of the angle between the two vectors. The inverse 
    cosine (arccos) thereof is then the angle between the two input vectors
    
    Parameters:
    -----------
    
    az_true : float (or array thereof)
        true azimuth value(s) in radian
    zen_true : float (or array thereof)
        true zenith value(s) in radian
    az_pred : float (or array thereof)
        predicted azimuth value(s) in radian
    zen_pred : float (or array thereof)
        predicted zenith value(s) in radian
    
    Returns:
    --------
    
    dist : float
        mean over the angular distance(s) in radian
    '''
    
    if not (np.all(np.isfinite(az_true)) and
            np.all(np.isfinite(zen_true)) and
            np.all(np.isfinite(az_pred)) and
            np.all(np.isfinite(zen_pred))):
        raise ValueError("All arguments must be finite")
    
    # pre-compute all sine and cosine values
    sa1 = np.sin(az_true)
    ca1 = np.cos(az_true)
    sz1 = np.sin(zen_true)
    cz1 = np.cos(zen_true)
    
    sa2 = np.sin(az_pred)
    ca2 = np.cos(az_pred)
    sz2 = np.sin(zen_pred)
    cz2 = np.cos(zen_pred)
    
    # scalar product of the two cartesian vectors (x = sz*ca, y = sz*sa, z = cz)
    scalar_prod = sz1*sz2*(ca1*ca2 + sa1*sa2) + (cz1*cz2)
    
    # scalar product of two unit vectors is always between -1 and 1, this is against nummerical instability
    # that might otherwise occure from the finite precision of the sine and cosine functions
    scalar_prod =  np.clip(scalar_prod, -1, 1)
    
    # convert back to an angle (in radian)
    return np.average(np.abs(np.arccos(scalar_prod)))

In [29]:
# Importing data
batch1 = pd.read_parquet('../batches_train/batch_1_repartitions/part.0.parquet')
sensor_geom = pd.read_csv('../sensor_geometry.csv')

In [30]:
meta = pd.read_parquet('../batches_train/meta_1_0.parquet')

In [31]:
event_ids = list(set(batch1.index))

In [32]:
# Part of meta that we need
meta.set_index('event_id',
                      inplace=True)
meta.drop(['batch_id', 'first_pulse_index', 'last_pulse_index'], 
                 axis=1,
                 inplace=True)

In [33]:
meta.head()

Unnamed: 0_level_0,azimuth,zenith
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1
24,5.029555,2.087498
41,0.417742,1.549686
59,1.160466,2.401942
67,5.845952,0.759054
72,0.653719,0.939117


In [34]:
from sklearn.model_selection import train_test_split

In [35]:
train_indices, test_indices = train_test_split(event_ids,
                                               shuffle=True,
                                               random_state=123,
                                               test_size=0.25)

In [36]:
num_sensors = 5160

### Training set

In [37]:
df1_train = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

In [38]:
%%time

# Now we generate feature data from the raw data
# We use aux = False only here
count = 0

batch1_no_aux = batch1[batch1.auxiliary==False]

for index in train_indices[:1000]:
    event = batch1_no_aux.loc[index]

    sensors = event.sensor_id.values

    for sensor in sensors:
        df1_train.loc[index,sensor] = 1
    
    az, ze = meta.loc[index,['azimuth','zenith']].values
    df1_train.loc[index,'az'] = az
    df1_train.loc[index,'ze'] = ze

    count = count + 1
    if count % 100 == 0:
        print("Working on event", count)
        
df1_train.fillna(0, inplace=True)

Working on event 100
Working on event 200
Working on event 300
Working on event 400
Working on event 500
Working on event 600
Working on event 700
Working on event 800
Working on event 900
Working on event 1000
CPU times: total: 16.8 s
Wall time: 46.7 s


In [39]:
# This is the DataFrame that represents our training data
df1_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5152,5153,5154,5155,5156,5157,5158,5159,az,ze
1023496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2.248502,0.432784
681492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.299984,2.809545
38448,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.291643,0.989836
892244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.758019,2.730688
717306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.283212,0.080609


### Testing set

In [40]:
# Now we set up the testing data
df1_test = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

count = 0

batch1_no_aux = batch1[batch1.auxiliary==False]

for index in test_indices[:300]:
    event = batch1_no_aux.loc[index]

    sensors = event.sensor_id.values

    for sensor in sensors:
        df1_test.loc[index,sensor] = 1
    
    az, ze = meta.loc[index,['azimuth','zenith']].values
    df1_test.loc[index,'az'] = az
    df1_test.loc[index,'ze'] = ze

    count = count + 1
    if count % 100 == 0:
        print("Working on event", count)

df1_test.fillna(0, inplace=True)

Working on event 100
Working on event 200
Working on event 300


### Fitting

In [41]:
X1_train = df1_train[range(0,num_sensors)]
y1_train = df1_train[['az','ze']]

In [42]:
X1_test = df1_test[range(0,num_sensors)]
y1_test = df1_test[['az','ze']]

In [43]:
slr1 = LinearRegression(copy_X=True)

In [44]:
slr1.fit(X1_train.values, y1_train.values)

### Baseline model

This model just "trains" by computing the mean of the output azimuth and zenith and predicts these numbers always

In [45]:
train_mean = np.mean(y1_train.values, axis=0)

In [46]:
train_mean

array([3.33623343, 1.52299244])

In [47]:
baseline_pred_az = train_mean[0]*np.ones(len(y1_test))
baseline_pred_ze = train_mean[1]*np.ones(len(y1_test))

In [48]:
baseline_error = angular_dist_score(y1_test['az'].values,
                                    y1_test['ze'].values,
                                    baseline_pred_az,
                                    baseline_pred_ze)

print("The angular distance score for the baseline model is", baseline_error)

The angular distance score for the baseline model is 1.5288671288320204


So the baseline model is accurate up to basically $\pi/2$ radians, i.e. statistically speaking the true direction will be in the same hemisphere that the baseline model guesses.

### Model 1

This model just puts a 1 in a sensor where it went off for that event, and a 0 in the sensors that didn't go off

In [49]:
az_pred_1 = slr1.predict(X1_test.values)[:,0]
ze_pred_1 = slr1.predict(X1_test.values)[:,1]

model_1_error = angular_dist_score(y1_test['az'].values,
                                   y1_test['ze'].values,
                                   az_pred_1,
                                   ze_pred_1)

print("The angular distance score for model 1 is", model_1_error)

The angular distance score for model 1 is 1.5847234132004213


### Model 2

This model includes aux = True

In [50]:
df2_train = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

In [51]:
# Now we generate feature data from the raw data
# We use aux = True as well here
count = 0

for index in train_indices[:1000]:
    event = batch1.loc[index]

    sensors = event.sensor_id.values

    for sensor in sensors:
        df2_train.loc[index,sensor] = 1
    
    az, ze = meta.loc[index,['azimuth','zenith']].values
    df2_train.loc[index,'az'] = az
    df2_train.loc[index,'ze'] = ze

    count = count + 1
    if count % 100 == 0:
        print("Working on event", count)
        
df2_train.fillna(0, inplace=True)

Working on event 100
Working on event 200
Working on event 300
Working on event 400
Working on event 500
Working on event 600
Working on event 700
Working on event 800
Working on event 900
Working on event 1000


In [52]:
X2_train = df2_train[range(0,num_sensors)]
y2_train = df2_train[['az','ze']]

In [54]:
slr2 = LinearRegression(copy_X=True)
slr2.fit(X2_train, y2_train)

In [55]:
# Now we set up the testing data
df2_test = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

count = 0

for index in test_indices[:300]:
    event = batch1.loc[index]

    sensors = event.sensor_id.values

    for sensor in sensors:
        df2_test.loc[index,sensor] = 1
    
    az, ze = meta.loc[index,['azimuth','zenith']].values
    df2_test.loc[index,'az'] = az
    df2_test.loc[index,'ze'] = ze

    count = count + 1
    if count % 100 == 0:
        print("Working on event", count)

df2_test.fillna(0, inplace=True)

X2_test = df2_test[range(0,num_sensors)]
y2_test = df2_test[['az','ze']]

Working on event 100
Working on event 200
Working on event 300


In [56]:
az_pred_2 = slr2.predict(X2_test.values)[:,0]
ze_pred_2 = slr2.predict(X2_test.values)[:,1]

model_2_error = angular_dist_score(y2_test['az'].values,
                                   y2_test['ze'].values,
                                   az_pred_2,
                                   ze_pred_2)

print("The angular distance score for model 2 is", model_2_error)

The angular distance score for model 2 is 1.5805230532332462


This improves very slightly over model 1

### Model 3: charge and aux = False only

In [57]:
df3_train = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

# Now we generate feature data from the raw data
# We use aux = False only here
count = 0

batch1_model_3 = batch1[batch1.auxiliary==False]

for index in train_indices[:1000]:
    event = batch1_model_3.loc[index]
    event.set_index('sensor_id',
                    inplace=True)
    event = event.groupby('sensor_id').sum()

    for sensor in event.index:
        df3_train.loc[index,sensor] = event.loc[sensor,'charge']
    
    az, ze = meta.loc[index,['azimuth','zenith']].values
    df3_train.loc[index,'az'] = az
    df3_train.loc[index,'ze'] = ze

    count = count + 1
    if count % 10 == 0:
        print("Working on event", count)
        
df3_train.fillna(0, inplace=True)

Working on event 10
Working on event 20
Working on event 30
Working on event 40
Working on event 50
Working on event 60
Working on event 70
Working on event 80
Working on event 90
Working on event 100
Working on event 110
Working on event 120
Working on event 130
Working on event 140
Working on event 150
Working on event 160
Working on event 170
Working on event 180
Working on event 190
Working on event 200
Working on event 210
Working on event 220
Working on event 230
Working on event 240
Working on event 250
Working on event 260
Working on event 270
Working on event 280
Working on event 290
Working on event 300
Working on event 310
Working on event 320
Working on event 330
Working on event 340
Working on event 350
Working on event 360
Working on event 370
Working on event 380
Working on event 390
Working on event 400
Working on event 410
Working on event 420
Working on event 430
Working on event 440
Working on event 450
Working on event 460
Working on event 470
Working on event 480
W

In [58]:
df3_test = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

# Now we generate feature data from the raw data
# We use aux = False only here
count = 0

for index in test_indices[:300]:
    event = batch1_model_3.loc[index]
    event.set_index('sensor_id',
                    inplace=True)
    event = event.groupby('sensor_id').sum()

    for sensor in event.index:
        df3_test.loc[index,sensor] = event.loc[sensor,'charge']
    
    az, ze = meta.loc[index,['azimuth','zenith']].values
    df3_test.loc[index,'az'] = az
    df3_test.loc[index,'ze'] = ze

    count = count + 1
    if count % 10 == 0:
        print("Working on event", count)
        
df3_test.fillna(0, inplace=True)

Working on event 10
Working on event 20
Working on event 30
Working on event 40
Working on event 50
Working on event 60
Working on event 70
Working on event 80
Working on event 90
Working on event 100
Working on event 110
Working on event 120
Working on event 130
Working on event 140
Working on event 150
Working on event 160
Working on event 170
Working on event 180
Working on event 190
Working on event 200
Working on event 210
Working on event 220
Working on event 230
Working on event 240
Working on event 250
Working on event 260
Working on event 270
Working on event 280
Working on event 290
Working on event 300


In [59]:
X3_train = df3_train[range(0,num_sensors)]
y3_train = df3_train[['az','ze']]

X3_test = df3_test[range(0,num_sensors)]
y3_test =df3_test[['az','ze']]

In [60]:
# Fit the regression
slr3 = LinearRegression(copy_X=True)
slr3.fit(X3_train, y3_train)

In [61]:
az_pred_3 = slr3.predict(X3_test.values)[:,0]
ze_pred_3 = slr3.predict(X3_test.values)[:,1]

model_3_error = angular_dist_score(y3_test['az'].values,
                                   y3_test['ze'].values,
                                   az_pred_3,
                                   ze_pred_3)

print("The angular distance score for model 3 is", model_3_error)

The angular distance score for model 3 is 1.5453663141818437


### Model 4: charge and aux = True

In [None]:
df4_train = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

# Now we generate feature data from the raw data
# We use aux = True as well here
count = 0

for index in train_indices[:1000]:
    event = batch1.loc[index]
    event.set_index('sensor_id',
                    inplace=True)
    event = event.groupby('sensor_id').sum()

    for sensor in event.index:
        df4_train.loc[index,sensor] = event.loc[sensor,'charge']
    
    az, ze = meta.loc[index,['azimuth','zenith']].values
    df4_train.loc[index,'az'] = az
    df4_train.loc[index,'ze'] = ze

    count = count + 1
    if count % 10 == 0:
        print("Working on event", count)
        
df4_train.fillna(0, inplace=True)

In [None]:
df4_test = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

# Now we generate feature data from the raw data
# We use aux = False only here
count = 0

for index in test_indices[:300]:
    event = batch1.loc[index]
    event.set_index('sensor_id',
                    inplace=True)
    event = event.groupby('sensor_id').sum()

    for sensor in event.index:
        df4_test.loc[index,sensor] = event.loc[sensor,'charge']
    
    az, ze = meta.loc[index,['azimuth','zenith']].values
    df4_test.loc[index,'az'] = az
    df4_test.loc[index,'ze'] = ze

    count = count + 1
    if count % 10 == 0:
        print("Working on event", count)
        
df4_test.fillna(0, inplace=True)

In [None]:
X4_train = df4_train[range(0,num_sensors)]
y4_train = df4_train[['az','ze']]

X4_test = df4_test[range(0,num_sensors)]
y4_test = df4_test[['az','ze']]

In [None]:
# Fitting the regression
slr4 = LinearRegression(copy_X=True)
slr4.fit(X4_train, y4_train)

In [None]:
az_pred_4 = slr4.predict(X4_test.values)[:,0]
ze_pred_4 = slr4.predict(X4_test.values)[:,1]

model_4_error = angular_dist_score(y4_test['az'].values,
                                   y4_test['ze'].values,
                                   az_pred_4,
                                   ze_pred_4)

print("The angular distance score for model 4 is", model_4_error)

### Model 5 - a CNN before I know what they actually are, so this might go badly

In [None]:
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics
from keras.utils.np_utils import to_categorical
