In [1]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

In [2]:
def angular_dist_score(az_true, zen_true, az_pred, zen_pred):
    '''
    calculate the MAE of the angular distance between two directions.
    The two vectors are first converted to cartesian unit vectors,
    and then their scalar product is computed, which is equal to
    the cosine of the angle between the two vectors. The inverse 
    cosine (arccos) thereof is then the angle between the two input vectors
    
    Parameters:
    -----------
    
    az_true : float (or array thereof)
        true azimuth value(s) in radian
    zen_true : float (or array thereof)
        true zenith value(s) in radian
    az_pred : float (or array thereof)
        predicted azimuth value(s) in radian
    zen_pred : float (or array thereof)
        predicted zenith value(s) in radian
    
    Returns:
    --------
    
    dist : float
        mean over the angular distance(s) in radian
    '''
    
    if not (np.all(np.isfinite(az_true)) and
            np.all(np.isfinite(zen_true)) and
            np.all(np.isfinite(az_pred)) and
            np.all(np.isfinite(zen_pred))):
        raise ValueError("All arguments must be finite")
    
    # pre-compute all sine and cosine values
    sa1 = np.sin(az_true)
    ca1 = np.cos(az_true)
    sz1 = np.sin(zen_true)
    cz1 = np.cos(zen_true)
    
    sa2 = np.sin(az_pred)
    ca2 = np.cos(az_pred)
    sz2 = np.sin(zen_pred)
    cz2 = np.cos(zen_pred)
    
    # scalar product of the two cartesian vectors (x = sz*ca, y = sz*sa, z = cz)
    scalar_prod = sz1*sz2*(ca1*ca2 + sa1*sa2) + (cz1*cz2)
    
    # scalar product of two unit vectors is always between -1 and 1, this is against nummerical instability
    # that might otherwise occure from the finite precision of the sine and cosine functions
    scalar_prod =  np.clip(scalar_prod, -1, 1)
    
    # convert back to an angle (in radian)
    return np.average(np.abs(np.arccos(scalar_prod)))

In [3]:
# Importing data
batch1 = pd.read_parquet('../batches_train/batch_1_repartitions/part.0.parquet')
sensor_geom = pd.read_csv('../sensor_geometry.csv')

In [4]:
# Importing meta data
meta = pd.read_parquet('../batches_train/train_meta.parquet')

In [5]:
event_ids = list(set(batch1.index))

In [6]:
meta_1_0 = meta[meta.event_id.isin(event_ids)]

In [7]:
meta_1_0.to_parquet('../batches_train/meta_1_0.parquet')

In [8]:
meta = pd.read_parquet('../batches_train/meta_1_0.parquet')

In [9]:
meta

Unnamed: 0,batch_id,event_id,first_pulse_index,last_pulse_index,azimuth,zenith
0,1,24,0,60,5.029555,2.087498
1,1,41,61,111,0.417742,1.549686
2,1,59,112,147,1.160466,2.401942
3,1,67,148,289,5.845952,0.759054
4,1,72,290,351,0.653719,0.939117
...,...,...,...,...,...,...
69360,1,1129136,10924325,10924401,0.248121,1.680275
69361,1,1129140,10924402,10924463,3.103800,1.914090
69362,1,1129144,10924464,10924502,5.134968,2.632402
69363,1,1129153,10924503,10924549,2.563419,1.626596


In [6]:
# Part of meta that we need
meta_batch1 = meta[meta.event_id.isin(event_ids)]
meta_batch1.set_index('event_id',
                      inplace=True)
meta_batch1.drop(['batch_id', 'first_pulse_index', 'last_pulse_index'], 
                 axis=1,
                 inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_batch1.drop(['batch_id', 'first_pulse_index', 'last_pulse_index'],


In [21]:
meta_batch1.head()

Unnamed: 0_level_0,azimuth,zenith
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1
24,5.029555,2.087498
41,0.417742,1.549686
59,1.160466,2.401942
67,5.845952,0.759054
72,0.653719,0.939117


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_indices, test_indices = train_test_split(event_ids,
                                               shuffle=True,
                                               random_state=123,
                                               test_size=0.25)

In [9]:
num_sensors = 5160

In [10]:
df_train = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

In [11]:
# Now we generate feature data from the raw data
# We use aux = False only here
count = 0

batch1_no_aux = batch1[batch1.auxiliary==False]

for index in train_indices[:1000]:
    event = batch1_no_aux.loc[index]

    sensors = event.sensor_id.values

    for sensor in sensors:
        df_train.loc[index,sensor] = 1
    
    az, ze = meta_batch1.loc[index,['azimuth','zenith']].values
    df_train.loc[index,'az'] = az
    df_train.loc[index,'ze'] = ze

    count = count + 1
    if count % 100 == 0:
        print("Working on event", count)

Working on event 100
Working on event 200
Working on event 300
Working on event 400
Working on event 500
Working on event 600
Working on event 700
Working on event 800
Working on event 900
Working on event 1000


In [77]:
df_train.fillna(0, inplace=True)

In [78]:
# This is the DataFrame that represents our training data
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5152,5153,5154,5155,5156,5157,5158,5159,az,ze
1023496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2.248502,0.432784
681492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.299984,2.809545
38448,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.291643,0.989836
892244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.758019,2.730688
717306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.283212,0.080609


In [79]:
slr1 = LinearRegression(copy_X=True)

In [80]:
Xs_train = df_train[range(0,num_sensors)]
ys_train = df_train[['az','ze']]

In [81]:
slr1.fit(Xs_train.values, ys_train.values)

In [82]:
slr1.coef_.shape

(2, 5160)

In [83]:
# Now we set up the testing data
df_test = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

count = 0

batch1_no_aux = batch1[batch1.auxiliary==False]

for index in test_indices[:300]:
    event = batch1_no_aux.loc[index]

    sensors = event.sensor_id.values

    for sensor in sensors:
        df_test.loc[index,sensor] = 1
    
    az, ze = meta_batch1.loc[index,['azimuth','zenith']].values
    df_test.loc[index,'az'] = az
    df_test.loc[index,'ze'] = ze

    count = count + 1
    if count % 100 == 0:
        print("Working on event", count)

df_test.fillna(0, inplace=True)

Working on event 100
Working on event 200
Working on event 300


In [84]:
Xs_test = df_test[range(0,num_sensors)]
ys_test = df_test[['az','ze']]

In [21]:
slr1.predict(Xs_test)

NameError: name 'slr1' is not defined

### Baseline model

This model just "trains" by computing the mean of the output azimuth and zenith and predicts these numbers always

In [89]:
train_mean = np.mean(ys_train.values, axis=0)

In [90]:
train_mean

array([3.33623343, 1.52299244])

In [95]:
baseline_pred_az = train_mean[0]*np.ones(len(ys_test))
baseline_pred_ze = train_mean[1]*np.ones(len(ys_test))

In [100]:
baseline_error = angular_dist_score(ys_test['az'].values,
                                    ys_test['ze'].values,
                                    baseline_pred_az,
                                    baseline_pred_ze)

print("The angular distance score for the baseline model is", baseline_error)

The angular distance score for the baseline model is 1.5288671288320204


So the baseline model is accurate up to basically $\pi/2$ radians, i.e. statistically speaking the true direction will be in the same hemisphere that the baseline model guesses.

### Model 1

This model just puts a 1 in a sensor where it went off for that event, and a 0 in the sensors that didn't go off

In [103]:
az_pred_1 = slr1.predict(Xs_test.values)[:,0]
ze_pred_1 = slr1.predict(Xs_test.values)[:,1]

model_1_error = angular_dist_score(ys_test['az'].values,
                                   ys_test['ze'].values,
                                   az_pred_1,
                                   ze_pred_1)

print("The angular distance score for model 1 is", model_1_error)

The angular distance score for model 1 is 1.5847234132004213


### Model 2

This model includes aux = True

In [13]:
df_train = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

In [14]:
# Now we generate feature data from the raw data
# We use aux = True as well here
count = 0

for index in train_indices[:1000]:
    event = batch1.loc[index]

    sensors = event.sensor_id.values

    for sensor in sensors:
        df_train.loc[index,sensor] = 1
    
    az, ze = meta_batch1.loc[index,['azimuth','zenith']].values
    df_train.loc[index,'az'] = az
    df_train.loc[index,'ze'] = ze

    count = count + 1
    if count % 100 == 0:
        print("Working on event", count)

Working on event 100
Working on event 200
Working on event 300
Working on event 400
Working on event 500
Working on event 600
Working on event 700
Working on event 800
Working on event 900
Working on event 1000


In [17]:
df_train.fillna(0, inplace=True)
Xs_train = df_train[range(0,num_sensors)]
ys_train = df_train[['az','ze']]

In [18]:
slr2 = LinearRegression(copy_X=True)
slr2.fit(Xs_train, ys_train)

In [19]:
# Now we set up the testing data
df_test = pd.DataFrame(columns=[i for i in range(0, num_sensors)]+['az','ze'])

count = 0

for index in test_indices[:300]:
    event = batch1.loc[index]

    sensors = event.sensor_id.values

    for sensor in sensors:
        df_test.loc[index,sensor] = 1
    
    az, ze = meta_batch1.loc[index,['azimuth','zenith']].values
    df_test.loc[index,'az'] = az
    df_test.loc[index,'ze'] = ze

    count = count + 1
    if count % 100 == 0:
        print("Working on event", count)

df_test.fillna(0, inplace=True)

Xs_test = df_test[range(0,num_sensors)]
ys_test = df_test[['az','ze']]

Working on event 100
Working on event 200
Working on event 300


In [20]:
slr2.predict(Xs_test)

array([[ 3.7584406 ,  1.38591386],
       [ 0.93770954,  1.10396083],
       [ 4.75752019,  0.91810188],
       [ 4.26397059,  1.08561153],
       [ 4.79136348,  1.72646885],
       [ 2.80571476,  1.6763304 ],
       [ 3.19038151,  1.13495321],
       [ 2.44321865,  2.10253234],
       [ 3.80568447,  1.75455573],
       [ 3.35174696,  1.70141403],
       [ 3.375917  ,  1.41960598],
       [ 0.1256769 ,  1.23073215],
       [ 2.41610337,  1.51902429],
       [ 4.22550394,  2.09272606],
       [ 4.58881738,  0.94026285],
       [ 2.33853783,  1.69719775],
       [ 5.04076333,  1.97699137],
       [ 4.9628013 ,  1.98930894],
       [ 4.48895259,  0.9515598 ],
       [ 3.88960851,  1.62627877],
       [ 4.23603269,  1.4198982 ],
       [ 3.80046778,  1.66372024],
       [ 3.5151331 ,  1.45436096],
       [ 3.08917254,  1.3391126 ],
       [ 2.38480371,  1.90618564],
       [ 3.05241363,  1.0169507 ],
       [ 2.55503065,  2.40361124],
       [ 3.82013862,  1.44393475],
       [ 4.83033551,