In [2]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

In [3]:
def angular_dist_score(az_true, zen_true, az_pred, zen_pred):
    '''
    Mean angular separation between true and predicted directions.

    Each (azimuth, zenith) pair is treated as a unit vector in cartesian
    coordinates. The dot product of the true and predicted unit vectors
    is the cosine of the angle between them, so taking the arccos gives
    the angle itself; the mean of those angles is returned.

    Parameters:
    -----------

    az_true : float (or array thereof)
        true azimuth value(s) in radian
    zen_true : float (or array thereof)
        true zenith value(s) in radian
    az_pred : float (or array thereof)
        predicted azimuth value(s) in radian
    zen_pred : float (or array thereof)
        predicted zenith value(s) in radian

    Returns:
    --------

    dist : float
        mean over the angular distance(s) in radian

    Raises:
    -------

    ValueError
        if any of the inputs contains a non-finite value
    '''

    # reject NaN / inf up front, checking the arguments in order
    inputs = (az_true, zen_true, az_pred, zen_pred)
    if not all(np.all(np.isfinite(values)) for values in inputs):
        raise ValueError("All arguments must be finite")

    # trigonometric terms of the true direction
    sin_az_t, cos_az_t = np.sin(az_true), np.cos(az_true)
    sin_zen_t, cos_zen_t = np.sin(zen_true), np.cos(zen_true)

    # trigonometric terms of the predicted direction
    sin_az_p, cos_az_p = np.sin(az_pred), np.cos(az_pred)
    sin_zen_p, cos_zen_p = np.sin(zen_pred), np.cos(zen_pred)

    # dot product of the cartesian unit vectors (x = sz*ca, y = sz*sa, z = cz)
    cos_angle = sin_zen_t*sin_zen_p*(cos_az_t*cos_az_p + sin_az_t*sin_az_p) + (cos_zen_t*cos_zen_p)

    # the dot product of two unit vectors lies in [-1, 1]; clipping guards
    # against numerical noise from the finite precision of sin/cos that
    # could otherwise push the value slightly outside arccos' domain
    cos_angle = np.clip(cos_angle, -1, 1)

    # convert back to an angle (in radian) and average
    return np.average(np.abs(np.arccos(cos_angle)))

In [4]:
# Load the raw inputs: one repartitioned pulse file, the sensor geometry
# table and the event-level metadata
batch1 = pd.read_parquet('../batches_train/batch_1_repartitions/part.0.parquet')
sensor_geom = pd.read_csv('../sensor_geometry.csv')

# Index the metadata by event_id and drop the bookkeeping columns that are
# not used further down
meta = (
    pd.read_parquet('../batches_train/meta_1_0.parquet')
    .set_index('event_id')
    .drop(columns=['batch_id', 'first_pulse_index', 'last_pulse_index'])
)

# Distinct index values of the pulse table (one entry per event)
event_ids = list(set(batch1.index))

# Width of the per-sensor feature block (columns 0 .. num_sensors - 1)
num_sensors = 5160

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Shuffle the event ids and hold out 25% of them for evaluation;
# the fixed random_state keeps the split reproducible across runs
train_indices, test_indices = train_test_split(
    event_ids,
    test_size=0.25,
    random_state=123,
    shuffle=True,
)

In [7]:
# Map a sensor_id to its (x, y, z) position
def id_to_xyz(sen):
    """Return the position of sensor `sen` as a 3-tuple.

    The row labelled `sen` is looked up in the module-level `sensor_geom`
    table and the values at column positions 1, 2 and 3 are returned.

    NOTE(review): this assumes the row labels of `sensor_geom` coincide with
    the sensor ids and that columns 1-3 hold x, y, z -- confirm against the
    geometry file.
    """
    return tuple(sensor_geom.loc[sen].iloc[1:4])

### Model 5: including clustering

In [18]:
# Empty training table: one column per sensor index, then the cluster count
# and the two target angles
df1_train = pd.DataFrame(columns=[*range(num_sensors), 'num_clust', 'az', 'ze'])

In [19]:
import scipy.cluster.hierarchy as hcluster
threshhold = 150  # distance cut-off handed to fclusterdata (criterion='distance')

# Now we generate feature data from the raw data
# We use aux = False only here
count = 0

batch1_no_aux = batch1[batch1.auxiliary==False]

# Column layout of the feature table: one column per sensor index,
# then the cluster count and the two target angles
train_columns = [i for i in range(0, num_sensors)]+['num_clust']+['az','ze']

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end. Writing single cells into a growing DataFrame is far slower, and
# building the table from scratch makes this cell safe to re-run.
train_event_ids = []
train_records = []

for index in train_indices[:1000]:
    # Selecting with a list always yields a DataFrame, even when the event
    # has a single pulse (a scalar label would give a Series in that case)
    event = batch1_no_aux.loc[[index]]

    # Total charge seen by each sensor in this event
    charge_per_sensor = event.groupby('sensor_id')['charge'].sum()
    sensors = charge_per_sensor.index

    record = charge_per_sensor.to_dict()

    az, ze = meta.loc[index,['azimuth','zenith']].values
    record['az'] = az
    record['ze'] = ze

    # Clustering of the hit sensors' positions
    raw_data = [id_to_xyz(sen) for sen in sensors]
    if len(raw_data) > 1:
        clusters = hcluster.fclusterdata(raw_data,threshhold,criterion='distance')
        num_clusters = len(set(clusters))
    else:
        # Hierarchical clustering needs at least two observations;
        # a single hit sensor is trivially one cluster
        num_clusters = 1
    record['num_clust'] = num_clusters

    train_event_ids.append(index)
    train_records.append(record)

    count = count + 1
    if count % 10 == 0:
        print("Working on event", count)

# Sensors without a hit get charge 0; the explicit float cast guarantees a
# numeric table for the model and the scoring function
df1_train = pd.DataFrame(train_records, index=train_event_ids, columns=train_columns)
df1_train = df1_train.fillna(0).astype(float)

Working on event 10
Working on event 20
Working on event 30
Working on event 40
Working on event 50
Working on event 60
Working on event 70
Working on event 80
Working on event 90
Working on event 100
Working on event 110
Working on event 120
Working on event 130
Working on event 140
Working on event 150
Working on event 160
Working on event 170
Working on event 180
Working on event 190
Working on event 200
Working on event 210
Working on event 220
Working on event 230
Working on event 240
Working on event 250
Working on event 260
Working on event 270
Working on event 280
Working on event 290
Working on event 300
Working on event 310
Working on event 320
Working on event 330
Working on event 340
Working on event 350
Working on event 360
Working on event 370
Working on event 380
Working on event 390
Working on event 400
Working on event 410
Working on event 420
Working on event 430
Working on event 440
Working on event 450
Working on event 460
Working on event 470
Working on event 480
W

In [20]:
# Now we generate test data from the raw data
# We use aux = False only here

# Column layout of the feature table: one column per sensor index,
# then the cluster count and the two target angles
test_columns = [i for i in range(0, num_sensors)]+['num_clust']+['az','ze']

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end. Writing single cells into a growing DataFrame is far slower, and
# building the table from scratch makes this cell safe to re-run.
test_event_ids = []
test_records = []

count = 0

for index in test_indices[:300]:
    # Selecting with a list always yields a DataFrame, even when the event
    # has a single pulse (a scalar label would give a Series in that case)
    event = batch1_no_aux.loc[[index]]

    # Total charge seen by each sensor in this event
    charge_per_sensor = event.groupby('sensor_id')['charge'].sum()
    sensors = charge_per_sensor.index

    record = charge_per_sensor.to_dict()

    az, ze = meta.loc[index,['azimuth','zenith']].values
    record['az'] = az
    record['ze'] = ze

    # Clustering of the hit sensors' positions
    raw_data = [id_to_xyz(sen) for sen in sensors]
    if len(raw_data) > 1:
        clusters = hcluster.fclusterdata(raw_data,threshhold,criterion='distance')
        num_clusters = len(set(clusters))
    else:
        # Hierarchical clustering needs at least two observations;
        # a single hit sensor is trivially one cluster
        num_clusters = 1
    record['num_clust'] = num_clusters

    test_event_ids.append(index)
    test_records.append(record)

    count = count + 1
    if count % 10 == 0:
        print("Working on event", count)

# Sensors without a hit get charge 0; the explicit float cast guarantees a
# numeric table for the model and the scoring function
df1_test = pd.DataFrame(test_records, index=test_event_ids, columns=test_columns)
df1_test = df1_test.fillna(0).astype(float)

Working on event 10
Working on event 20
Working on event 30
Working on event 40
Working on event 50
Working on event 60
Working on event 70
Working on event 80
Working on event 90
Working on event 100
Working on event 110
Working on event 120
Working on event 130
Working on event 140
Working on event 150
Working on event 160
Working on event 170
Working on event 180
Working on event 190
Working on event 200
Working on event 210
Working on event 220
Working on event 230
Working on event 240
Working on event 250
Working on event 260
Working on event 270
Working on event 280
Working on event 290
Working on event 300


In [38]:
# Features: the per-sensor columns plus the cluster count.
# Targets: the two angle columns.
feature_cols = [*range(num_sensors), 'num_clust']
target_cols = ['az', 'ze']

X5_train, y5_train = df1_train[feature_cols], df1_train[target_cols]
X5_test, y5_test = df1_test[feature_cols], df1_test[target_cols]

In [40]:
# Model 5: one linear regression predicting both angles (az, ze) jointly
# from the per-sensor columns and the cluster count
slr5 = LinearRegression(copy_X=True)

slr5.fit(X=X5_train.to_numpy(), y=y5_train.to_numpy())

In [41]:
# Run the model once on the test features and split the two output columns.
# Column order follows the ['az','ze'] target order used when fitting.
pred_5 = slr5.predict(X5_test.values)
az_pred_5 = pred_5[:,0]
ze_pred_5 = pred_5[:,1]

# Mean angular distance (radian) between the true and predicted directions
model_5_error = angular_dist_score(y5_test['az'],
                                   y5_test['ze'],
                                   az_pred_5,
                                   ze_pred_5)

print("The angular distance score for model 5 is", model_5_error)

The angular distance score for model 5 is 1.5533778231049427
