In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
## Read Data
batch1 = pd.read_parquet("C:/Users/k_vsl/Documents/Erdos/IceCubeData/batch_1.parquet/batch_1.parquet")
directions = pd.read_parquet("C:/Users/k_vsl/Documents/Erdos/IceCubeData/train_meta.parquet/train_meta.parquet")
sensors = pd.read_csv("C:/Users/k_vsl/Documents/Erdos/IceCubeData/sensor_geometry.csv")

In [38]:
batch1.head()

Unnamed: 0_level_0,sensor_id,time,charge,auxiliary
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24,3918,5928,1.325,True
24,4157,6115,1.175,True
24,3520,6492,0.925,True
24,5041,6665,0.225,True
24,2948,8054,1.575,True


In [3]:
# Modelling approach as detailed in Ice Cube Papers
# r = r_0 + vt
# Get a formula for v as 
# v = (<r_it_i> - <r_i><t_i>)/(<t_i^2> - <t_i>^2)

def get_velocity(event, geometry):
    
    # Isolate the detections we want to use
    # detections = (batch.loc[event_id])
    # if aux_incl == False: 
    #    detections = detections[detections["auxiliary"] == False]
    stats = event.reset_index()
    
    time0 = stats["time"].iloc[0]
    stats["t"] = stats.apply(lambda df: df.time - time0, axis = 1)
    
    n = len(event)
    
    for i in range(0,n):
        id = stats.loc[i, "sensor_id"]
        t = stats.loc[i, "t"]
        stats.loc[i, "x"] = geometry.iloc[id].x
        stats.loc[i, "y"] = geometry.iloc[id].y
        stats.loc[i, "z"] = geometry.iloc[id].z
        stats.loc[i, "xt"] = stats.loc[i, "x"] * t
        stats.loc[i, "yt"] = stats.loc[i, "y"] * t
        stats.loc[i, "zt"] = stats.loc[i, "z"] * t
        stats.loc[i, "t2"] = t^2
    
    # Preliminary Averages
    means = stats.mean(axis=0)
    
    # Compute Velocity
    denom = (means.t2 - np.square(means.t))
    vx = (means.xt - means.x*means.t)/denom
    vy = (means.yt - means.y*means.t)/denom
    vz = (means.zt - means.z*means.t)/denom
    
    # return stats
    return (vx,vy,vz)

In [98]:
event24 = event24 = batch1.loc[24]
event24 = event24[event24.auxiliary == False]
m = get_velocity(event24, sensors)
m

(-0.00025147607860548393, 0.007345699184711872, -0.014367654891710787)

In [4]:
# Function giving azimuth and zenith based on velocity
def get_angle(v): 
    mod2 = np.square(v[0]) + np.square(v[1]) + np.square(v[2])
    if (mod2 == 0):
        mod = 1
    else: 
        mod = np.sqrt(mod2)
    x = v[0] / mod
    y = v[1] / mod
    z = v[2] / mod
    ze = np.arccos(z)
    az = np.arctan2(y,x)
    return (az,ze)

In [99]:
d = get_angle(m)
d

(1.6050174265309547, 2.6687351836307927)

In [5]:
# Function to compute Mean Angular error
def get_mae(az_true, zen_true, az_pred, zen_pred): 
    if not (np.all(np.isfinite(az_true)) and
            np.all(np.isfinite(zen_true)) and
            np.all(np.isfinite(az_pred)) and
            np.all(np.isfinite(zen_pred))):
        raise ValueError("All arguments must be finite")
    
    # pre-compute all sine and cosine values
    sa1 = np.sin(az_true)
    ca1 = np.cos(az_true)
    sz1 = np.sin(zen_true)
    cz1 = np.cos(zen_true)
    
    sa2 = np.sin(az_pred)
    ca2 = np.cos(az_pred)
    sz2 = np.sin(zen_pred)
    cz2 = np.cos(zen_pred)
    
    # scalar product of the two cartesian vectors (x = sz*ca, y = sz*sa, z = cz)
    scalar_prod = sz1*sz2*(ca1*ca2 + sa1*sa2) + (cz1*cz2)
    
    # scalar product of two unit vectors is always between -1 and 1, this is against nummerical instability
    # that might otherwise occure from the finite precision of the sine and cosine functions
    scalar_prod =  np.clip(scalar_prod, -1, 1)
    
    # convert back to an angle (in radian)
    return np.average(np.abs(np.arccos(scalar_prod)))

In [100]:
az_true, zen_true = directions.loc[0, ["azimuth", "zenith"]]
print(az_true, zen_true)

5.029554633390068 2.0874975005610428


In [101]:
get_mae(az_true, zen_true, d[0], d[1])

1.5111863466562705

In [6]:
def find_event_error(event_id, event, geometry, directions):
    # Get velocity for event
    # print("Event ID: " + str(event_id))
    v = get_velocity(event, geometry)
    (az_pred, ze_pred) = get_angle(v)
    # print("The azimuth predicted is " + str(az_pred) + " and the zenith predicted is " + str(ze_pred))
    df = directions[directions.event_id == event_id]
    ze = df.iloc[0].zenith
    az = df.iloc[0].azimuth
    # print("The azimuth is " + str(az) + " and the zenith is " + str(ze))
    mae = get_mae(az, ze, az_pred, ze_pred)
    # print("The MAE is " + str(mae))
    
    
    return (v[0], v[1], v[2], az_pred, ze_pred,az, ze, mae)

In [102]:
error = find_event_error(24, event24, sensors, directions)
print(error)

(-0.00025147607860548393, 0.007345699184711872, -0.014367654891710787, 1.6050174265309547, 2.6687351836307927, 1.5111863466562705)


In [7]:
# Package functions together
def find_error(aux_incl, batch, geometry, directions): 
    if aux_incl == False: 
        batch_aux = batch[batch.auxiliary==False]
    else: 
        batch_aux = batch
    event_ids = list(set(batch.index))
    n = len(event_ids)
    print(n)
    data = pd.DataFrame(event_ids, columns = ["event_id"])
    for i in range(0, 50000): 
        event_id = event_ids[i]
        event = batch_aux.loc[event_id]
        stats = find_event_error(event_id, event, geometry, directions)
        data.loc[i, "vx"] = stats[0]
        data.loc[i, "vy"] = stats[1]
        data.loc[i, "vz"] = stats[2]
        data.loc[i, "az_pred"] = stats[3]
        data.loc[i, "ze_pred"] = stats[4]
        data.loc[i, "az_true"] = stats[5]
        data.loc[i, "ze_true"] = stats[6]
        data.loc[i, "mae"] = stats[7]
        if (i % 100 == 0): 
            print("Testing complete for " + str(i) + " events")
    
    return data

In [318]:
event24 = batch1.loc[24]

In [319]:
event24.head()

Unnamed: 0_level_0,sensor_id,time,charge,auxiliary
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24,3918,5928,1.325,True
24,4157,6115,1.175,True
24,3520,6492,0.925,True
24,5041,6665,0.225,True
24,2948,8054,1.575,True


In [110]:
data24 = find_error(False, event24, sensors, directions)
data24.head()

1
Testing complete for 0 events


Unnamed: 0,event_id,vx,vy,vz,az_pred,ze_pred,az_true,ze_true,mae
0,24,-0.000251,0.007346,-0.014368,1.605017,2.668735,5.029555,2.087498,1.511186


In [111]:
event2441 = batch1.loc[24:41]
event2441.head(100)

Unnamed: 0_level_0,sensor_id,time,charge,auxiliary
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24,3918,5928,1.325,True
24,4157,6115,1.175,True
24,3520,6492,0.925,True
24,5041,6665,0.225,True
24,2948,8054,1.575,True
...,...,...,...,...
41,2214,13110,0.825,True
41,492,13177,0.725,True
41,919,13678,0.975,True
41,1151,13769,0.875,True


In [112]:
data = find_error(False, event2441, sensors, directions)
data.head()

2
Testing complete for 0 events


Unnamed: 0,event_id,vx,vy,vz,az_pred,ze_pred,az_true,ze_true,mae
0,24,-0.000251,0.007346,-0.014368,1.605017,2.668735,5.029555,2.087498,1.511186
1,41,0.001462,0.083666,0.07347,1.553328,0.850264,0.417742,1.549686,1.233782


In [285]:
event = 2573708
v = get_velocity(event, False, batch1, sensors)
print(v)
z = np.arctan2(0,0)
print(z)
y = np.arccos(0)
print(y)

(0.0, 0.0, 0.0)
0.0
1.5707963267948966


In [323]:
stats = find_event_error(2573708, False, batch1, sensors, directions)
stats

(0.0, 0.0, 0.0, 0.0, 1.5707963267948966, 1.6976580544973947)

In [118]:
data = find_error(False, batch1, sensors, directions)
data_to_csv = data.to_csv('best-fit.csv', index = True) 

200000
Testing complete for 0 events
Testing complete for 100 events
Testing complete for 200 events
Testing complete for 300 events
Testing complete for 400 events
Testing complete for 500 events
Testing complete for 600 events
Testing complete for 700 events
Testing complete for 800 events
Testing complete for 900 events
Testing complete for 1000 events
Testing complete for 1100 events
Testing complete for 1200 events
Testing complete for 1300 events
Testing complete for 1400 events
Testing complete for 1500 events
Testing complete for 1600 events
Testing complete for 1700 events
Testing complete for 1800 events
Testing complete for 1900 events
Testing complete for 2000 events
Testing complete for 2100 events
Testing complete for 2200 events
Testing complete for 2300 events
Testing complete for 2400 events
Testing complete for 2500 events
Testing complete for 2600 events
Testing complete for 2700 events
Testing complete for 2800 events
Testing complete for 2900 events
Testing complet

Testing complete for 24500 events
Testing complete for 24600 events
Testing complete for 24700 events
Testing complete for 24800 events
Testing complete for 24900 events
Testing complete for 25000 events
Testing complete for 25100 events
Testing complete for 25200 events
Testing complete for 25300 events
Testing complete for 25400 events
Testing complete for 25500 events
Testing complete for 25600 events
Testing complete for 25700 events
Testing complete for 25800 events
Testing complete for 25900 events
Testing complete for 26000 events
Testing complete for 26100 events
Testing complete for 26200 events
Testing complete for 26300 events
Testing complete for 26400 events
Testing complete for 26500 events
Testing complete for 26600 events
Testing complete for 26700 events
Testing complete for 26800 events
Testing complete for 26900 events
Testing complete for 27000 events
Testing complete for 27100 events
Testing complete for 27200 events
Testing complete for 27300 events
Testing comple

Testing complete for 48600 events
Testing complete for 48700 events
Testing complete for 48800 events
Testing complete for 48900 events
Testing complete for 49000 events
Testing complete for 49100 events
Testing complete for 49200 events
Testing complete for 49300 events
Testing complete for 49400 events
Testing complete for 49500 events
Testing complete for 49600 events
Testing complete for 49700 events
Testing complete for 49800 events
Testing complete for 49900 events


In [123]:
data.size


1800000

In [119]:
# Package functions together
def find_error(aux_incl, batch, geometry, directions): 
    if aux_incl == False: 
        batch_aux = batch[batch.auxiliary==False]
    else: 
        batch_aux = batch
    event_ids = list(set(batch.index))
    n = len(event_ids)
    print(n)
    data = pd.DataFrame(event_ids, columns = ["event_id"])
    for i in range(50000, 150000): 
        event_id = event_ids[i]
        event = batch_aux.loc[event_id]
        stats = find_event_error(event_id, event, geometry, directions)
        data.loc[i, "vx"] = stats[0]
        data.loc[i, "vy"] = stats[1]
        data.loc[i, "vz"] = stats[2]
        data.loc[i, "az_pred"] = stats[3]
        data.loc[i, "ze_pred"] = stats[4]
        data.loc[i, "az_true"] = stats[5]
        data.loc[i, "ze_true"] = stats[6]
        data.loc[i, "mae"] = stats[7]
        if (i % 100 == 0): 
            print("Testing complete for " + str(i) + " events")
    
    return data

In [120]:
data2 = find_error(False, batch1, sensors, directions)
data_to_csv2 = data2.to_csv('best-fit2.csv', index = True) 

200000
Testing complete for 50000 events
Testing complete for 50100 events
Testing complete for 50200 events
Testing complete for 50300 events
Testing complete for 50400 events
Testing complete for 50500 events
Testing complete for 50600 events
Testing complete for 50700 events
Testing complete for 50800 events
Testing complete for 50900 events
Testing complete for 51000 events
Testing complete for 51100 events
Testing complete for 51200 events
Testing complete for 51300 events
Testing complete for 51400 events
Testing complete for 51500 events
Testing complete for 51600 events
Testing complete for 51700 events
Testing complete for 51800 events
Testing complete for 51900 events
Testing complete for 52000 events
Testing complete for 52100 events
Testing complete for 52200 events
Testing complete for 52300 events
Testing complete for 52400 events
Testing complete for 52500 events
Testing complete for 52600 events
Testing complete for 52700 events
Testing complete for 52800 events
Testing

Testing complete for 74100 events
Testing complete for 74200 events
Testing complete for 74300 events
Testing complete for 74400 events
Testing complete for 74500 events
Testing complete for 74600 events
Testing complete for 74700 events
Testing complete for 74800 events
Testing complete for 74900 events
Testing complete for 75000 events
Testing complete for 75100 events
Testing complete for 75200 events
Testing complete for 75300 events
Testing complete for 75400 events
Testing complete for 75500 events
Testing complete for 75600 events
Testing complete for 75700 events
Testing complete for 75800 events
Testing complete for 75900 events
Testing complete for 76000 events
Testing complete for 76100 events
Testing complete for 76200 events
Testing complete for 76300 events
Testing complete for 76400 events
Testing complete for 76500 events
Testing complete for 76600 events
Testing complete for 76700 events
Testing complete for 76800 events
Testing complete for 76900 events
Testing comple

Testing complete for 98200 events
Testing complete for 98300 events
Testing complete for 98400 events
Testing complete for 98500 events
Testing complete for 98600 events
Testing complete for 98700 events
Testing complete for 98800 events
Testing complete for 98900 events
Testing complete for 99000 events
Testing complete for 99100 events
Testing complete for 99200 events
Testing complete for 99300 events
Testing complete for 99400 events
Testing complete for 99500 events
Testing complete for 99600 events
Testing complete for 99700 events
Testing complete for 99800 events
Testing complete for 99900 events
Testing complete for 100000 events
Testing complete for 100100 events
Testing complete for 100200 events
Testing complete for 100300 events
Testing complete for 100400 events
Testing complete for 100500 events
Testing complete for 100600 events
Testing complete for 100700 events
Testing complete for 100800 events
Testing complete for 100900 events
Testing complete for 101000 events
Tes

Testing complete for 121700 events
Testing complete for 121800 events
Testing complete for 121900 events
Testing complete for 122000 events
Testing complete for 122100 events
Testing complete for 122200 events
Testing complete for 122300 events
Testing complete for 122400 events
Testing complete for 122500 events
Testing complete for 122600 events
Testing complete for 122700 events
Testing complete for 122800 events
Testing complete for 122900 events
Testing complete for 123000 events
Testing complete for 123100 events
Testing complete for 123200 events
Testing complete for 123300 events
Testing complete for 123400 events
Testing complete for 123500 events
Testing complete for 123600 events
Testing complete for 123700 events
Testing complete for 123800 events
Testing complete for 123900 events
Testing complete for 124000 events
Testing complete for 124100 events
Testing complete for 124200 events
Testing complete for 124300 events
Testing complete for 124400 events
Testing complete for

Testing complete for 145200 events
Testing complete for 145300 events
Testing complete for 145400 events
Testing complete for 145500 events
Testing complete for 145600 events
Testing complete for 145700 events
Testing complete for 145800 events
Testing complete for 145900 events
Testing complete for 146000 events
Testing complete for 146100 events
Testing complete for 146200 events
Testing complete for 146300 events
Testing complete for 146400 events
Testing complete for 146500 events
Testing complete for 146600 events
Testing complete for 146700 events
Testing complete for 146800 events
Testing complete for 146900 events
Testing complete for 147000 events
Testing complete for 147100 events
Testing complete for 147200 events
Testing complete for 147300 events
Testing complete for 147400 events
Testing complete for 147500 events
Testing complete for 147600 events
Testing complete for 147700 events
Testing complete for 147800 events
Testing complete for 147900 events
Testing complete for

In [124]:
data_to_csv2 = data2.to_csv('best-fit2.csv', index = True) 

In [121]:
# Package functions together
def find_error(aux_incl, batch, geometry, directions): 
    if aux_incl == False: 
        batch_aux = batch[batch.auxiliary==False]
    else: 
        batch_aux = batch
    event_ids = list(set(batch.index))
    n = len(event_ids)
    print(n)
    data = pd.DataFrame(event_ids, columns = ["event_id"])
    for i in range(150000, 180000): 
        event_id = event_ids[i]
        event = batch_aux.loc[event_id]
        stats = find_event_error(event_id, event, geometry, directions)
        data.loc[i, "vx"] = stats[0]
        data.loc[i, "vy"] = stats[1]
        data.loc[i, "vz"] = stats[2]
        data.loc[i, "az_pred"] = stats[3]
        data.loc[i, "ze_pred"] = stats[4]
        data.loc[i, "az_true"] = stats[5]
        data.loc[i, "ze_true"] = stats[6]
        data.loc[i, "mae"] = stats[7]
        if (i % 100 == 0): 
            print("Testing complete for " + str(i) + " events")
    
    return data

In [122]:
data3 = find_error(False, batch1, sensors, directions)
data_to_csv3 = data.to_csv('best-fit3.csv', index = True) 

200000
Testing complete for 150000 events
Testing complete for 150100 events
Testing complete for 150200 events
Testing complete for 150300 events
Testing complete for 150400 events
Testing complete for 150500 events
Testing complete for 150600 events
Testing complete for 150700 events
Testing complete for 150800 events
Testing complete for 150900 events
Testing complete for 151000 events
Testing complete for 151100 events
Testing complete for 151200 events
Testing complete for 151300 events
Testing complete for 151400 events
Testing complete for 151500 events
Testing complete for 151600 events
Testing complete for 151700 events
Testing complete for 151800 events
Testing complete for 151900 events
Testing complete for 152000 events
Testing complete for 152100 events
Testing complete for 152200 events
Testing complete for 152300 events
Testing complete for 152400 events
Testing complete for 152500 events
Testing complete for 152600 events
Testing complete for 152700 events
Testing compl

Testing complete for 173400 events
Testing complete for 173500 events
Testing complete for 173600 events
Testing complete for 173700 events
Testing complete for 173800 events
Testing complete for 173900 events
Testing complete for 174000 events
Testing complete for 174100 events
Testing complete for 174200 events
Testing complete for 174300 events
Testing complete for 174400 events
Testing complete for 174500 events
Testing complete for 174600 events
Testing complete for 174700 events
Testing complete for 174800 events
Testing complete for 174900 events
Testing complete for 175000 events
Testing complete for 175100 events
Testing complete for 175200 events
Testing complete for 175300 events
Testing complete for 175400 events
Testing complete for 175500 events
Testing complete for 175600 events
Testing complete for 175700 events
Testing complete for 175800 events
Testing complete for 175900 events
Testing complete for 176000 events
Testing complete for 176100 events
Testing complete for

In [125]:
data_to_csv3 = data3.to_csv('best-fit3.csv', index = True) 

In [19]:
def find_error(aux_incl, batch, geometry, directions): 
    if aux_incl == False: 
        batch_aux = batch[batch.auxiliary==False]
    else: 
        batch_aux = batch
    event_ids = list(set(batch.index))
    n = len(event_ids)
    print(n)
    data = pd.DataFrame(event_ids, columns = ["event_id"])
    for i in range(182677, 182700): 
        event_id = event_ids[i]
        event = batch_aux.loc[event_id]
        print("For test " + str(i) + ", there are " + str(len(event)) + "observations.")
        print(event)
        stats = find_event_error(event_id, event, geometry, directions)
        data.loc[i, "vx"] = stats[0]
        data.loc[i, "vy"] = stats[1]
        data.loc[i, "vz"] = stats[2]
        data.loc[i, "az_pred"] = stats[3]
        data.loc[i, "ze_pred"] = stats[4]
        data.loc[i, "az_true"] = stats[5]
        data.loc[i, "ze_true"] = stats[6]
        data.loc[i, "mae"] = stats[7]
        #if (i % 10 == 0): 
        #    print("Testing complete for " + str(i) + " events")
    
    return data

In [20]:
data4 = find_error(False, batch1, sensors, directions)
data_to_csv4 = data.to_csv('best-fit4.csv', index = True) 

200000
For test 182677, there are 19observations.
          sensor_id   time  charge  auxiliary
event_id                                     
2049423        4629   9873   1.275      False
2049423        4630   9876   0.575      False
2049423        4630   9893   0.975      False
2049423        4630   9930   1.375      False
2049423        4630   9946   1.075      False
2049423        4629   9960   0.575      False
2049423        4630  10046   0.775      False
2049423        4631  10067   0.875      False
2049423        4273  10358   0.975      False
2049423        4275  10403   0.875      False
2049423        4271  10446   0.825      False
2049423        4276  10471   1.375      False
2049423         807  10475   1.025      False
2049423         806  10514   1.175      False
2049423         804  10622   0.975      False
2049423        4276  10798   0.975      False
2049423         806  10955   1.025      False
2049423        3853  11314   0.975      False
2049423        3854  11477   1

KeyError: 'time'