# Calculate Best Fit Line only using Spatial Component

In this notebook, we use PCA to find the best fit line between the points for each event. 

In [8]:
# Import packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [2]:
# Read data (a batch of events, the metadata for all events, and the locations of the sensors)
batch10 = pd.read_parquet("C:/Users/k_vsl/Documents/Erdos/IceCubeData/batch_10.parquet")
meta = pd.read_parquet("C:/Users/k_vsl/Documents/Erdos/IceCubeData/train_meta.parquet/train_meta.parquet")
sensor = pd.read_csv("C:/Users/k_vsl/Documents/Erdos/IceCubeData/sensor_geometry.csv")
batch = batch10[batch10.auxiliary == False]

In [3]:
# # Make a function that outputs (x,y,z) for a sensor_id input
def id_to_xyz(sen, geometry):
    row = tuple(geometry.loc[sen][1:4])
    return row

In [31]:
def get_angle(v): 
    mod2 = np.square(v[0]) + np.square(v[1]) + np.square(v[2])
    if (mod2 == 0):
        mod = 1
    else: 
        mod = np.sqrt(mod2)
    x = v[0] / mod
    y = v[1] / mod
    z = v[2] / mod
    ze = np.arccos(z)
    az = np.arctan2(y,x)
    return (az,ze)

In [32]:
# A function which finds the velocity of the best fit line with PCA spatially
def find_linear_fit(event, geometry):
    coords = [id_to_xyz(sen, geometry) for sen in event.sensor_id]
    pca = PCA(n_components=1)
    pca.fit(coords)
    dir_cartesian = pca.components_
    #print(dir_cartesian)
    vx = dir_cartesian[0,0]
    vy = dir_cartesian[0,1]
    vz = dir_cartesian[0,2]
    az, ze = get_angle((vx,vy,vz))
    return (vx,vy,vz,az,ze)


In [34]:
#Package functions by find
def find_linear_fits(aux_incl, batch, geometry):
    if aux_incl == False: 
        batch_aux = batch[batch.auxiliary==False]
    else: 
        batch_aux = batch
    event_ids = list(set(batch.index))
    n = len(event_ids)
    print(n)
    data = pd.DataFrame(event_ids, columns = ["event_id"])
    
    # Loop through the events and populate the data frame
    for i in range(0, 100): 
        event_id = event_ids[i]
        event = batch_aux.loc[event_id]
        stats = find_linear_fit(event, geometry)
        data.loc[i, "vx"] = stats[0]
        data.loc[i, "vy"] = stats[1]
        data.loc[i, "vz"] = stats[2]
        data.loc[i, "az"] = stats[3]
        data.loc[i, "ze"] = stats[4]
        if (i % 100 == 0): 
            print("Testing complete for " + str(i) + " events")
            
    return data

In [35]:
data = find_linear_fits(False, batch, sensor)

200000
Testing complete for 0 events


In [39]:
data.head(20)

Unnamed: 0,event_id,vx,vy,vz,az,ze
0,29360129,-0.6156145,0.08909313,-0.782995,2.997868,2.470263
1,31981571,0.0,0.0,-1.0,0.0,3.141593
2,31981573,0.5376055,-0.6634707,-0.520372,-0.889811,2.118083
3,32505863,0.2781392,-0.1831419,-0.94292,-0.582295,2.802088
4,32505864,-0.9862843,-0.1564275,0.052666,-2.9843,1.518105
5,31457290,0.2683564,-0.9628051,-0.031483,-1.298972,1.602284
6,31981579,-0.0,2.220446e-16,-1.0,1.570796,3.141593
7,29884429,-0.07115675,-0.6827982,-0.727134,-1.674635,2.384934
8,31457294,2.0714639999999999e-19,-1.514972e-16,-1.0,-1.569429,3.141593
9,31981581,-0.0,-0.0,1.0,-3.141593,0.0


In [24]:
data1 = data.loc[0]

In [25]:
data1

event_id    2.936013e+07
vx         -6.156145e-01
vy          8.909313e-02
vz         -7.829951e-01
Name: 0, dtype: float64

In [26]:
get_angle(data1.values)

(-2.0967702772566903e-08, 1.5707963237604028)

In [37]:
batch.loc[31981571]

Unnamed: 0_level_0,sensor_id,time,charge,auxiliary
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31981571,2643,9844,1.675,False
31981571,2642,9855,0.475,False
31981571,2642,9861,1.025,False
31981571,2644,9870,0.775,False
31981571,2645,9978,0.925,False
31981571,2644,10641,1.475,False
