# Various linear regressions using extracted features

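In this notebook, we investigate which of our extracted features are useful as inputs to a regression model for the true azimuth and zenith. Unfortunately, linear regression did not help: every variant tried below has a worse mean angular error than the unregressed time best-fit line (Model 1).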

In [505]:
# import modules
import os

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [506]:
# Various useful functions

def get_mae(az_true, zen_true, az_pred, zen_pred): 
    """
    Mean angular error (in radians) between true and predicted directions.

    Each direction is given by an azimuth and a zenith angle in radians.
    The angles are converted to unit vectors and the angle between the
    true and the predicted vector is averaged over all entries.

    Raises ValueError if any argument contains a non-finite value.
    """
    angles = (az_true, zen_true, az_pred, zen_pred)
    if not all(np.all(np.isfinite(angle)) for angle in angles):
        raise ValueError("All arguments must be finite")

    # sines and cosines of the true direction
    sin_az_t, cos_az_t = np.sin(az_true), np.cos(az_true)
    sin_zen_t, cos_zen_t = np.sin(zen_true), np.cos(zen_true)

    # sines and cosines of the predicted direction
    sin_az_p, cos_az_p = np.sin(az_pred), np.cos(az_pred)
    sin_zen_p, cos_zen_p = np.sin(zen_pred), np.cos(zen_pred)

    # dot product of the two cartesian unit vectors
    # (x = sin(zen)*cos(az), y = sin(zen)*sin(az), z = cos(zen))
    azimuth_term = cos_az_t*cos_az_p + sin_az_t*sin_az_p
    cos_angle = sin_zen_t*sin_zen_p*azimuth_term + (cos_zen_t*cos_zen_p)

    # The dot product of two unit vectors lies in [-1, 1]; clipping guards
    # against numerical error from the finite precision of sine and cosine,
    # which would otherwise make arccos return NaN.
    cos_angle = np.clip(cos_angle, -1, 1)

    # convert back to an angle (in radians) and average
    return np.average(np.abs(np.arccos(cos_angle)))

def mae(y_true, y_pred): 
    """
    Mean angular error for (azimuth, zenith) pairs.

    Index 0 of each argument is passed to get_mae as the azimuth and
    index 1 as the zenith.
    """
    az_true, zen_true = y_true[0], y_true[1]
    az_pred, zen_pred = y_pred[0], y_pred[1]
    return get_mae(az_true, zen_true, az_pred, zen_pred)

def get_maes(y_pred, y_true): 
    """
    Mean angular error over a batch of (azimuth, zenith) rows.

    Parameters
    ----------
    y_pred : array-like, one row per event
        Column 0 is the predicted azimuth, column 1 the predicted zenith.
    y_true : array-like, one row per event
        Column 0 is the true azimuth, column 1 the true zenith. Only the
        first len(y_pred) rows are used.

    Returns
    -------
    The mean of the per-row angular errors (radians); NaN for empty input.

    Raises
    ------
    IndexError if y_true has fewer rows than y_pred.
    ValueError (from get_mae) if any angle is not finite.

    Note the argument order is (prediction, truth), the opposite of mae().
    """
    pred = np.asarray(y_pred, dtype=float)
    true = np.asarray(y_true, dtype=float)

    n = len(pred)
    if n == 0:
        # nothing to average
        return np.nan
    if len(true) < n:
        raise IndexError("y_true has fewer rows than y_pred")

    # get_mae already averages element-wise over arrays, so one vectorised
    # call replaces a Python-level loop over every event.
    return get_mae(true[:n, 0], true[:n, 1], pred[:, 0], pred[:, 1])

In [507]:
# Read data
# The CSV location can be overridden with the ICECUBE_FEATURES_CSV environment
# variable so the notebook also runs outside the original author's machine;
# the default is the original path.
FEATURES_CSV = os.environ.get(
    "ICECUBE_FEATURES_CSV",
    "C:/Users/k_vsl/Documents/Erdos/Boot Camp/ice-cube-katja/features-final.csv",
)
event_data = pd.read_csv(FEATURES_CSV)

In [508]:
# Display the column names of the loaded feature table
event_data.columns

Index(['event_id', 'vx_t', 'vy_t', 'vz_t', 'az_t_pred', 'ze_t_pred', 'mae_t',
       'mse_squared', 'mse', 'vx_pca', 'vy_pca', 'vz_pca', 'az_pca_pred',
       'ze_pca_pred', 'az_true', 'ze_true', 'num_clusters', 'dot_product',
       'mse_cat', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0',
       'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0', 'cat_11.0',
       'per_x', 'per_y', 'per_z', 'cat_x', 'cat_y', 'cat_z'],
      dtype='object')

In [509]:
# Split the table into a feature frame and a target frame, both indexed by event_id.
# X keeps every column (including the targets); the model cells below select
# the feature subsets they need by name.
X = event_data.set_index("event_id")
y = event_data[['event_id', 'az_true', 'ze_true']].set_index("event_id")

In [510]:
# Hold out a final validation set
# test size = 25%, shuffled, random seed = 134
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size = .25,
    shuffle = True,
    random_state = 134,
)

In [511]:
# k-fold cross validation splitter shared by every model cell below
# (follows the Erdos lecture notes on k-fold cross validation)
# k = 5, shuffled, random seed = 134 so all models see the same folds
kfold = KFold(n_splits = 5, shuffle = True, random_state = 134)

In [512]:
# Function to run sklearn Linear Regression
def run_lr(V, w, features): 
    """
    Cross-validated mean angular error of a LinearRegression fit.

    Parameters
    ----------
    V : DataFrame containing at least the columns named in `features`.
    w : DataFrame of targets, row-aligned with V. Passed to get_maes, which
        reads column 0 as azimuth and column 1 as zenith.
    features : list of column names of V used as regressors.

    Returns
    -------
    The mean over all folds of the holdout mean angular error.

    Uses the notebook-level `kfold` splitter. The number of folds is taken
    from the splitter itself, so changing `n_splits` neither overflows a
    fixed-size buffer nor averages in unused zero entries.
    """
    lr = LinearRegression(copy_X = True)
    V_train = V[features]
    w_train = w

    fold_maes = []
    for train_index, test_index in kfold.split(V_train, w_train):
        # fit on the in-fold rows (fit() discards any previous fit)
        lr.fit(V_train.iloc[train_index, :], w_train.iloc[train_index])

        # score on the held-out rows
        w_pred = lr.predict(V_train.iloc[test_index, :])
        w_holdout = w_train.iloc[test_index]
        fold_maes.append(get_maes(w_pred, w_holdout.values))

    return np.mean(fold_maes)

In [531]:
# Function to run sklearn SGDRegressor with a configurable loss
def run_sgd(V, w, features, loss_fun, random_state = None): 
    """
    Cross-validated mean angular error of two independent SGDRegressor fits,
    one for azimuth and one for zenith.

    Parameters
    ----------
    V : DataFrame containing at least the columns named in `features`.
    w : DataFrame with the target columns 'az_true' and 'ze_true',
        row-aligned with V.
    features : list of column names of V used as regressors.
    loss_fun : value for SGDRegressor's `loss` argument
        (e.g. 'epsilon_insensitive' or 'huber').
    random_state : optional seed forwarded to both SGDRegressor instances.
        The default (None) keeps the previous unseeded behaviour; pass an
        int to make the result reproducible.

    Returns
    -------
    The mean over all folds of the holdout mean angular error.

    Uses the notebook-level `kfold` splitter; the number of folds is taken
    from the splitter rather than hard-coded.
    """
    model_az = SGDRegressor(loss = loss_fun, max_iter = 50000, random_state = random_state)
    model_ze = SGDRegressor(loss = loss_fun, max_iter = 50000, random_state = random_state)

    V_train = V[features]
    w_train_az = w['az_true']
    w_train_ze = w['ze_true']

    fold_maes = []
    for train_index, test_index in kfold.split(V_train, w_train_az):
        ## fit both models on the in-fold rows
        V_train_train = V_train.iloc[train_index, :]
        model_az.fit(V_train_train, w_train_az.iloc[train_index])
        model_ze.fit(V_train_train, w_train_ze.iloc[train_index])

        ## predict on the holdout rows and pair up (azimuth, zenith) columns
        V_holdout = V_train.iloc[test_index, :]
        w_pred = np.column_stack((model_az.predict(V_holdout),
                                  model_ze.predict(V_holdout)))
        w_true = np.column_stack((w_train_az.iloc[test_index].to_numpy(),
                                  w_train_ze.iloc[test_index].to_numpy()))

        fold_maes.append(get_maes(w_pred, w_true))

    return np.mean(fold_maes)

In [513]:
# Model 1: baseline without any regression
# Average the per-event angular error of the time best fit line (column mae_t).
# This is computed on the validation split, whereas the regression models
# below are cross-validated on the training split.
print("Model 1: " + str(X_val["mae_t"].mean()))

Model 1: 1.2134628158840939


In [520]:
# Linear Regression models: cross-validated MAE for each candidate feature set

# Building blocks shared by the feature sets below
time_fit = ['az_t_pred', 'ze_t_pred']                            # time best fit line
pca_fit = ['az_pca_pred', 'ze_pca_pred']                         # pca best fit line
side_cats = ['cat_x', 'cat_y', 'cat_z']
cluster_cats = ['cat_' + str(k) + '.0' for k in range(1, 11)]    # cat_1.0 ... cat_10.0

features = [time_fit,
            time_fit + ['num_clusters'],
            time_fit + ['mse'],
            time_fit + ['dot_product'],
            time_fit + pca_fit,
            time_fit + ['mse_cat'],
            pca_fit,
            time_fit + cluster_cats,
            time_fit + side_cats + ['mse'] + cluster_cats,
            time_fit + side_cats + ['mse_cat'] + cluster_cats
        ]

n = len(features)
maes = np.zeros(n)

# enumerate supplies the index, so no manual counter is needed; the score is
# stored under its own name so the mae() function is not shadowed.
for idx, feature_set in enumerate(features):
    cv_mae = run_lr(X_train, y_train, feature_set)
    maes[idx] = cv_mae
    print("Using features: " + str(feature_set) + ", the MAE is " + str(cv_mae))


Using features: ['az_t_pred', 'ze_t_pred'], the MAE is 1.5114392346721135
Using features: ['az_t_pred', 'ze_t_pred', 'num_clusters'], the MAE is 1.5091467497314988
Using features: ['az_t_pred', 'ze_t_pred', 'mse'], the MAE is 1.5100998640596575
Using features: ['az_t_pred', 'ze_t_pred', 'dot_product'], the MAE is 1.5096530615494284
Using features: ['az_t_pred', 'ze_t_pred', 'az_pca_pred', 'ze_pca_pred'], the MAE is 1.5105590649667482
Using features: ['az_t_pred', 'ze_t_pred', 'mse_cat'], the MAE is 1.5094332297633348
Using features: ['az_pca_pred', 'ze_pca_pred'], the MAE is 1.5219531894197487
Using features: ['az_t_pred', 'ze_t_pred', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0'], the MAE is 1.507800883963829
Using features: ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0'], the MAE is 1.5067696839319

In [None]:
# SGDRegression with loss = 'epsilon_insensitive': cross-validated MAE per feature set

# Building blocks shared by the feature sets below
time_fit = ['az_t_pred', 'ze_t_pred']                            # time best fit line
pca_fit = ['az_pca_pred', 'ze_pca_pred']                         # pca best fit line
side_cats = ['cat_x', 'cat_y', 'cat_z']
cluster_cats = ['cat_' + str(k) + '.0' for k in range(1, 11)]    # cat_1.0 ... cat_10.0

features = [time_fit,
            time_fit + ['num_clusters'],
            time_fit + ['mse'],
            time_fit + ['dot_product'],
            time_fit + pca_fit,
            time_fit + ['mse_cat'],
            pca_fit,
            time_fit + cluster_cats,
            time_fit + side_cats + ['mse'] + cluster_cats,
            time_fit + side_cats + ['mse_cat'] + cluster_cats
        ]

n = len(features)
maes_sgd = np.zeros(n)

# enumerate supplies the index, so no manual counter is needed; the score is
# stored under its own name so the mae() function is not shadowed.
for idx, feature_set in enumerate(features):
    cv_mae = run_sgd(X_train, y_train, feature_set, 'epsilon_insensitive')
    maes_sgd[idx] = cv_mae
    print("Using features: " + str(feature_set) + ", the MAE is " + str(cv_mae))

Using features: ['az_t_pred', 'ze_t_pred'], the MAE is 1.461555593845072
Using features: ['az_t_pred', 'ze_t_pred', 'num_clusters'], the MAE is 1.45741346561603
Using features: ['az_t_pred', 'ze_t_pred', 'mse'], the MAE is 1.5649012556033466
Using features: ['az_t_pred', 'ze_t_pred', 'dot_product'], the MAE is 1.460313197178092


In [None]:
# SGDRegression with loss = 'huber': cross-validated MAE per feature set

# Building blocks shared by the feature sets below
time_fit = ['az_t_pred', 'ze_t_pred']                            # time best fit line
pca_fit = ['az_pca_pred', 'ze_pca_pred']                         # pca best fit line
side_cats = ['cat_x', 'cat_y', 'cat_z']
cluster_cats = ['cat_' + str(k) + '.0' for k in range(1, 11)]    # cat_1.0 ... cat_10.0

features = [time_fit,
            time_fit + ['num_clusters'],
            time_fit + ['mse'],
            time_fit + ['dot_product'],
            time_fit + pca_fit,
            time_fit + ['mse_cat'],
            pca_fit,
            time_fit + cluster_cats,
            time_fit + side_cats + ['mse'] + cluster_cats,
            time_fit + side_cats + ['mse_cat'] + cluster_cats
        ]

n = len(features)
maes_sgdh = np.zeros(n)

# enumerate supplies the index, so no manual counter is needed; the score is
# stored under its own name so the mae() function is not shadowed.
for idx, feature_set in enumerate(features):
    cv_mae = run_sgd(X_train, y_train, feature_set, 'huber')
    maes_sgdh[idx] = cv_mae
    print("Using features: " + str(feature_set) + ", the MAE is " + str(cv_mae))

In [502]:
# Model 2: Linear regression on the az and ze predicted by the time best fit line
# (A previous run_lr call here repeated the whole cross validation and then
# discarded the result; the loop below computes the per-fold scores once.)
features = ['az_t_pred', 'ze_t_pred']

lr_2 = LinearRegression(copy_X = True)

X_train_2 = X_train[features]
y_train_2 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_2 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_2, y_train_2)):
    ## fit on the kfold training data
    X_train_train_2 = X_train_2.iloc[train_index,:]
    y_train_train_2 = y_train_2.iloc[train_index]
    lr_2.fit(X_train_train_2, y_train_train_2)

    ## score on the holdout data
    X_holdout = X_train_2.iloc[test_index,:]
    y_holdout = y_train_2.iloc[test_index]
    y_pred = lr_2.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_2[fold] = get_maes(y_pred, y_holdout.values)

print(mae_2)

[1.50740675 1.5082165  1.51644227 1.51144212 1.51368853]


In [490]:
# Model 3: Linear regression on the az and ze predicted by the time best fit line, in addition to clustering
lr_3 = LinearRegression(copy_X = True)
X_train_3 = X_train[['az_t_pred', 'ze_t_pred', 'num_clusters']]
y_train_3 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_3 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_3, y_train_3)):
    ## fit on the kfold training data
    X_train_train_3 = X_train_3.iloc[train_index,:]
    y_train_train_3 = y_train_3.iloc[train_index]
    lr_3.fit(X_train_train_3, y_train_train_3)

    ## score on the holdout data
    X_holdout = X_train_3.iloc[test_index,:]
    y_holdout = y_train_3.iloc[test_index]
    y_pred = lr_3.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_3[fold] = get_maes(y_pred, y_holdout.values)

print(mae_3)
mae_3.mean()

[1.50534555 1.50573215 1.51397336 1.50915352 1.51152918]


1.5091467497314988

In [399]:
# Model 4: Linear regression on the az and ze predicted by the time best fit line, in addition to the mse value
lr_4 = LinearRegression(copy_X = True)
X_train_4 = X_train[['az_t_pred', 'ze_t_pred', 'mse']]
y_train_4 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_4 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_4, y_train_4)):
    ## fit on the kfold training data
    X_train_train_4 = X_train_4.iloc[train_index,:]
    y_train_train_4 = y_train_4.iloc[train_index]
    lr_4.fit(X_train_train_4, y_train_train_4)

    ## score on the holdout data
    X_holdout = X_train_4.iloc[test_index,:]
    y_holdout = y_train_4.iloc[test_index]
    y_pred = lr_4.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_4[fold] = get_maes(y_pred, y_holdout.values)

print(mae_4)

[1.50605267 1.50692562 1.51505989 1.51034495 1.51211618]


In [400]:
# Model 5: Linear regression on the az and ze predicted by the time best fit line, in addition to the dot product
lr_5 = LinearRegression(copy_X = True)
X_train_5 = X_train[['az_t_pred', 'ze_t_pred', 'dot_product']]
y_train_5 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_5 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_5, y_train_5)):
    ## fit on the kfold training data
    X_train_train_5 = X_train_5.iloc[train_index,:]
    y_train_train_5 = y_train_5.iloc[train_index]
    lr_5.fit(X_train_train_5, y_train_train_5)

    ## score on the holdout data
    X_holdout = X_train_5.iloc[test_index,:]
    y_holdout = y_train_5.iloc[test_index]
    y_pred = lr_5.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_5[fold] = get_maes(y_pred, y_holdout.values)

print(mae_5)

[1.50603265 1.50656034 1.51439981 1.50947405 1.51179846]


In [401]:
# Model 6: Linear regression on the az and ze predicted by the time best fit line, in addition to the pca best fit line
lr_6 = LinearRegression(copy_X = True)
X_train_6 = X_train[['az_t_pred', 'ze_t_pred', 'az_pca_pred', 'ze_pca_pred']]
y_train_6 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_6 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_6, y_train_6)):
    ## fit on the kfold training data
    X_train_train_6 = X_train_6.iloc[train_index,:]
    y_train_train_6 = y_train_6.iloc[train_index]
    lr_6.fit(X_train_train_6, y_train_train_6)

    ## score on the holdout data
    X_holdout = X_train_6.iloc[test_index,:]
    y_holdout = y_train_6.iloc[test_index]
    y_pred = lr_6.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_6[fold] = get_maes(y_pred, y_holdout.values)

print(mae_6)

[1.50632911 1.50733308 1.5156     1.51075943 1.51277371]


In [402]:
# Model 7: Linear regression on the az and ze predicted by the time best fit line, in addition to treating mse as categorical variable
# (uses the precomputed mse_cat column)
lr_7 = LinearRegression(copy_X = True)
X_train_7 = X_train[['az_t_pred', 'ze_t_pred', 'mse_cat']]
y_train_7 = y_train

# mae_7 was previously never initialised, so this cell raised a NameError on
# a fresh kernel. One slot per fold, sized from the splitter.
mae_7 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_7, y_train_7)):
    ## fit on the kfold training data
    X_train_train_7 = X_train_7.iloc[train_index,:]
    y_train_train_7 = y_train_7.iloc[train_index]
    lr_7.fit(X_train_train_7, y_train_train_7)

    ## score on the holdout data
    X_holdout = X_train_7.iloc[test_index,:]
    y_holdout = y_train_7.iloc[test_index]
    y_pred = lr_7.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_7[fold] = get_maes(y_pred, y_holdout.values)

print(mae_7)

[1.50584927 1.50610422 1.51399367 1.50961388 1.51160511]


In [403]:
# Model 8: Just using linear regression on az and ze predicted by pca best fit line
lr_8 = LinearRegression(copy_X = True)

X_train_8 = X_train[['az_pca_pred', 'ze_pca_pred']]
y_train_8 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_8 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_8, y_train_8)):
    ## fit on the kfold training data
    X_train_train_8 = X_train_8.iloc[train_index,:]
    y_train_train_8 = y_train_8.iloc[train_index]
    lr_8.fit(X_train_train_8, y_train_train_8)

    ## score on the holdout data
    X_holdout = X_train_8.iloc[test_index,:]
    y_holdout = y_train_8.iloc[test_index]
    y_pred = lr_8.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_8[fold] = get_maes(y_pred, y_holdout.values)

print(mae_8)

[1.51720295 1.51869047 1.52736457 1.52278137 1.52372659]


In [410]:
# Model 9: Using clustering as a categorical variable
lr_9 = LinearRegression(copy_X = True)

X_train_9 = X_train[['az_t_pred', 'ze_t_pred','cat_1.0',
       'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0',
       'cat_8.0', 'cat_9.0', 'cat_10.0']]
y_train_9 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_9 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_9, y_train_9)):
    ## fit on the kfold training data
    X_train_train_9 = X_train_9.iloc[train_index,:]
    y_train_train_9 = y_train_9.iloc[train_index]
    lr_9.fit(X_train_train_9, y_train_train_9)

    ## score on the holdout data
    X_holdout = X_train_9.iloc[test_index,:]
    y_holdout = y_train_9.iloc[test_index]
    y_pred = lr_9.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_9[fold] = get_maes(y_pred, y_holdout.values)

print(mae_9)

[1.50397942 1.50464298 1.51234034 1.50790023 1.51014145]


In [415]:
# Model 10: Using the biased sides of sensors
lr_10 = LinearRegression(copy_X = True)

X_train_10 = X_train[['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z']]
y_train_10 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_10 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_10, y_train_10)):
    ## fit on the kfold training data
    X_train_train_10 = X_train_10.iloc[train_index,:]
    y_train_train_10 = y_train_10.iloc[train_index]
    lr_10.fit(X_train_train_10, y_train_train_10)

    ## score on the holdout data
    X_holdout = X_train_10.iloc[test_index,:]
    y_holdout = y_train_10.iloc[test_index]
    y_pred = lr_10.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_10[fold] = get_maes(y_pred, y_holdout.values)

print(mae_10)

[1.50682839 1.50778389 1.51570358 1.51068535 1.51299756]


In [427]:
# Model 11: Using biased sides + categorical variables for side and mse
lr_11 = LinearRegression(copy_X = True)

X_train_11 = X_train[['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat']]
y_train_11 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_11 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_11, y_train_11)):
    ## fit on the kfold training data
    X_train_train_11 = X_train_11.iloc[train_index,:]
    y_train_train_11 = y_train_11.iloc[train_index]
    lr_11.fit(X_train_train_11, y_train_train_11)

    ## score on the holdout data
    X_holdout = X_train_11.iloc[test_index,:]
    y_holdout = y_train_11.iloc[test_index]
    y_pred = lr_11.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_11[fold] = get_maes(y_pred, y_holdout.values)

print(mae_11)

[1.50525599 1.50567671 1.51326882 1.50885468 1.51092673]


In [429]:
# Model 12: Using time best fit + all categorical variables
lr_12 = LinearRegression(copy_X = True)

X_train_12 = X_train[['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat', 'cat_1.0',
       'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0',
       'cat_8.0', 'cat_9.0', 'cat_10.0']]
y_train_12 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_12 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_12, y_train_12)):
    ## fit on the kfold training data
    X_train_train_12 = X_train_12.iloc[train_index,:]
    y_train_train_12 = y_train_12.iloc[train_index]
    lr_12.fit(X_train_train_12, y_train_train_12)

    ## score on the holdout data
    X_holdout = X_train_12.iloc[test_index,:]
    y_holdout = y_train_12.iloc[test_index]
    y_pred = lr_12.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_12[fold] = get_maes(y_pred, y_holdout.values)

print(mae_12)

[1.50294107 1.50351744 1.51067776 1.50649056 1.50871341]


In [432]:
# Model 13: Same as model 12 but using actual MSE
lr_13 = LinearRegression(copy_X = True)

X_train_13 = X_train[['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0',
       'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0',
       'cat_8.0', 'cat_9.0', 'cat_10.0']]
y_train_13 = y_train

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae_13 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_13, y_train_13)):
    ## fit on the kfold training data
    X_train_train_13 = X_train_13.iloc[train_index,:]
    y_train_train_13 = y_train_13.iloc[train_index]
    lr_13.fit(X_train_train_13, y_train_train_13)

    ## score on the holdout data
    X_holdout = X_train_13.iloc[test_index,:]
    y_holdout = y_train_13.iloc[test_index]
    y_pred = lr_13.predict(X_holdout)

    # written straight into the array so the mae() function is not shadowed
    mae_13[fold] = get_maes(y_pred, y_holdout.values)

print(mae_13)

[1.50300838 1.5038687  1.51121546 1.50680626 1.50894962]


In [433]:
# Summary: mean cross-validated MAE of every model

# mae_1 is not defined by any cell in this notebook, so this cell failed on a
# fresh kernel. It is reconstructed here as the no-regression baseline (the
# time best fit line's own error, column mae_t) averaged over the holdout
# rows of each CV fold of the training split, so it is comparable with
# mae_2 ... mae_13.
# NOTE(review): confirm this matches how mae_1 was originally computed.
mae_1 = np.array([X_train["mae_t"].iloc[test_index].mean()
                  for _, test_index in kfold.split(X_train)])

all_fold_maes = [mae_1, mae_2, mae_3, mae_4, mae_5, mae_6, mae_7,
                 mae_8, mae_9, mae_10, mae_11, mae_12, mae_13]

for model_number, fold_maes in enumerate(all_fold_maes, start = 1):
    print("Model " + str(model_number) + ": " + str(fold_maes.mean()))

Model 1: 1.2137944928512907
Model 2: 1.5114392346721135
Model 3: 1.5091467497314988
Model 4: 1.5100998640596575
Model 5: 1.5096530615494284
Model 6: 1.5105590649667482
Model 7: 1.5094332297633348
Model 8: 1.5219531894197487
Model 9: 1.507800883963829
Model 10: 1.5107997525418733
Model 11: 1.5087965866205237
Model 12: 1.5064680472951655
Model 13: 1.5067696839319975


In [435]:
# Try linear regression with custom loss function
from sklearn.linear_model import SGDRegressor


In [441]:
# Model 2 with SGDRegressor (loss = 'epsilon_insensitive')
# One regressor per target. SGD is stochastic, so the models are seeded with
# the notebook's seed (134) to make the printed scores reproducible.
model_az = SGDRegressor(loss = 'epsilon_insensitive', max_iter = 50000, random_state = 134)
model_ze = SGDRegressor(loss = 'epsilon_insensitive', max_iter = 50000, random_state = 134)

X_train_2 = X_train[['az_t_pred', 'ze_t_pred']]
y_train_az = y_train['az_true']
y_train_ze = y_train['ze_true']

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae2 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_2, y_train_az)):
    ## fit both models on the kfold training data
    X_train_train_2 = X_train_2.iloc[train_index,:]
    y_train_train_az = y_train_az.iloc[train_index]
    y_train_train_ze = y_train_ze.iloc[train_index]
    model_az.fit(X_train_train_2, y_train_train_az)
    model_ze.fit(X_train_train_2, y_train_train_ze)

    ## predict on the holdout data
    X_holdout = X_train_2.iloc[test_index,:]
    y_holdout_az = y_train_az.iloc[test_index]
    y_holdout_ze = y_train_ze.iloc[test_index]

    # pair up (azimuth, zenith) columns, the layout get_maes expects
    y_pred = np.column_stack((model_az.predict(X_holdout), model_ze.predict(X_holdout)))
    y_true = np.column_stack((y_holdout_az.to_numpy(), y_holdout_ze.to_numpy()))

    # written straight into the array so the mae() function is not shadowed
    mae2[fold] = get_maes(y_pred, y_true)

print(mae2)

[1.46446438 1.45966545 1.46088716 1.47005354 1.46105984]


In [443]:
# Model 2 with SGDRegressor (loss = 'huber')
# One regressor per target. SGD is stochastic, so the models are seeded with
# the notebook's seed (134) to make the printed scores reproducible.
model_az = SGDRegressor(loss = 'huber', max_iter = 50000, random_state = 134)
model_ze = SGDRegressor(loss = 'huber', max_iter = 50000, random_state = 134)

X_train_2 = X_train[['az_t_pred', 'ze_t_pred']]
y_train_az = y_train['az_true']
y_train_ze = y_train['ze_true']

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae3 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_2, y_train_az)):
    ## fit both models on the kfold training data
    X_train_train_2 = X_train_2.iloc[train_index,:]
    y_train_train_az = y_train_az.iloc[train_index]
    y_train_train_ze = y_train_ze.iloc[train_index]
    model_az.fit(X_train_train_2, y_train_train_az)
    model_ze.fit(X_train_train_2, y_train_train_ze)

    ## predict on the holdout data
    X_holdout = X_train_2.iloc[test_index,:]
    y_holdout_az = y_train_az.iloc[test_index]
    y_holdout_ze = y_train_ze.iloc[test_index]

    # pair up (azimuth, zenith) columns, the layout get_maes expects
    y_pred = np.column_stack((model_az.predict(X_holdout), model_ze.predict(X_holdout)))
    y_true = np.column_stack((y_holdout_az.to_numpy(), y_holdout_ze.to_numpy()))

    # written straight into the array so the mae() function is not shadowed
    mae3[fold] = get_maes(y_pred, y_true)

print(mae3)

[1.45403883 1.45593671 1.46064882 1.45614789 1.45751829]


In [447]:
# Trying some other models: SGDRegressor (loss = 'epsilon_insensitive') on the
# Model 13 feature set (time best fit + sides + mse + cluster categories)
# One regressor per target. SGD is stochastic, so the models are seeded with
# the notebook's seed (134) to make the printed scores reproducible.
model_az = SGDRegressor(loss = 'epsilon_insensitive', max_iter = 50000, random_state = 134)
model_ze = SGDRegressor(loss = 'epsilon_insensitive', max_iter = 50000, random_state = 134)

# NOTE: this rebinds X_train_2, which the Model 2 cells use for the
# two-column time-best-fit frame; re-run those cells before reusing it.
X_train_2 = X_train[['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0',
       'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0',
       'cat_8.0', 'cat_9.0', 'cat_10.0']]
y_train_az = y_train['az_true']
y_train_ze = y_train['ze_true']

# one slot per fold, sized from the splitter instead of a hard-coded 5
mae4 = np.zeros(kfold.get_n_splits())

for fold, (train_index, test_index) in enumerate(kfold.split(X_train_2, y_train_az)):
    ## fit both models on the kfold training data
    X_train_train_2 = X_train_2.iloc[train_index,:]
    y_train_train_az = y_train_az.iloc[train_index]
    y_train_train_ze = y_train_ze.iloc[train_index]
    model_az.fit(X_train_train_2, y_train_train_az)
    model_ze.fit(X_train_train_2, y_train_train_ze)

    ## predict on the holdout data
    X_holdout = X_train_2.iloc[test_index,:]
    y_holdout_az = y_train_az.iloc[test_index]
    y_holdout_ze = y_train_ze.iloc[test_index]

    # pair up (azimuth, zenith) columns, the layout get_maes expects
    y_pred = np.column_stack((model_az.predict(X_holdout), model_ze.predict(X_holdout)))
    y_true = np.column_stack((y_holdout_az.to_numpy(), y_holdout_ze.to_numpy()))

    # written straight into the array so the mae() function is not shadowed
    mae4[fold] = get_maes(y_pred, y_true)

print(mae4)

[1.57917627 1.569862   1.58042174 1.57101033 1.56666928]


In [444]:
# Mean over the folds of mae2 (SGDRegressor, loss = 'epsilon_insensitive', time best fit features)
print(mae2.mean())

1.4558522632010233


In [445]:
# Mean over the folds of mae3 (SGDRegressor, loss = 'huber', time best fit features)
print(mae3.mean())

1.4568581088086567


In [None]:
# Try 

In [446]:
import tensorflow


