# Various linear regressions using extracted features

In this notebook, we investigate which of our extracted features may be useful in an attempt to find a model. Unfortunately, linear regression did not appear to help. 

In [589]:
# import modules
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor

# Loss Function

In [590]:
# Various useful functions

def get_mae(az_true, zen_true, az_pred, zen_pred): 
    """
    Given a predicted and true azimuth and zenith, compute the mae (mean angular error)
    """    
    if not (np.all(np.isfinite(az_true)) and
            np.all(np.isfinite(zen_true)) and
            np.all(np.isfinite(az_pred)) and
            np.all(np.isfinite(zen_pred))):
        raise ValueError("All arguments must be finite")
    
    # pre-compute all sine and cosine values
    sa1 = np.sin(az_true)
    ca1 = np.cos(az_true)
    sz1 = np.sin(zen_true)
    cz1 = np.cos(zen_true)
    
    sa2 = np.sin(az_pred)
    ca2 = np.cos(az_pred)
    sz2 = np.sin(zen_pred)
    cz2 = np.cos(zen_pred)
    
    # scalar product of the two cartesian vectors (x = sz*ca, y = sz*sa, z = cz)
    scalar_prod = sz1*sz2*(ca1*ca2 + sa1*sa2) + (cz1*cz2)
    
    # scalar product of two unit vectors is always between -1 and 1, this is against nummerical instability
    # that might otherwise occure from the finite precision of the sine and cosine functions
    scalar_prod =  np.clip(scalar_prod, -1, 1)
    
    # convert back to an angle (in radian)
    return np.average(np.abs(np.arccos(scalar_prod)))

def mae(y_true, y_pred): 
    return get_mae(y_true[0], y_true[1], y_pred[0], y_pred[1])

def get_maes(y_pred, y_true): 
    """
    Given a list of predictions and true values of azimuth and zenith, compute mae
    """
    n = len(y_pred)
    maes = np.zeros(n)
    for i in range (0,n):
        az_true = y_true[i][0]
        ze_true = y_true[i][1]
        az_pred = y_pred[i][0]
        ze_pred = y_pred[i][1]
        mae = get_mae(az_true, ze_true, az_pred, ze_pred)
        maes[i] = mae
        
    return maes.mean()

# Format data

In [591]:
# Read data
event_data = pd.read_csv("C:/Users/k_vsl/Documents/Erdos/Boot Camp/ice-cube-katja/features-final.csv")

In [592]:
event_data.columns

Index(['event_id', 'vx_t', 'vy_t', 'vz_t', 'az_t_pred', 'ze_t_pred', 'mae_t',
       'mse_squared', 'mse', 'vx_pca', 'vy_pca', 'vz_pca', 'az_pca_pred',
       'ze_pca_pred', 'az_true', 'ze_true', 'num_clusters', 'dot_product',
       'mse_cat', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0',
       'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0', 'cat_11.0',
       'per_x', 'per_y', 'per_z', 'cat_x', 'cat_y', 'cat_z'],
      dtype='object')

In [593]:
# Separate training parameters into features and output
X = event_data
X = X.set_index("event_id")
y = event_data[['event_id', 'az_true', 'ze_true']]
y = y.set_index("event_id")

In [594]:
# Separate out a final training set
# random seed = 134
# test size = 25%
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                             shuffle = True,
                                                             random_state = 134, 
                                                             test_size = .25)

In [595]:
# k-fold cross validation
# this cell imitates the erdos lectures notes on kfold cross validation , k = 5
# random seed to all splits random_seed = 134
kfold = KFold(n_splits = 5,
             shuffle = True,
             random_state = 134)

# Helper Functions to run multiple regressions

In [596]:
# Function to run sklearn Linear Regression
def run_lr(V, w, features): 
    V_train = V[features]
    w_train = w
    
    i = 0
    maes = np.zeros(5)
    
    for train_index, test_index in kfold.split(V_train, w_train):
        ## get the kfold training data
        V_train_train = V_train.iloc[train_index,:]
        w_train_train = w_train.iloc[train_index]
    
        ## get the holdout data
        V_holdout = V_train.iloc[test_index,:]
        w_holdout = w_train.iloc[test_index]
        
        lr = LinearRegression(copy_X = True)
        lr.fit(V_train_train, w_train_train)
        # Toggle on and off to see coefficients
        # print(lr.coef_)
        w_pred = lr.predict(V_holdout)
        mae = get_maes(w_pred, w_holdout.values)   
        maes[i] = mae
        i += 1
    
    return maes.mean()

In [597]:
# Function to run sklearn SGDRegression with epsilon_insensitive
def run_sgd(V, w, features, loss_fun): 
    
    V_train = V[features]
    w_train_az = w['az_true']
    w_train_ze = w['ze_true']
    
    i = 0
    maes = np.zeros(5)
    
    for train_index, test_index in kfold.split(V_train, w_train_az):
        ## get the kfold training data
        V_train_train = V_train.iloc[train_index,:]
        w_train_train_az = w_train_az.iloc[train_index]
        w_train_train_ze = w_train_ze.iloc[train_index]
    
        ## get the holdout data
        V_holdout = V_train.iloc[test_index,:]
        w_holdout_az = w_train_az.iloc[test_index]
        w_holdout_ze = w_train_ze.iloc[test_index]
        
        model_az = SGDRegressor(loss = loss_fun, max_iter = 50000)
        model_ze = SGDRegressor(loss = loss_fun, max_iter = 50000)
        model_az.fit(V_train_train, w_train_train_az)
        model_ze.fit(V_train_train, w_train_train_ze)
        w_pred_az = model_az.predict(V_holdout)
        w_pred_ze = model_ze.predict(V_holdout)
        w_pred = np.zeros((len(w_holdout_az), 2))
        w_true = np.zeros((len(w_holdout_ze), 2))
        w_pred[:,0] = w_pred_az
        w_pred[:,1] = w_pred_ze
        w_true[:,0] = w_holdout_az
        w_true[:,1] = w_holdout_ze
        mae = get_maes(w_pred, w_true) 
    
        maes[i] = mae
        i += 1
    
    return maes.mean()

# Model 1: Baseline

In [598]:
# Model 1: No Linear Regression
# Just use the time best fit line and average the mae's from the line itself
X_mae = X_train['mae_t']
y_mae = y_train
maes = np.zeros(5)
i = 0
for train_index, test_index in kfold.split(X_mae, y_mae): 
    X_holdout = X_mae.iloc[test_index]
    maes[i] = X_holdout.mean()
    i += 1
print("Model 1: " + str(maes.mean()))

Model 1: 1.2137944928512907


Model 1 has k-fold mae: 1.2137944928512907

# Base Linear Regression Models

In [599]:
# Linear Regression models
features = [['az_t_pred', 'ze_t_pred'], 
            ['az_t_pred', 'ze_t_pred', 'num_clusters'], 
            ['az_t_pred', 'ze_t_pred', 'mse'], 
            ['az_t_pred', 'ze_t_pred', 'dot_product'], 
            ['az_t_pred', 'ze_t_pred', 'az_pca_pred', 'ze_pca_pred'], 
            ['az_t_pred', 'ze_t_pred', 'mse_cat'], 
            ['az_pca_pred', 'ze_pca_pred'],
            ['az_t_pred', 'ze_t_pred','cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0'], 
            ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0'], 
            ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat', 'cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0']
        ]


n = len(features)
maes = np.zeros(n)

i = 0
for i in range(0,n):
    mae = run_lr(X_train, y_train, features[i])
    maes[i] = mae
    print("Using features: " + str(features[i]) + ", the MAE is " + str(mae))
    i += 1


Using features: ['az_t_pred', 'ze_t_pred'], the MAE is 1.5114392346721135
Using features: ['az_t_pred', 'ze_t_pred', 'num_clusters'], the MAE is 1.5091467497314988
Using features: ['az_t_pred', 'ze_t_pred', 'mse'], the MAE is 1.5100998640596575
Using features: ['az_t_pred', 'ze_t_pred', 'dot_product'], the MAE is 1.5096530615494284
Using features: ['az_t_pred', 'ze_t_pred', 'az_pca_pred', 'ze_pca_pred'], the MAE is 1.5105590649667482
Using features: ['az_t_pred', 'ze_t_pred', 'mse_cat'], the MAE is 1.5094332297633348
Using features: ['az_pca_pred', 'ze_pca_pred'], the MAE is 1.5219531894197487
Using features: ['az_t_pred', 'ze_t_pred', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0'], the MAE is 1.507800883963829
Using features: ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0'], the MAE is 1.5067696839319

Best k-fold mae comes from features: 
['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0']
Value: 1.5064680472951655

In [600]:
# SGDRegression with loss = 'epsilon_insensitive'

features = [['az_t_pred', 'ze_t_pred'], 
            ['az_t_pred', 'ze_t_pred', 'num_clusters'], 
            ['az_t_pred', 'ze_t_pred', 'mse'], 
            ['az_t_pred', 'ze_t_pred', 'dot_product'], 
            ['az_t_pred', 'ze_t_pred', 'az_pca_pred', 'ze_pca_pred'], 
            ['az_t_pred', 'ze_t_pred', 'mse_cat'], 
            ['az_pca_pred', 'ze_pca_pred'],
            ['az_t_pred', 'ze_t_pred','cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0'], 
            ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0'], 
            ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat', 'cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0']
        ]



n = len(features)
maes_sgd = np.zeros(n)

i = 0
for i in range(0,n):
    mae = run_sgd(X_train, y_train, features[i], 'epsilon_insensitive')
    maes_sgd[i] = mae
    print("Using features: " + str(features[i]) + ", the MAE is " + str(mae))
    i += 1

Using features: ['az_t_pred', 'ze_t_pred'], the MAE is 1.4618878946957579
Using features: ['az_t_pred', 'ze_t_pred', 'num_clusters'], the MAE is 1.4557881752583124
Using features: ['az_t_pred', 'ze_t_pred', 'mse'], the MAE is 1.5743695565091955
Using features: ['az_t_pred', 'ze_t_pred', 'dot_product'], the MAE is 1.4603143893595898
Using features: ['az_t_pred', 'ze_t_pred', 'az_pca_pred', 'ze_pca_pred'], the MAE is 1.466331498919548
Using features: ['az_t_pred', 'ze_t_pred', 'mse_cat'], the MAE is 1.4567743744906714
Using features: ['az_pca_pred', 'ze_pca_pred'], the MAE is 1.4812092394751233
Using features: ['az_t_pred', 'ze_t_pred', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0'], the MAE is 1.4577674734067174
Using features: ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0'], the MAE is 1.5748026707150

Best k-fold mae comes from features: 
['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0']
Value: 1.4511509147669819

In [601]:
# SGDRegression with loss = 'huber'

features = [['az_t_pred', 'ze_t_pred'], 
            ['az_t_pred', 'ze_t_pred', 'num_clusters'], 
            ['az_t_pred', 'ze_t_pred', 'mse'], 
            ['az_t_pred', 'ze_t_pred', 'dot_product'], 
            ['az_t_pred', 'ze_t_pred', 'az_pca_pred', 'ze_pca_pred'], 
            ['az_t_pred', 'ze_t_pred', 'mse_cat'], 
            ['az_pca_pred', 'ze_pca_pred'],
            ['az_t_pred', 'ze_t_pred','cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0'], 
            ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0'], 
            ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat', 'cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0']
        ]


n = len(features)
maes_sgdh = np.zeros(n)

i = 0
for i in range(0,n):
    mae = run_sgd(X_train, y_train, features[i], 'huber')
    maes_sgdh[i] = mae
    print("Using features: " + str(features[i]) + ", the MAE is " + str(mae))
    i += 1

Using features: ['az_t_pred', 'ze_t_pred'], the MAE is 1.4566696024827528
Using features: ['az_t_pred', 'ze_t_pred', 'num_clusters'], the MAE is 1.4501782143801938
Using features: ['az_t_pred', 'ze_t_pred', 'mse'], the MAE is 1.5772359830142508
Using features: ['az_t_pred', 'ze_t_pred', 'dot_product'], the MAE is 1.457254319227922
Using features: ['az_t_pred', 'ze_t_pred', 'az_pca_pred', 'ze_pca_pred'], the MAE is 1.455977078537934
Using features: ['az_t_pred', 'ze_t_pred', 'mse_cat'], the MAE is 1.4529140703749999
Using features: ['az_pca_pred', 'ze_pca_pred'], the MAE is 1.478973825824332
Using features: ['az_t_pred', 'ze_t_pred', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0'], the MAE is 1.4511278581810632
Using features: ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0'], the MAE is 1.560045171605290

In [None]:
Best k-fold mae comes from features: 
['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat', 'cat_1.0', 'cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0', 'cat_8.0', 'cat_9.0', 'cat_10.0']
Value: 1.4459629108551826

# Tune features
Since the categorical variables were useful, let's tune them slightly for various cutoffs and attempt to reduce the number of categorical variables by classifying clusters as low, medium, high

In [None]:
# Get coefficients for original good models to get an idea of what reasonable cluster cutoffs could be
features = ['az_t_pred', 'ze_t_pred', 'cat_x', 'cat_y', 'cat_z', 'mse_cat', 'cat_1.0','cat_2.0', 'cat_3.0', 'cat_4.0', 'cat_5.0', 'cat_6.0', 'cat_7.0','cat_8.0', 'cat_9.0', 'cat_10.0']
mae = run_lr(X_train, y_train, features)

In [602]:
# Tune cutoffs for skews with cluster cutoffs 4 and 7
print("Running")
cutoffs = np.arange(.5, 1, .05)
n = len(cutoffs)
maes_lr = np.zeros(n)
maes_h = np.zeros(n)
i = 0
for i in range(0,n): 
    cutoff = cutoffs[i]
    print("The cutoff is " + str(cutoff))
    V = X[['az_t_pred', 'ze_t_pred', 'num_clusters','mse_cat','per_x', 'per_y', 'per_z']]
    V['x_skew'] = [(val > cutoff) for val in V.per_x]
    V['y_skew'] = [(val > cutoff) for val in V.per_y]
    V['z_skew'] = [(val > cutoff) for val in V.per_z]
    V['low_cluster'] = [(c < 4) for c in V.num_clusters]
    V['high_cluster'] = [(c >= 8) for c in V.num_clusters]
    V.replace({False: 0, True: 1}, inplace=True)
    
    V_train, V_val, y_train, y_val = train_test_split(V, y, 
                                                             shuffle = True,
                                                             random_state = 134, 
                                                             test_size = .25)
    feature = ['az_t_pred', 'ze_t_pred', 'x_skew', 'y_skew', 'z_skew', 'mse_cat', 'low_cluster', 'high_cluster']
    mae_lr = run_lr(V_train, y_train, feature)
    mae_h = run_sgd(V_train, y_train, feature, 'huber')
    maes_lr[i] = mae_lr
    maes_h[i] = mae_h
    print("The linear regression mae is " + str(mae_lr))
    print("The huber regression mae is " + str(mae_h))
    i += 1

Running
The cutoff is 0.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.508651054534671
The huber regression mae is 1.4491486596684073
The cutoff is 0.55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5086545320799822
The huber regression mae is 1.4491314797005057
The cutoff is 0.6000000000000001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5086410947642679
The huber regression mae is 1.4502039169873284
The cutoff is 0.6500000000000001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5085738058788245
The huber regression mae is 1.4489026860837613
The cutoff is 0.7000000000000002


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5085254238604633
The huber regression mae is 1.4484286511643447
The cutoff is 0.7500000000000002


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.508378976015092
The huber regression mae is 1.449504454859626
The cutoff is 0.8000000000000003


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5082524414724656
The huber regression mae is 1.4481613231424049
The cutoff is 0.8500000000000003


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5080094872742822
The huber regression mae is 1.448532162388818
The cutoff is 0.9000000000000004


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5077816036883729
The huber regression mae is 1.448053028077291
The cutoff is 0.9500000000000004


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.50744727457383
The huber regression mae is 1.447934764357909


In [572]:
print(maes_lr)
print(maes_h)
print(cutoffs)

[1.50865105 1.50865453 1.50864109 1.50857381 1.50852542 1.50837898
 1.50825244 1.50800949 1.5077816  1.50744727]
[1.44916114 1.44958978 1.44979919 1.44833299 1.44943984 1.44840595
 1.4487227  1.44829029 1.44887322 1.44646112]
[0.5  0.55 0.6  0.65 0.7  0.75 0.8  0.85 0.9  0.95]


In [None]:
# Graph for different cutoffs
plt.figure(figsize=(8,6))

plt.scatter(cutoffs,
               maes_lr, 
           c = 'orange')
plt.scatter(cutoffs,
               maes_h, 
           c = 'blue')

plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

ax.set_xlabel("Cutoff")
ax.set_ylabel("MAE")
plt.title("Cutoffs for skewed data")

plt.show()

Takeaway: The ideal cutoff appears to be around 90 degrees. 

In [573]:
# Tune cutoffs for skews with cluster cutoffs 3 and 7

print("Running")
cutoffs = np.arange(.5, 1, .05)
n = len(cutoffs)
maes_lr = np.zeros(n)
maes_h = np.zeros(n)
i = 0
for i in range(0,n): 
    cutoff = cutoffs[i]
    print("The cutoff is " + str(cutoff))
    V = X[['az_t_pred', 'ze_t_pred', 'num_clusters','mse_cat','per_x', 'per_y', 'per_z']]
    V['x_skew'] = [(val > cutoff) for val in V.per_x]
    V['y_skew'] = [(val > cutoff) for val in V.per_y]
    V['z_skew'] = [(val > cutoff) for val in V.per_z]
    V['low_cluster'] = [(c < 3) for c in V.num_clusters]
    V['high_cluster'] = [(c >= 8) for c in V.num_clusters]
    V.replace({False: 0, True: 1}, inplace=True)
    
    V_train, V_val, y_train, y_val = train_test_split(V, y, 
                                                             shuffle = True,
                                                             random_state = 134, 
                                                             test_size = .25)
    feature = ['az_t_pred', 'ze_t_pred', 'x_skew', 'y_skew', 'z_skew', 'mse_cat', 'low_cluster', 'high_cluster']
    mae_lr = run_lr(V_train, y_train, feature)
    mae_h = run_sgd(V_train, y_train, feature, 'huber')
    maes_lr[i] = mae_lr
    maes_h[i] = mae_h
    print("The linear regression mae is " + str(mae_lr))
    print("The huber regression mae is " + str(mae_h))
    i += 1

Running
The cutoff is 0.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5082585118844432
The huber regression mae is 1.4474887120420745
The cutoff is 0.55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.508264154401136
The huber regression mae is 1.4474392370339833
The cutoff is 0.6000000000000001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5082568711271818
The huber regression mae is 1.447926259694236
The cutoff is 0.6500000000000001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5081993405019127
The huber regression mae is 1.4474979069399623
The cutoff is 0.7000000000000002


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5081605425447262
The huber regression mae is 1.4486260777471354
The cutoff is 0.7500000000000002


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5080302564702923
The huber regression mae is 1.447889712176469
The cutoff is 0.8000000000000003


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5079226462109412
The huber regression mae is 1.4473179147406896
The cutoff is 0.8500000000000003


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5077044126302737
The huber regression mae is 1.4473443836017554
The cutoff is 0.9000000000000004


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5075001807950428
The huber regression mae is 1.446154045736834
The cutoff is 0.9500000000000004


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

The linear regression mae is 1.5071996990590406
The huber regression mae is 1.4460004449091488


In [574]:
print(maes_lr)
print(maes_h)
print(cutoffs)

[1.50825851 1.50826415 1.50825687 1.50819934 1.50816054 1.50803026
 1.50792265 1.50770441 1.50750018 1.5071997 ]
[1.44748871 1.44743924 1.44792626 1.44749791 1.44862608 1.44788971
 1.44731791 1.44734438 1.44615405 1.44600044]
[0.5  0.55 0.6  0.65 0.7  0.75 0.8  0.85 0.9  0.95]


[1.50825851 1.50826415 1.50825687 1.50819934 1.50816054 1.50803026
 1.50792265 1.50770441 1.50750018 1.5071997 ]
[1.44748871 1.44743924 1.44792626 1.44749791 1.44862608 1.44788971
 1.44731791 1.44734438 1.44615405 1.44600044]
[0.5  0.55 0.6  0.65 0.7  0.75 0.8  0.85 0.9  0.95]

In [577]:
# Tune cutoffs for skews with cluster cutoffs 2 and 9
print("Running")
cutoffs = np.arange(.8, 1, .05)
n = len(cutoffs)
maes_lr = np.zeros(n)
maes_h = np.zeros(n)
i = 0
for i in range(0,n): 
    cutoff = cutoffs[i]
    print("The cutoff is " + str(cutoff))
    V = X[['az_t_pred', 'ze_t_pred', 'num_clusters','mse_cat','per_x', 'per_y', 'per_z']]
    V['x_skew'] = [(val > cutoff) for val in V.per_x]
    V['y_skew'] = [(val > cutoff) for val in V.per_y]
    V['z_skew'] = [(val > cutoff) for val in V.per_z]
    V['low_cluster'] = [(c < 2) for c in V.num_clusters]
    V['high_cluster'] = [(c > 9) for c in V.num_clusters]
    V.replace({False: 0, True: 1}, inplace=True)
    
    V_train, V_val, y_train, y_val = train_test_split(V, y, 
                                                             shuffle = True,
                                                             random_state = 134, 
                                                             test_size = .25)
    feature = ['az_t_pred', 'ze_t_pred', 'x_skew', 'y_skew', 'z_skew', 'mse_cat', 'low_cluster', 'high_cluster']
    mae_lr = run_lr(V_train, y_train, feature)
    mae_h = run_sgd(V_train, y_train, feature, 'huber')
    maes_lr[i] = mae_lr
    maes_h[i] = mae_h
    print("The linear regression mae is " + str(mae_lr))
    print("The huber regression mae is " + str(mae_h))
    i += 1

Running
The cutoff is 0.8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

[[ 0.18051694 -0.01565296  0.01032815  0.04945566  0.03202425 -0.0417141
   0.16547307 -0.70941001]
 [-0.00913826  0.15541085  0.00167505  0.00387142  0.05020645  0.05965399
  -0.03252094 -0.20293921]]
[[ 0.18155117 -0.01717815 -0.00589003  0.05689884  0.03704312 -0.04248639
   0.17122941  0.32210757]
 [-0.00829287  0.15513062  0.0032784   0.00546635  0.04886067  0.05785393
  -0.03025072 -0.276125  ]]
[[ 0.18045666 -0.0136946   0.00212204  0.05320397  0.04382109 -0.0438815
   0.16166194 -0.70939719]
 [-0.00840434  0.15315     0.00242368  0.00592167  0.0477392   0.05103778
  -0.03200013 -0.20450033]]
[[ 0.18023586 -0.01792074  0.00571268  0.05648074  0.04013085 -0.05101004
   0.16114709 -2.77410025]
 [-0.00816924  0.15570997  0.0056558   0.00486077  0.04647955  0.05397528
  -0.03325066 -0.05977422]]
[[ 0.18127814 -0.01853172  0.00802505  0.0515932   0.04321307 -0.04138959
   0.16223957 -0.70944938]
 [-0.00837184  0.15302442  0.00450888  0.00291596  0.04880645  0.05249022
  -0.0342516  -

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

[[ 0.18071209 -0.01525307  0.01190117  0.05531756  0.03746963 -0.04120979
   0.16172032 -0.71196996]
 [-0.00908166  0.15531471  0.00331858  0.00423291  0.05192179  0.06012882
  -0.03455703 -0.20486534]]
[[ 0.18173603 -0.01682031 -0.00138306  0.06498727  0.04129029 -0.04177344
   0.16725187  0.32531114]
 [-0.00816476  0.15520174  0.00454586  0.00651452  0.05341391  0.05843141
  -0.03276443 -0.27582579]]
[[ 0.18064988 -0.01341311  0.00517798  0.0597575   0.04786532 -0.04327868
   0.15773337 -0.71223561]
 [-0.00829974  0.15313389  0.00363032  0.00635704  0.0509475   0.05151826
  -0.03417544 -0.2063717 ]]
[[ 0.1803927  -0.01762039  0.00757692  0.06386201  0.04339381 -0.05046531
   0.15729762 -2.78534315]
 [-0.00809073  0.15570421  0.00775167  0.0050795   0.04932967  0.05448594
  -0.03546675 -0.06488031]]
[[ 0.18146738 -0.01827456  0.01019355  0.05928513  0.04627153 -0.04094519
   0.1583446  -0.71248241]
 [-0.00826456  0.15301289  0.00655243  0.00381529  0.05166905  0.05296078
  -0.03652923

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

[[ 0.18086437 -0.0150153   0.01308629  0.06227286  0.03919902 -0.04119469
   0.15773691 -0.71149497]
 [-0.00895093  0.15530568  0.00313251  0.00584452  0.05557421  0.06041354
  -0.03719355 -0.20643332]]
[[ 1.81837764e-01 -1.66074830e-02 -9.18877938e-06  6.94046114e-02
   4.30085993e-02 -4.16576877e-02  1.63498566e-01  3.24014523e-01]
 [-8.05733020e-03  1.55168230e-01  5.02177619e-03  8.59063661e-03
   5.59425152e-02  5.87369774e-02 -3.54294987e-02 -2.76967899e-01]]
[[ 0.18084809 -0.01302492  0.00912696  0.06728514  0.05215011 -0.04297855
   0.1526056  -0.71347702]
 [-0.00820389  0.1531116   0.00306355  0.00726922  0.05430247  0.05177644
  -0.03663959 -0.20792249]]
[[ 0.1805536  -0.01729588  0.01275029  0.07035134  0.04563429 -0.05022967
   0.15250064 -2.78625781]
 [-0.00799167  0.1556439   0.00821269  0.00633167  0.05141276  0.05475132
  -0.03784494 -0.06164946]]
[[ 0.18153831 -0.01818995  0.0124765   0.06469018  0.04549358 -0.04097157
   0.1546009  -0.71249584]
 [-0.00811176  0.153040

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['x_skew'] = [(val > cutoff) for val in V.per_x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['y_skew'] = [(val > cutoff) for val in V.per_y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  V['z_skew'] = [(val > cutoff) for val in V.per_z]
A value is trying to be set on a copy of a slice from a

[[ 0.18111919 -0.0145406   0.01170842  0.07042539  0.0455025  -0.04155483
   0.15259709 -0.69319514]
 [-0.00887078  0.15521347  0.00426875  0.00593283  0.05683516  0.06020856
  -0.03948094 -0.20701317]]
[[ 0.18215294 -0.01598115  0.0015753   0.0757626   0.05272604 -0.04173756
   0.15750561  0.32517772]
 [-0.00798371  0.15506318  0.00698022  0.00875513  0.05668201  0.0585018
  -0.03782274 -0.27911075]]
[[ 0.18120714 -0.01236339  0.00851913  0.07571998  0.06266358 -0.04314499
   0.14600102 -0.69404076]
 [-0.0081047   0.15309026  0.00444415  0.00769987  0.05637689  0.05168549
  -0.03916763 -0.2078642 ]]
[[ 0.18085818 -0.01668094  0.01336647  0.0780428   0.05390299 -0.05051158
   0.14640676 -2.72449777]
 [-0.00790203  0.15560553  0.00963508  0.00680809  0.05284622  0.05455748
  -0.04019554 -0.05854675]]
[[ 0.18183919 -0.01759086  0.01449381  0.07403178  0.05328315 -0.04128848
   0.14823346 -0.69249435]
 [-0.00799712  0.15302592  0.00844434  0.00727846  0.05783345  0.05302673
  -0.04223305 

In [578]:
print(maes_lr)
print(maes_h)

[1.50619523 1.50602608 1.5058829  1.50566216]
[1.4430938  1.44297574 1.44299279 1.44302201]


[1.50619523 1.50602608 1.5058829  1.50566216]
[1.4430938  1.44297574 1.44299279 1.44302201]

Based on looking at the coefficients and performing a few tests, it seems the best cutoff is around .9 for the skew and around 2 and 9 for the clusters