# Base Notebook

Input data files are available in the read-only "../input/" directory
For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split

import h2o
from h2o.automl import H2OAutoML

import os
import glob

## Look at the data

In [2]:
# Print all files in the input directory

neutrino_folder = 'E:\\Neutrino'

for dirname, _, filenames in os.walk(neutrino_folder) :
    for filename in filenames:
        print(os.path.join(dirname, filename))

E:\Neutrino\sample_submission.parquet
E:\Neutrino\sensor_geometry.csv
E:\Neutrino\test_meta.parquet
E:\Neutrino\train_meta.parquet
E:\Neutrino\test\batch_661.parquet
E:\Neutrino\train\batch_1.parquet
E:\Neutrino\train\batch_10.parquet
E:\Neutrino\train\batch_100.parquet
E:\Neutrino\train\batch_101.parquet
E:\Neutrino\train\batch_102.parquet
E:\Neutrino\train\batch_103.parquet
E:\Neutrino\train\batch_104.parquet
E:\Neutrino\train\batch_105.parquet
E:\Neutrino\train\batch_106.parquet
E:\Neutrino\train\batch_107.parquet
E:\Neutrino\train\batch_108.parquet
E:\Neutrino\train\batch_109.parquet
E:\Neutrino\train\batch_11.parquet
E:\Neutrino\train\batch_110.parquet
E:\Neutrino\train\batch_111.parquet
E:\Neutrino\train\batch_112.parquet
E:\Neutrino\train\batch_113.parquet
E:\Neutrino\train\batch_114.parquet
E:\Neutrino\train\batch_115.parquet
E:\Neutrino\train\batch_116.parquet
E:\Neutrino\train\batch_117.parquet
E:\Neutrino\train\batch_118.parquet
E:\Neutrino\train\batch_119.parquet
E:\Neutrin

In [3]:
# Look at the first parquet file

train_batch_example = pd.read_parquet(neutrino_folder + '\\train\\batch_1.parquet')
train_batch_example.head()

Unnamed: 0_level_0,sensor_id,time,charge,auxiliary
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24,3918,5928,1.325,True
24,4157,6115,1.175,True
24,3520,6492,0.925,True
24,5041,6665,0.225,True
24,2948,8054,1.575,True


In [4]:
# Look at the first parquet shape
train_batch_example.shape

(32792416, 4)

In [5]:
# look at the sample submission file
sample_submission = pd.read_parquet(neutrino_folder + '\\sample_submission.parquet')
sample_submission.head()

Unnamed: 0,event_id,azimuth,zenith
0,2092,1,1
1,7344,1,1
2,9482,1,1


In [6]:
# look at the test file
test = pd.read_parquet(neutrino_folder + '\\test\\batch_661.parquet')
test.head()

Unnamed: 0_level_0,sensor_id,time,charge,auxiliary
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2092,4066,6170,1.275,True
2092,3512,6374,0.975,True
2092,897,6378,1.475,True
2092,2060,6590,0.925,True
2092,3072,6625,1.075,True


In [7]:
# look at the train metadata file
train_metadata = pd.read_parquet(neutrino_folder + '\\train_meta.parquet')
train_metadata.head()

Unnamed: 0,batch_id,event_id,first_pulse_index,last_pulse_index,azimuth,zenith
0,1,24,0,60,5.029555,2.087498
1,1,41,61,111,0.417742,1.549686
2,1,59,112,147,1.160466,2.401942
3,1,67,148,289,5.845952,0.759054
4,1,72,290,351,0.653719,0.939117


In [8]:
# look at the test metadata file
test_metadata = pd.read_parquet(neutrino_folder + '\\test_meta.parquet')
test_metadata.head()

Unnamed: 0,batch_id,event_id,first_pulse_index,last_pulse_index
0,661,2092,0,298
1,661,7344,299,334
2,661,9482,335,377


In [9]:
# look at the sensor geometry csv file
sensor_geometry = pd.read_csv(neutrino_folder + '\\sensor_geometry.csv')
sensor_geometry.head()

Unnamed: 0,sensor_id,x,y,z
0,0,-256.14,-521.08,496.03
1,1,-256.14,-521.08,479.01
2,2,-256.14,-521.08,461.99
3,3,-256.14,-521.08,444.97
4,4,-256.14,-521.08,427.95


## Merging files

In [10]:
# Make train and test batches dataframes
'''''
train_batch_files = glob.glob(neutrino_folder + '\\train\\*.parquet')[:100]
test_batch_files = glob.glob(neutrino_folder + '\\test\\*.parquet')[:100]

train_batches = [pd.read_parquet(f) for f in train_batch_files]
test_batches = [pd.read_parquet(f) for f in test_batch_files]
'''

"''\ntrain_batch_files = glob.glob(neutrino_folder + '\\train\\*.parquet')[:100]\ntest_batch_files = glob.glob(neutrino_folder + '\\test\\*.parquet')[:100]\n\ntrain_batches = [pd.read_parquet(f) for f in train_batch_files]\ntest_batches = [pd.read_parquet(f) for f in test_batch_files]\n"

In [11]:
# Merge the train and test examples with the sensor geometry
train_example = train_batch_example.merge(sensor_geometry, left_on='sensor_id', right_index=True)
test_example = test.merge(sensor_geometry, left_on='sensor_id', right_index=True)

In [12]:
train_example.head()

Unnamed: 0_level_0,sensor_id,sensor_id_x,time,charge,auxiliary,sensor_id_y,x,y,z
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
24,3918,3918,5928,1.325,True,3918,303.41,335.64,206.58
2743,3918,3918,10813,0.975,True,3918,303.41,335.64,206.58
3007,3918,3918,10290,0.925,False,3918,303.41,335.64,206.58
3007,3918,3918,10515,0.625,False,3918,303.41,335.64,206.58
3007,3918,3918,10688,0.925,False,3918,303.41,335.64,206.58


In [13]:
# Merge train_meta and test_meta with train and test data
train_example = train_example.merge(train_metadata, left_on='event_id', right_index=True)
test_example = test_example.merge(test_metadata, left_on='event_id', right_index=True)

In [14]:
# Look at the train example
train_example.head()

Unnamed: 0_level_0,sensor_id,sensor_id_x,time,charge,auxiliary,sensor_id_y,x,y,z,batch_id,event_id,first_pulse_index,last_pulse_index,azimuth,zenith
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
24,3918,3918,5928,1.325,True,3918,303.41,335.64,206.58,1,451,2024,2080,6.167555,1.596237
24,4157,4157,6115,1.175,True,4157,-145.45,374.24,212.73,1,451,2024,2080,6.167555,1.596237
24,3520,3520,6492,0.925,True,3520,505.27,257.88,-174.6,1,451,2024,2080,6.167555,1.596237
24,3520,3520,14523,1.325,True,3520,505.27,257.88,-174.6,1,451,2024,2080,6.167555,1.596237
24,5041,5041,6665,0.225,True,5041,-9.68,-79.5,181.0,1,451,2024,2080,6.167555,1.596237


## AutoML with TPOT

### Train test split

In [15]:
# Split the train example into X and Y for azimuth and zenith
X_azi = train_example.drop(columns='azimuth')
Y_azi = train_example['azimuth']
X_zen = train_example.drop(columns='zenith')
Y_zen = train_example['zenith']

In [16]:
Y_azi.head()

event_id
24    6.167555
24    6.167555
24    6.167555
24    6.167555
24    6.167555
Name: azimuth, dtype: float64

In [17]:
# Split the train example into train and test for azimuth
X_azi_train, X_azi_test, Y_azi_train, Y_azi_test = train_test_split(X_azi, Y_azi, test_size=0.2, random_state=42)
# Split the train example into train and test for zenith
X_zen_train, X_zen_test, Y_zen_train, Y_zen_test = train_test_split(X_zen, Y_zen, test_size=0.2, random_state=42)

In [18]:
X_azi_train.head()

Unnamed: 0_level_0,sensor_id,sensor_id_x,time,charge,auxiliary,sensor_id_y,x,y,z,batch_id,event_id,first_pulse_index,last_pulse_index,zenith
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1835241,1344,1344,10260,3.025,False,1344,-368.93,-210.23,92.04,10,29867631,5811118,5811173,2.22763
442338,1671,1671,11584,0.375,True,1671,248.15,-111.87,-370.02,3,7203323,6225494,6225529,0.687027
659533,1829,1829,11786,6.125,False,1829,-570.9,-125.14,6.0,4,10745886,9879250,9879289,1.664153
2305027,4980,4980,18117,0.975,True,4980,57.2,-105.52,186.02,12,37529543,16572960,16573023,2.651641
1299554,3054,3054,16709,2.575,False,3054,-481.6,101.39,-417.07,7,21148241,16216023,16216052,1.854019


In [19]:
# Start H2O
h2o.init()

# Convert the train and test data to H2O frames for azimuth and zenith
h2o_X_azi_train = h2o.H2OFrame(pd.concat([X_azi_train, Y_azi_train], axis=1))
h2o_X_azi_test = h2o.H2OFrame(pd.concat([X_azi_test, Y_azi_test], axis=1))

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.361-b09, mixed mode)
  Starting server from C:\Users\shexx\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\shexx\AppData\Local\Temp\tmp8lx8x95e
  JVM stdout: C:\Users\shexx\AppData\Local\Temp\tmp8lx8x95e\h2o_shexx_started_from_python.out
  JVM stderr: C:\Users\shexx\AppData\Local\Temp\tmp8lx8x95e\h2o_shexx_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.2
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_shexx_n6z3r7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.023 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |█████████████████████████████████████████████████████████████████| (done) 100%


In [20]:
# Set features and target names for azimuth
X_azi = X_azi_train.columns.to_list()
y_azi = 'azimuth'

# Set features and target names for zenith
X_zen = X_zen_train.columns.to_list()
y_zen = 'zenith'

In [21]:
X_azi

['sensor_id',
 'sensor_id_x',
 'time',
 'charge',
 'auxiliary',
 'sensor_id_y',
 'x',
 'y',
 'z',
 'batch_id',
 'event_id',
 'first_pulse_index',
 'last_pulse_index',
 'zenith']

In [22]:
# Make the h2o model for azimuth
aml_azi = H2OAutoML(max_runtime_secs=7200, seed=1)

# Train the model for azimuth
aml_azi.train(x=X_azi, y=y_azi, training_frame=h2o_X_azi_train)

AutoML progress: |
09:39:44.995: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,blending
Number of base models (used / total),2/3
# GBM base models (used / total),1/1
# DRF base models (used / total),1/1
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,AUTO
Metalearner nfolds,0
Metalearner fold_column,
Custom metalearner hyperparameters,


In [26]:
#inspect the leaderboard for azimuth
lb_azi = aml_azi.leaderboard
lb_azi.head()

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_BestOfFamily_2_AutoML_1_20230325_93938,1.11774,1.24935,0.789361,0.358715,1.24935
StackedEnsemble_AllModels_2_AutoML_1_20230325_93938,1.11774,1.24935,0.789361,0.358715,1.24935
StackedEnsemble_AllModels_1_AutoML_1_20230325_93938,1.11774,1.24935,0.789361,0.358715,1.24935
StackedEnsemble_AllModels_3_AutoML_1_20230325_93938,1.11774,1.24935,0.789361,0.358715,1.24935
StackedEnsemble_BestOfFamily_3_AutoML_1_20230325_93938,1.11774,1.24935,0.789361,0.358715,1.24935
DRF_1_AutoML_1_20230325_93938,1.14931,1.32091,0.793737,0.351715,1.32091
StackedEnsemble_BestOfFamily_1_AutoML_1_20230325_93938,1.1562,1.3368,0.810338,0.360506,1.3368
GBM_1_AutoML_1_20230325_93938,1.19151,1.4197,0.847271,0.363603,1.4197
XRT_1_AutoML_1_20230325_93938,1.19245,1.42195,0.851997,0.364656,1.42195
GBM_4_AutoML_1_20230325_93938,1.38152,1.90859,1.07089,0.418754,1.90859


In [35]:
# Inspect the model for azimuth
aml_azi.leader

key,value
Stacking strategy,blending
Number of base models (used / total),2/3
# GBM base models (used / total),1/1
# DRF base models (used / total),1/1
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,AUTO
Metalearner nfolds,0
Metalearner fold_column,
Custom metalearner hyperparameters,


In [38]:
# Path to save the model in the local directory
model_path = os.path.abspath('./models/')

# Save the model for azimuth
model = aml_azi.leader
model_path = h2o.save_model(model=model, path=model_path, force=True)

In [47]:
type(model)

h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator

In [48]:
# Get the base model keys
base_models_keys = model.base_models

# Retrieve the actual base models using the model keys
b_models = [h2o.get_model(key) for key in base_models_keys]

# Extract and print hyperparameters for each base model
for i, b_model in enumerate(b_models):
    print(f"Base Model {i+1}: {b_model.model_id}")
    print(f"Model Type: {b_model.algo}")
    print("Hyperparameters:")

    for k, v in b_model.params.items():
        print(f"\t{k}: {v['actual']}")
        print('\n')

Base Model 1: DRF_1_AutoML_1_20230325_93938
Model Type: drf
Hyperparameters:
	model_id: {'__meta': {'schema_version': 3, 'schema_name': 'ModelKeyV3', 'schema_type': 'Key<Model>'}, 'name': 'DRF_1_AutoML_1_20230325_93938', 'type': 'Key<Model>', 'URL': '/3/Models/DRF_1_AutoML_1_20230325_93938'}


	training_frame: {'__meta': {'schema_version': 3, 'schema_name': 'FrameKeyV3', 'schema_type': 'Key<Frame>'}, 'name': 'AutoML_1_20230325_93938_training_Key_Frame__upload_82dc7bfcdd09a2da3a152a4b05aa4ddb.hex', 'type': 'Key<Frame>', 'URL': '/3/Frames/AutoML_1_20230325_93938_training_Key_Frame__upload_82dc7bfcdd09a2da3a152a4b05aa4ddb.hex'}


	validation_frame: {'__meta': {'schema_version': 3, 'schema_name': 'FrameKeyV3', 'schema_type': 'Key<Frame>'}, 'name': 'AutoML_1_20230325_93938_validation_Key_Frame__upload_82dc7bfcdd09a2da3a152a4b05aa4ddb.hex', 'type': 'Key<Frame>', 'URL': '/3/Frames/AutoML_1_20230325_93938_validation_Key_Frame__upload_82dc7bfcdd09a2da3a152a4b05aa4ddb.hex'}


	nfolds: 0


	keep_