# Script to generate a datachek with all the relevant information 
#### The information is stored per per subrun / run that is contained either in the datachecks and the MAGIC weather station

The data we will store in the datacheck will be the following:

#### <span style="color:darkred;">- Time:</span>
<span style="color:darkblue;">- Timestamp [datetime object]</span>\
<span style="color:darkblue;">- Time elapsed [s]</span>
#### <span style="color:darkred;">- Pointing:</span>
<span style="color:darkblue;">- Azimuth [deg]</span>\
<span style="color:darkblue;">- Zenith distance [deg]</span>
#### <span style="color:darkred;">- Intensity profiles:</span>
<span style="color:darkblue;">- Intensity at Half Peak Rate [p.e.]</span>\
<span style="color:darkblue;">- Cosmics Rate at 422p.e. [ev / s / p.e.]</span>\
<span style="color:darkblue;">- Delta Cosmics Rate at 422p.e. [ev / s / p.e.]</span>\
<span style="color:darkblue;">- Cosmics Spectral Index []</span>\
<span style="color:darkblue;">- Light Yield [p.e./p.e.]</span>
#### <span style="color:darkred;">- Weather:</span>
<span style="color:darkblue;">- Temperature [Cº]</span>\
<span style="color:darkblue;">- Pressure [mmHg]</span>\
<span style="color:darkblue;">- Humidity [%]</span>\
<span style="color:darkblue;">- Wind Speed [km/h]</span>\
<span style="color:darkblue;">- Wind Gust [km/h]</span>\
<span style="color:darkblue;">- Wind Speed Average [km/h]</span>\
<span style="color:darkblue;">- TNG Dust [$\micro g/m^3$]</span>\
<span style="color:darkblue;">- TNG Seeing [arcsecond]</span>\
<span style="color:darkblue;">- Rain [tbd]</span>



## Datacheck `cosmics_intensity_spectrum` (subrun-wise)
Contains:\
yyyymmdd, ra_tel, dec_tel, cos_zenith, az_tel, runnumber,
       subrun, time, elapsed_time, corrected_elapsed_time,
       cosmics_rate, cosmics_cleaned_rate, intensity_at_half_peak_rate,
       ZD_corrected_intensity_at_half_peak_rate, cosmics_peak_rate,
       ZD_corrected_cosmics_peak_rate, cosmics_rate_at_422_pe,
       ZD_corrected_cosmics_rate_at_422_pe, cosmics_spectral_index,
       ZD_corrected_cosmics_spectral_index, intensity_spectrum_fit_p_value,
       intensity_at_reference_rate, diffuse_nsb_std,
       num_star_affected_pixels, anomalous_low_intensity_peak

## Datachek `runsummary` (run-wise)
Contains:\
runnumber, time, elapsed_time, min_altitude, mean_altitude,
       max_altitude, min_azimuth, max_azimuth, mean_azimuth, mean_ra,
       mean_dec, num_cosmics, num_pedestals, num_flatfield,
       num_unknown_ucts_trigger_tags, num_wrong_ucts_tags_in_cosmics,
       num_wrong_ucts_tags_in_pedestals, num_wrong_ucts_tags_in_flatfield,
       num_ucts_jumps, num_unknown_tib_trigger_tags,
       num_wrong_tib_tags_in_cosmics, num_wrong_tib_tags_in_pedestals,
       num_wrong_tib_tags_in_flatfield, num_pedestals_after_cleaning,
       num_contained_mu_rings, ff_charge_mean, ff_charge_mean_err,
       ff_charge_stddev, ff_time_mean, ff_time_mean_err,
       ff_time_stddev, ff_rel_time_stddev, ped_charge_mean,
       ped_charge_mean_err, ped_charge_stddev,
       ped_fraction_pulses_above10, ped_fraction_pulses_above30,
       cosmics_fraction_pulses_above10, cosmics_fraction_pulses_above30,
       mu_effi_mean, mu_effi_stddev, mu_width_mean, mu_width_stddev,
       mu_hg_peak_sample_mean, mu_hg_peak_sample_stddev,
       mu_intensity_mean, mean_number_of_pixels_nearby_stars
       
## Weather Station data
Contains:\
sun_alt, sun_az, fBits, mjd, temperature, pressure,
       windDirection, humidity, windSpeedCurrent, windGust,
       windSpeedAverage, windDirectionAverage, tempSensor, tngDust,
       tngSeeing, rain, state, Any, Mes, DP, diff1, is_dup,
       temperatureR

#### Import needed packages and scripts


In [1]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import astropy.units as u
from datetime import datetime
import pickle, json, sys, os, glob
import pandas as pd
pd.set_option('display.max_columns', None)

# Display settings for Jupyter Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Importing custom utility functions
sys.path.insert(0, os.getcwd() + "/../scripts/")
import auxiliar as aux

### Paths to data and results

In [2]:
# Number of rows for each job
n_rows = 6000

# Root path of this script
root = os.getcwd() + "/"
# Objects directory
root_objects = root + "objects/"

# Directory of all the night-wise datachecks
root_dchecks = "/fefs/aswg/workspace/abelardo.moralejo/data/datachecks/night_wise/DL1_datacheck_"
# Weather station file
ws_database = root_objects + "WS2003-23.h5"

# Some filenames -------------------
# Filename of the datacheck dictionary
fname_datacheck_dict = root_objects + "datacheck_dict.pkl"
# Filename of the total dictionary
fname_total_dict = root_objects + "total_dict.pkl"
# Job list file
fname_job_list = root_objects + "bash_job_list.txt"
# Filename of the relation between run and night
fname_run_night_relation = root_objects + "ws_run_relation.txt"


# Flags for computing or not different parts
# Compute the datacheck dictionary
compute_datacheck_dict = True
# Send all the bash jobs to the cluster
send_jobs = True

# Create needed folders
for dir in [root_objects, root_objects + "output_slurm/"]:
    if not os.path.exists(dir):
        os.makedirs(dir)

### Extracting dates and parameters of all runs/subruns

In [3]:
%%time
if compute_datacheck_dict:

    run_number   = [] # Run numbers
    srun_number  = [] # Subrun numbers
    timestamps   = [] # Timestamps of each subrun
    time_elapsed = [] # Elapsed time of each subrun
    mean_azimuth = [] # Mean azimuth of each run
    mean_zenith_distance = [] # Mean zenith of each run
    mean_ra = [] # Mean right ascension
    mean_dec = [] # Mean dec
    zd_corrected_intensity_at_half_peak_rate = [] # ZD corrected intensity at half peak rate
    zd_corrected_cosmics_rate_at_422_pe = [] # ZD corrected cosmics rate at 422 pe
    zd_corrected_delta_cosmics_rate_at_422_pe = [] # ZD corrected delta cosmics rate at 422 pe
    zd_corrected_cosmics_spectral_index = [] # ZD corrected cosmics spectral index
    zd_corrected_delta_cosmics_spectral_index = [] # ZD corrected delta cosmics spectral index
    light_yield = [] # Light yield

    # All the datachecks for all the nights
    dchecks = np.sort(glob.glob(root_dchecks + "*.h5"))

    # We iterate over all the datachecks
    for i, dcheck in enumerate(dchecks):

        print(f"Analysing... {i:3}/{len(dchecks)}") if i % 30 == 0 else None

        # The datacheck file of the run summary (runwise)
        ds = pd.read_hdf(dcheck, key="runsummary")
        # The datacheck file of the intensity spectrums (subrunwise)
        di = pd.read_hdf(dcheck, key="cosmics_intensity_spectrum")
        
        # Iterating over all the entries of each night, the subruns
        for j in range(len(ds)):

            # Reference run number
            runref = ds["runnumber"].iloc[j]
            
            # Intensity datacheck for only the subruns of the reference run
            di_run = di.query(f"runnumber == {runref}")
            
            # Subrun iteration and storing all the data we are interested in
            for k in range(len(di_run)):

                run_number.append(runref)
                srun_number.append(di_run["subrun"].iloc[k])
                timestamps.append(datetime.fromtimestamp(di_run["time"].iloc[k]))
                time_elapsed.append(di_run["corrected_elapsed_time"].iloc[k])
                mean_azimuth.append(ds["mean_azimuth"].iloc[j])
                mean_zenith_distance.append(np.arccos(di_run["cos_zenith"].iloc[k]))                
                mean_ra.append(ds["mean_ra"].iloc[j])
                mean_dec.append(ds["mean_dec"].iloc[j])
                zd_corrected_intensity_at_half_peak_rate.append(di_run["ZD_corrected_intensity_at_half_peak_rate"].iloc[k])
                zd_corrected_cosmics_rate_at_422_pe.append(di_run["ZD_corrected_cosmics_rate_at_422_pe"].iloc[k])
                zd_corrected_delta_cosmics_rate_at_422_pe.append(di_run["ZD_corrected_delta_cosmics_rate_at_422_pe"].iloc[k])
                zd_corrected_cosmics_spectral_index.append(di_run["ZD_corrected_cosmics_spectral_index"].iloc[k])
                zd_corrected_delta_cosmics_spectral_index.append(di_run["delta_cosmics_spectral_index"].iloc[k])
                light_yield.append(di_run["light_yield"].iloc[k])            

    print(f"Analysing... {len(dchecks):3}/{len(dchecks)}\n")

    # Now we are going to sort looking to the timestamps
    _, run_number = aux.sort_based(run_number, timestamps)
    _, srun_number = aux.sort_based(srun_number, timestamps)
    _, time_elapsed = aux.sort_based(time_elapsed, timestamps)
    _, mean_azimuth = aux.sort_based(mean_azimuth, timestamps)
    _, mean_zenith_distance = aux.sort_based(mean_zenith_distance, timestamps)
    _, mean_ra = aux.sort_based(mean_ra, timestamps)
    _, mean_dec = aux.sort_based(mean_dec, timestamps)
    _, zd_corrected_intensity_at_half_peak_rate = aux.sort_based(zd_corrected_intensity_at_half_peak_rate, timestamps)
    _, zd_corrected_cosmics_rate_at_422_pe = aux.sort_based(zd_corrected_cosmics_rate_at_422_pe, timestamps)
    _, zd_corrected_delta_cosmics_rate_at_422_pe = aux.sort_based(zd_corrected_delta_cosmics_rate_at_422_pe, timestamps)
    _, zd_corrected_cosmics_spectral_index = aux.sort_based(zd_corrected_cosmics_spectral_index, timestamps)
    _, zd_corrected_delta_cosmics_spectral_index = aux.sort_based(zd_corrected_delta_cosmics_spectral_index, timestamps)
    timestamps, light_yield = aux.sort_based(light_yield, timestamps)

    # Creating the data dictionary
    dict_dcheck = {
        "run" : np.array(run_number),
        "srun" : np.array(srun_number),
        "time" : np.array(timestamps),
        "telapsed" : np.array(time_elapsed),
        "az" : np.rad2deg(mean_azimuth),
        "zd" : np.rad2deg(mean_zenith_distance),
        "ra" : np.array(mean_ra),
        "dec" : np.array(mean_dec),
        "ZD_corrected_intensity_at_half_peak_rate" : np.array(zd_corrected_intensity_at_half_peak_rate),
        "ZD_corrected_cosmics_rate_at_422_pe" : np.array(zd_corrected_cosmics_rate_at_422_pe),
        "ZD_corrected_delta_cosmics_rate_at_422_pe" : np.array(zd_corrected_delta_cosmics_rate_at_422_pe),
        "ZD_corrected_cosmics_spectral_index" : np.array(zd_corrected_cosmics_spectral_index),
        "delta_cosmics_spectral_index" : np.array(zd_corrected_delta_cosmics_spectral_index),
        "light_yield" : np.array(light_yield)
    }        

    # Saving the objects in the objects directory
    with open(fname_datacheck_dict, 'wb') as f:
        pickle.dump(dict_dcheck, f, pickle.HIGHEST_PROTOCOL)  
else:
    # To read the file:
    with open(fname_datacheck_dict, 'rb') as f:
            dict_dcheck = pickle.load(f)    

Analysing...   0/529
Analysing...  30/529
Analysing...  60/529
Analysing...  90/529
Analysing... 120/529
Analysing... 150/529
Analysing... 180/529
Analysing... 210/529
Analysing... 240/529
Analysing... 270/529
Analysing... 300/529
Analysing... 330/529
Analysing... 360/529
Analysing... 390/529
Analysing... 420/529
Analysing... 450/529
Analysing... 480/529
Analysing... 510/529
Analysing... 529/529

CPU times: user 5min 51s, sys: 24.6 s, total: 6min 15s
Wall time: 7min 57s


### Creating the total dictionary run-subrun-wise

In [4]:
%%time
total_dict = {}

# We create an entry per run
for run in np.unique(dict_dcheck["run"]):
    total_dict[run] = {}

# Converting dcheck dictionary to total dictionary
for i in range(len(dict_dcheck["run"])):

    total_dict[dict_dcheck["run"][i]][dict_dcheck["srun"][i]] = {
        "time" : dict_dcheck["time"][i],
        "telapsed" : dict_dcheck["telapsed"][i],
        "az" : dict_dcheck["az"][i],
        "zd" : dict_dcheck["zd"][i],
        "ra" : dict_dcheck["ra"][i],
        "dec" : dict_dcheck["dec"][i],
        "ZD_corrected_intensity_at_half_peak_rate" : dict_dcheck["ZD_corrected_intensity_at_half_peak_rate"][i],
        "ZD_corrected_cosmics_rate_at_422_pe" : dict_dcheck["ZD_corrected_cosmics_rate_at_422_pe"][i],
        "ZD_corrected_delta_cosmics_rate_at_422_pe" : dict_dcheck["ZD_corrected_delta_cosmics_rate_at_422_pe"][i],
        "ZD_corrected_cosmics_spectral_index" : dict_dcheck["ZD_corrected_cosmics_spectral_index"][i],
        "delta_cosmics_spectral_index" : dict_dcheck["delta_cosmics_spectral_index"][i],        
        "light_yield" : dict_dcheck["light_yield"][i]
    }

#### Reading the WS table and we reduce it to the part we are interested in

In [5]:
%%time
# Loading the weather station database
df_ws = pd.read_hdf(ws_database)

# Loading the timestamp of each entry in the datacheck dictionary
dates_dcheck = dict_dcheck["time"]

# Getting the min and max dates
maxdate = np.max(dates_dcheck)
mindate = np.min(dates_dcheck)

# Converting the weather station dates to datetime objects
dates_ws = np.array([datetime.fromisoformat(str(d).split(".")[0]) for d in df_ws.index])

# Getting the max date of the weather station
maxdate_ws = np.max(dates_ws)

# Masking the weather station data to the min and max dates of the datacheck dictionary
mask_dates  = ((dates_ws > mindate) & (dates_ws < maxdate))

# Masking also for day data, i.e. sun_alt > 0 we are not interested in 
mask_night = (df_ws["sun_alt"] < 0)

total_mask = (mask_dates & mask_night)

dates_ws = dates_ws[total_mask]
df_ws    = df_ws[total_mask]

### Separating in bunchs of small number of jobs and writting into a txt file

In [6]:
start_indexes = []
end_indexes   = []

i, total = 0, 0
while total < len(dict_dcheck["run"]):
    start_indexes.append(total)
    end_indexes.append(total + n_rows - 1)
    
    i     += 1
    total += n_rows

print(f"With groups of {n_rows} subruns, the number of prepared jobs is {len(start_indexes)}")

# Opening a new txt file with a job per column
file_job_list = open(fname_job_list, "w")

for s, e in zip(start_indexes, end_indexes):
    file_job_list.write(f"{s},{e}\n") 

file_job_list.close()

With groups of 6000 subruns, the number of prepared jobs is 129


### Launching the jobs to the queue

In [7]:
if send_jobs == True:
    # Creating a file to store the results of the jobs
    file_results = open(fname_run_night_relation, "w")
    file_results.write("# Run - Subrun , WS entry id (date in ISO format)")
    file_results.close()
    
    # Launching the jobs
    print("")
    !sh bash_jobs_indexes_ws_run.sh


Sending job 0,5999 to the queue...

Submitted batch job 35016362
Sending job 6000,11999 to the queue...

Submitted batch job 35016363
Sending job 12000,17999 to the queue...

Submitted batch job 35016364
Sending job 18000,23999 to the queue...

Submitted batch job 35016365
Sending job 24000,29999 to the queue...

Submitted batch job 35016366
Sending job 30000,35999 to the queue...

Submitted batch job 35016367
Sending job 36000,41999 to the queue...

Submitted batch job 35016368
Sending job 42000,47999 to the queue...

Submitted batch job 35016369
Sending job 48000,53999 to the queue...

Submitted batch job 35016370
Sending job 54000,59999 to the queue...

Submitted batch job 35016371
Sending job 60000,65999 to the queue...

Submitted batch job 35016372
Sending job 66000,71999 to the queue...

Submitted batch job 35016373
Sending job 72000,77999 to the queue...

Submitted batch job 35016374
Sending job 78000,83999 to the queue...

Submitted batch job 35016375
Sending job 84000,89999 t

### <span style="color:red;">------------------------------------------------------------------------------------------------------------------</span>
### <span style="color:red;">------------------------------------------------------------------------------------------------------------------</span>

### <span style="color:red;"> Wait untill the jobs are processed and then the results need to be fully stored </span>
### <span style="color:red;">------------------------------------------------------------------------------------------------------------------</span>
### <span style="color:red;">------------------------------------------------------------------------------------------------------------------</span>

#### Now we read the results file where we associate each subrun to a entry of the WS data

In [10]:
%%time
# Reading the results
file_results_lines = np.loadtxt(fname_run_night_relation, dtype=str, delimiter=",")

# Creating a dictionary to organise them
dict_results = {}

for line in file_results_lines:

    runsubrun, date_str = line

    run, srun = runsubrun.split("-")
    run = int(run)
    srun = int(srun)

    date_str = date_str if date_str != "None" else None

    try:
        dict_results[run][srun] = date_str    
    except KeyError:
        dict_results[run] = {srun : date_str}
    

CPU times: user 2.24 s, sys: 175 ms, total: 2.41 s
Wall time: 2.55 s


#### Now the weather station data can be added to the total dictionary

In [11]:
%%time
for i, run in enumerate(total_dict.keys()):

    print(f"Adding data... {i:6}/{len(total_dict.keys())} runs") if i % 500 == 0 else None
        
    for srun in total_dict[run].keys():

        try:
            string_date = dict_results[run][srun]

            if string_date != None:
                empty_flag = False
            else:
                empty_flag = True
        except KeyError:
            empty_flag = True

        if not empty_flag:
            try:
                total_dict[run][srun]["weather"] = {
                    "temperature" :        df_ws.loc[string_date]["temperature"],      # degree celsius
                    "pressure" :           df_ws.loc[string_date]["pressure"],         # mmHg
                    "humidity" :           df_ws.loc[string_date]["humidity"],         # %
                    "wind_speed" :         df_ws.loc[string_date]["windSpeedCurrent"], # km/h
                    "wind_gust" :          df_ws.loc[string_date]["windGust"],         # km/h
                    "wind_speed_average" : df_ws.loc[string_date]["windSpeedAverage"], # km/s
                    "tng_dust" :           df_ws.loc[string_date]["tngDust"],          # ug/m3
                    "tng_seeing" :         df_ws.loc[string_date]["tngSeeing"],        # arcseconds
                    "rain" :               df_ws.loc[string_date]["rain"],             #
                }
            except KeyError:
                print(f"KeyError in Run {run}, Subrun {srun} with entry ID {string_date}.")
                empty_flag = True

        if empty_flag:            
            total_dict[run][srun]["weather"] = {
                "temperature" :        None, # degree celsius
                "pressure" :           None, # mmHg
                "humidity" :           None, # %
                "wind_speed" :         None, # km/h
                "wind_gust" :          None, # km/h
                "wind_speed_average" : None, # km/s
                "tng_dust" :           None, # ug/m3
                "tng_seeing" :         None, # arcseconds
                "rain" :               None, #
            } 

Adding data...      0/7771 runs
Adding data...    500/7771 runs
Adding data...   1000/7771 runs
Adding data...   1500/7771 runs
Adding data...   2000/7771 runs
Adding data...   2500/7771 runs
Adding data...   3000/7771 runs
Adding data...   3500/7771 runs
Adding data...   4000/7771 runs
Adding data...   4500/7771 runs
Adding data...   5000/7771 runs
Adding data...   5500/7771 runs
Adding data...   6000/7771 runs
Adding data...   6500/7771 runs
Adding data...   7000/7771 runs
Adding data...   7500/7771 runs
CPU times: user 12min 14s, sys: 1.72 s, total: 12min 16s
Wall time: 12min 16s


#### Saving the dictionary with all the information

In [12]:
# Saving the object
with open(fname_total_dict, 'wb') as f:
    pickle.dump(total_dict, f, pickle.HIGHEST_PROTOCOL)

# # To read the file:
# with open(fname_total_dict, 'rb') as f:
#         total_dict = pickle.load(f)   