In [5]:
### imports

# external modules
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import importlib

# local modules
sys.path.append('../utils')
import csv_utils as csvu
import json_utils as jsonu
import dataframe_utils as dfu
import hist_utils as hu
import autoencoder_utils as aeu
import plot_utils as pu
import generate_data_utils as gdu
import refruns_utils as rru
importlib.reload(csvu)
importlib.reload(jsonu)
importlib.reload(dfu)
importlib.reload(hu)
importlib.reload(aeu)
importlib.reload(pu)
importlib.reload(gdu)
importlib.reload(rru)
sys.path.append('../src')
sys.path.append('../src/classifiers')
sys.path.append('../src/cloudfitters')
import HistStruct
importlib.reload(HistStruct)
import DataLoader
importlib.reload(DataLoader)
import AutoEncoder
importlib.reload(AutoEncoder)
import SeminormalFitter
import GaussianKdeFitter
import HyperRectangleFitter
importlib.reload(SeminormalFitter)
importlib.reload(GaussianKdeFitter)
importlib.reload(HyperRectangleFitter)

<module 'HyperRectangleFitter' from '/eos/home-i01/k/khowey/SWAN_projects/ML4DQMDC-PixelAE/KH-AutoencoderTest/../src/cloudfitters/HyperRectangleFitter.py'>

In [6]:
### Controls
# Central configuration cell: reference/anomalous run selections, the data-taking
# year and the training mode. Each selection maps a run number (as a string) to a
# list of lumisection ranges; [[-1]] is used for runs listed without an explicit
# range (presumably "all lumisections" -- confirm against json_utils).

# Good 'reference' runs, per year
goodrunsls = {
    '2017': {run: [[-1]] for run in ("297056", "297177", "301449")},
    '2018': {run: [[-1]] for run in ("315267",)},
}

# Bad runs, per year
badrunsls = {
    '2017': {
        **{run: [[-1]] for run in ("297287", "297288", "297289",
                                   "299316", "299324", "299326")},
        # only a lumisection sub-range of this run is flagged
        "301086": [[88, 126]],
    },
    # currently disabled for 2018: "317479":[[-1]] and "319847":[[1,35]]
    '2018': {run: [[-1]] for run in ("317480", "317481", "317482")},
}

# Year whose runs are used (a key of the dictionaries above)
year = '2017'

# 'global': train on the full dataset, 'local': train on a few selected runs
training_mode = 'global'

In [7]:
### Dataset Controls

# Histogram types to use, in this order:
# - for TIB layers 1-4: the hit-residual histogram followed by the cluster S/N histogram,
# - then the pixel 'chargeInner' histograms for layers 1-4,
# - then the pixel 'chargeOuter' histograms for layers 1-4.
histnames = (
    [template.format(layer)
     for layer in (1, 2, 3, 4)
     for template in ('NormalizedHitResiduals_TIB__Layer__{}',
                      'Summary_ClusterStoNCorr__OnTrack__TIB__layer__{}')]
    + ['charge{}_PXLayer_{}'.format(side, layer)
       for side in ('Inner', 'Outer')
       for layer in (1, 2, 3, 4)]
)

In [8]:
### Decision to train on entire dataset or a subset
# Defines the three run/lumisection selections used by the following cells:
#   runsls_training, runsls_good, runsls_bad
# Each is either None (no dedicated mask is added for it) or a run -> lumisections
# dictionary.

# Full dataset training
if training_mode == 'global':
    runsls_training = None # no dedicated training selection
    runsls_good = None # no dedicated good-run selection
    runsls_bad = badrunsls[year] # the bad runs defined in the controls cell

# Train on a portion of the dataset
elif training_mode == 'local':
    # train locally on a small set of runs
    # - either on n runs preceding a chosen application run,
    # - or on the run associated as reference to the chosen application run.
    
    # List the runs available in the first histogram file, after the DCS-on selection.
    # NOTE(review): this reads 'DF<year>_<histname>.csv' whereas the data-loading cell
    # reads 'DF<year>B_<histname>.csv' -- confirm which file name is intended.
    available_runs = dfu.get_runs( dfu.select_dcson( csvu.read_csv('../data/DF'+year+'_'+histnames[0]+'.csv') ) )
    run_application = 299316
    if run_application not in available_runs:
        raise Exception('application run {} is not among the available runs'.format(run_application))
    run_application_index = available_runs.index(run_application)
    
    # Choose the training set: either the reference run associated to the
    # application run, or the runs directly preceding the application run.
    usereference = False
    if usereference:
        run_reference = rru.get_reference_run( run_application, jsonfile='../utils/json_allRunsRefRuns.json' )
        if run_reference<0:
            raise Exception('no valid reference run has been defined for run {}'.format(run_application))
        runsls_training = jsonu.tuplelist_to_jsondict([(run_reference,[-1])])
    else:
        ntraining = 5
        offset = 0 # normal case: offset = 0 (just use 5 previous runs)
        training_start = run_application_index-ntraining-offset
        training_stop = run_application_index-offset
        # A negative start index would make the slice wrap around to the end of the
        # list and silently yield an empty or wrong training set.
        if training_start<0:
            raise Exception('only {} runs precede run {}, but ntraining+offset = {} are required'.format(
                run_application_index, run_application, ntraining+offset))
        runsls_training = jsonu.tuplelist_to_jsondict([(el,[-1]) for el in available_runs[training_start:training_stop]])
    
    # Alternative test-set definition (disabled): all bad runs of the year as bad set
    # and the application run as good set.
    #runsls_bad = badrunsls[year]
    #runsls_good = jsonu.tuplelist_to_jsondict([(run_application,[-1])])
    
    # Test sets: the application run is the bad set, the training runs are the good set
    runsls_bad = jsonu.tuplelist_to_jsondict([(run_application,[-1])])
    runsls_good = runsls_training
    
    # Inform user
    print('selected runs/lumisections for training: ')
    print(runsls_training)
    print('selected runs/lumisections as good test set:')
    print(runsls_good)
    print('selected runs/lumisections as bad test set:')
    print(runsls_bad)

else:
    # Fail here rather than with a NameError on runsls_* in a later cell.
    raise Exception("unknown training_mode '{}' (expected 'global' or 'local')".format(training_mode))

In [9]:
### Read in data
# Builds a HistStruct from the csv files (readnew = True) or reloads a previously
# saved one (readnew = False), and sets nbadruns to the number of per-run bad masks.

# Controls
readnew = True
save = False
histstruct_file = 'test.pk1' # used by both the save and the load branch

if readnew:
    
    # Initializations
    dloader = DataLoader.DataLoader()
    histstruct = HistStruct.HistStruct()
    
    # When all three selections are defined (local training), only the runs in their
    # union are kept. On duplicate run numbers the later selection in the tuple wins.
    # This does not depend on the histogram type, so it is computed once.
    runsls_total = None
    if( runsls_training is not None and runsls_good is not None and runsls_bad is not None ):
        runsls_total = {k: v for d in (runsls_training, runsls_good, runsls_bad) for k, v in d.items()}
    
    # Loop over histogram types to store them
    for histname in histnames:
        print('adding {}...'.format(histname))
        
        # Read the dataframe for this histogram type
        filename = '../data/DF'+year+'B_'+histname+'.csv'
        df = dloader.get_dataframe_from_file( filename )
        
        # In case of local training, we can remove most of the histograms
        if runsls_total is not None:
            df = dfu.select_runsls( df, runsls_total )
            
        # Store the data in the histstruct object
        histstruct.add_dataframe( df )
        
    print('found {} histograms'.format(len(histstruct.runnbs)))
    
    # Add named masks; later cells select histograms by combining mask names
    histstruct.add_dcsonjson_mask( 'dcson' )
    histstruct.add_goldenjson_mask('golden' )
    histstruct.add_highstat_mask( 'highstat' )
    histstruct.add_stat_mask( 'lowstat', max_entries_to_bins_ratio=100 )
    if runsls_training is not None: histstruct.add_json_mask( 'training', runsls_training )
    if runsls_good is not None: histstruct.add_json_mask( 'good', runsls_good )
    
    # Number of bad runs = number of per-run masks 'bad0', 'bad1', ...
    nbadruns = 0
    if runsls_bad is not None:
        histstruct.add_json_mask( 'bad', runsls_bad )
        # special case for bad runs: add a mask per run (different bad runs have different characteristics)
        nbadruns = len(runsls_bad.keys())
        for i,badrun in enumerate(runsls_bad.keys()):
            histstruct.add_json_mask( 'bad{}'.format(i), {badrun:runsls_bad[badrun]} )
    
    # Save the histstruct for later access
    if save:
        histstruct.save(histstruct_file)
        
else:
    
    # Load histstruct from storage
    histstruct = HistStruct.HistStruct.load(histstruct_file)
    
    # Count only the per-run masks 'bad<i>'. Matching every name containing 'bad'
    # would also count the combined 'bad' mask and give one more than the
    # readnew branch does.
    nbadruns = len([name for name in histstruct.masks.keys()
                    if name.startswith('bad') and name[3:].isdigit()])
    
# Output to user
print('created a histstruct with the following properties:')
print('- number of histogram types: {}'.format(len(histstruct.histnames)))
print('- number of lumisections: {}'.format(len(histstruct.lsnbs)))
print('- masks: {}'.format(list(histstruct.masks.keys())))

adding NormalizedHitResiduals_TIB__Layer__1...
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'list'>
<class 'list'>
adding Summary_ClusterStoNCorr__OnTrack__TIB__layer__1...
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
adding NormalizedHitResiduals_TIB__Layer__2...
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
adding Summary_ClusterStoNCorr__OnTrack__TIB__layer__2...
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
adding NormalizedHitResiduals_TIB__Layer__3...
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
adding Summary_ClusterStoNCorr__OnTrack__TIB__layer__3...
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
adding NormalizedHitResiduals_TIB__Layer__4...
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class '

In [16]:
### Plot the training and/or test sets
# Useful for local mode to determine if training set is reliable
# and if application run is anomalous

skipthiscell = True

if not skipthiscell:
    
    if training_mode == 'local':
        
        # Training and application runs (training vs good set)
        histstruct.plot_histograms( masknames=[['dcson','highstat','training'],['dcson','highstat','good']],
                                    labellist = ['training','testing'],
                                    colorlist = ['blue','green']
                                  )
        
        # Application run and bad test runs (good vs bad set)
        histstruct.plot_histograms( masknames=[['dcson','highstat','good'],['dcson','highstat','bad']],
                                    labellist = ['good','bad'],
                                    colorlist = ['green','red']
                                  )
        
    elif training_mode == 'global':
        
        # NOTE(review): in global mode runsls_good is None, so the data-loading cell
        # adds no 'good' mask; confirm that plot_histograms accepts this mask name.
        
        # One plot per bad run. Use the number of 'bad<i>' masks that were actually
        # created instead of a fixed list of indices, which only matches a year
        # with exactly seven bad runs.
        for i in range(nbadruns):
            
            # Plot good vs bad set
            histstruct.plot_histograms( masknames=[['dcson','highstat','good'],['dcson','highstat','bad{}'.format(i)]],
                                    labellist = ['typical good histograms','bad'],
                                    colorlist = ['blue','red'],
                                    transparencylist = [0.01,1.]
                                      )

In [None]:
### Extend training with artificial data
# When enabled, stores upsampled copies of the selected histograms as extended
# histograms under the name 'training', one set per histogram type.

# Control for the cell
extendtraining = False

if extendtraining:
    
    # Start from an empty 'training' entry; extended histograms are stored
    # separately from the run/lumisection-indexed histograms.
    histstruct.exthistograms['training'] = {}
    
    for histname in histstruct.histnames:
        
        # Seed histograms for the resampling.
        # option 1: start from 'training' mask
        hists = histstruct.get_histograms( histname=histname,
                                           masknames=['dcson','highstat','training'] )
        # option 2: start from averages of DCS-on data
        #hists = hu.averagehists( 
        #            histstruct.get_histograms( histname=histname, masknames=['dcson','highstat'] ), 
        #            1000 )
        
        print('generating artificial training data for '+histname)
        
        # upsample_hist_set returns three values; only the first is kept
        (exthists,_,_) = gdu.upsample_hist_set( hists, 5e4, doplot=True )
        histstruct.add_exthistograms( 'training', histname, exthists )
        
        ngenerated = len(histstruct.exthistograms['training'][histname])
        print(' -> generated {} histograms'.format(ngenerated))

In [None]:
### Define and train an autoencoder for each element
# For every histogram type, either trains a new autoencoder (trainnew = True) or
# loads a stored one, and registers it as classifier in the histstruct.

# Controls
trainnew = True
save = False
modelloc = '../models/autoencoders_global_training_dcson_highstat_v20210622'
modelbasename = ''

if trainnew:
    
    nepochs = 40 # manual number of epochs
    
    # Masks that define the training set. In local mode the dataframe also holds the
    # application run (kept as bad test set), so the 'training' mask is required to
    # keep that run out of the training data.
    training_masks = ['dcson','highstat']
    if training_mode == 'local': training_masks.append('training')
    
    # The model directory is only needed when models are written.
    # Assumes train_simple_autoencoder does not create it itself -- TODO confirm.
    if save: os.makedirs(modelloc, exist_ok=True)
    
    for histname in histstruct.histnames:
        
        # Choose training set: the artificial histograms if they were generated,
        # otherwise the masked real histograms (only one of the two is fetched)
        if extendtraining:
            hists = histstruct.get_exthistograms( 'training', histname=histname )
        else:
            hists = histstruct.get_histograms( histname=histname, masknames=training_masks )
        print('size of training set: {}'.format(hists.shape))
        
        # Choose whether to save the model
        modelname = modelbasename+'_'+histname+'.h5'
        modelname = os.path.join(modelloc,modelname)
        if not save: modelname = '' # empty string means do not save models
        model = aeu.train_simple_autoencoder(hists,nepochs=nepochs,modelname=modelname,
                                            batch_size=2000
                                            )
        classifier = AutoEncoder.AutoEncoder( model=model )
        histstruct.add_classifier(histname,classifier)
    
else:
    # NOTE(review): mseTop10 is not referenced in this cell; presumably it has to be
    # importable when the stored models are loaded -- confirm before removing.
    from autoencoder_utils import mseTop10
    for histname in histstruct.histnames:
        print('loading model for {}'.format(histname))
        modelpath = modelbasename+'_'+histname+'.h5'
        modelpath = os.path.join(modelloc,modelpath)
        classifier = AutoEncoder.AutoEncoder( modelpath=modelpath )
        histstruct.add_classifier(histname,classifier)