In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch
from sklearn import preprocessing
from sklearn import ensemble
from utils import loadData, saveData, getNum, getPePerWF, saveans, lossfunc_eval, lossfunc_train
import lightgbm as lgb
from tqdm import tqdm

In [2]:
# Path to the HDF5 file that loadData reads as the test set.
testpath = "data/final.h5"
# Floor applied to each waveform's baseline-subtracted integral in the second
# feature-extraction cell; waveforms sitting at this floor are excluded from the sums.
thres = 300

In [3]:
print('start processing testset')
# loadData comes from the project's utils module; the cell output shows it reading the
# "Waveform" HDF5 dataset. The result is indexed by field name in later cells
# ('Waveform', 'ChannelID', 'EventID').
testWF = loadData(testpath, 'test')
print('testset loaded')

start processing testset
Structure of data:
<HDF5 dataset "Waveform": shape (12178193,), type "|V2008"> Waveform /Waveform
testset loaded


In [4]:
# getNum is a project helper (utils). wfIndices[1:-1] is used below as np.split
# boundaries; presumably numPEW holds per-event waveform counts and wfIndices the
# event boundary offsets -- TODO confirm against utils.getNum.
numPEW, wfIndices = getNum(testWF)

In [5]:
# Build three per-waveform features for the PE-count model:
#   intTestWF        - sum of the thresholded samples,
#   pointsPerTestWF  - number of samples above zero after thresholding,
#   pePerTestWFCalc  - length of the array returned by getPePerWF.
pePerTestWF = np.array([])  # placeholder; not filled anywhere in this cell
splitWFChannels = np.split(testWF['ChannelID'], wfIndices[1:-1])

# Samples below 918 are mapped to (918 - sample); every other sample becomes 0.
denoisedTestWF = np.where(testWF['Waveform'] < 918, 918-testWF['Waveform'], 0)
intTestWF = denoisedTestWF.sum(axis=1)
pointsPerTestWF = (denoisedTestWF > 0).sum(axis=1)

numTestWF = denoisedTestWF.shape[0]
pePerTestWFCalc = np.empty(numTestWF)
for index in tqdm(range(numTestWF)):
    # getPePerWF is a project helper; only the size of its result is kept.
    peaks = getPePerWF(denoisedTestWF[index], pointsPerTestWF[index])
    pePerTestWFCalc[index] = peaks.shape[0]

100%|██████████| 12178193/12178193 [39:49<00:00, 5095.88it/s] 


In [6]:
# Predict one value per waveform from the three per-waveform features,
# using the LightGBM model stored in ./modelPePerWF.txt.
gbmForPePerWF = lgb.Booster(model_file='./modelPePerWF.txt')
# One row per waveform, columns in the order the model is fed here.
featuresPerWF = np.column_stack((intTestWF, pointsPerTestWF, pePerTestWFCalc))
pePerWF = gbmForPePerWF.predict(featuresPerWF)

In [7]:
# Aggregate the per-waveform predictions into per-event features (sum, mean, std).
# Despite "Train" in the name, these are the test-set predictions in pePerWF,
# split at the same boundaries (wfIndices[1:-1]) used for the channel split above.
splitPePerTrainWFFinal = np.split(pePerWF, wfIndices[1:-1])

# Size the outputs from the actual number of chunks rather than a hard-coded 4000:
# with a different event count the fixed size either raises IndexError or leaves
# uninitialised np.empty values that would flow straight into the model.
numEvents = len(splitPePerTrainWFFinal)
peTotal = np.empty(numEvents)
peMean = np.empty(numEvents)
peStd = np.empty(numEvents)
for index, pePerTrainWFFinalChunk in enumerate(tqdm(splitPePerTrainWFFinal)):
    peTotal[index] = np.sum(pePerTrainWFFinalChunk)
    peMean[index] = np.mean(pePerTrainWFFinalChunk)
    peStd[index] = np.std(pePerTrainWFFinalChunk)

100%|██████████| 4000/4000 [00:00<00:00, 12011.62it/s]


In [8]:
# Predict the per-event answer from the three aggregated features,
# using the LightGBM model stored in ./modelP.txt.
gbmForP = lgb.Booster(model_file='./modelP.txt')
# One row per event: total, mean and standard deviation of the per-waveform predictions.
featuresPerEvent = np.column_stack((peTotal, peMean, peStd))
answerP = gbmForP.predict(featuresPerEvent)

In [10]:
# Write the first pipeline's predictions via the project's saveans helper (utils).
saveans(answerP, './ans/ans1.h5')

In [5]:
# Second feature pipeline: build one row of three features per event
# (waveform count, thresholded integral, peak-position-weighted integral).
#
# np.unique(..., return_index=True) gives the first row index of every distinct
# EventID; appending len(testWF) closes the last group so np.diff yields rows per event.
# NOTE(review): this assumes the rows of one event are contiguous and that events
# appear in ascending EventID order -- confirm against the data file.
w_test, j_test = np.unique(testWF['EventID'], return_index=True)
j_test = np.append(j_test, len(testWF))
numPEWtest = np.diff(j_test)
# Invert every sample around 1000 (1000 - sample).
invTestWF = 1000-testWF['Waveform']

numFilteredTest = np.array([])  # never filled in this cell; kept in case another cell reads the name

eventChunks = np.split(invTestWF, j_test[1:-1])
# Preallocate one slot per event instead of growing the arrays with np.append,
# which reallocates and copies the whole array on every iteration.
intWFtest = np.empty(len(eventChunks))
intWFdRtest = np.empty(len(eventChunks))
for eventIdx, arr in enumerate(tqdm(eventChunks)):

    # arr = wavelet_denoising(arr)   (disabled experiment)

    # Baseline estimate per waveform: mean of its last 100 samples.
    tail = np.mean(arr[:, -100:], axis=1)
    # Baseline-subtracted integral. The factor 1000 presumably equals the number of
    # samples per waveform -- TODO confirm.
    intpWF = np.sum(arr, axis=1)-tail*1000
    # Clamp small integrals to thres, then remove the clamped entries from the sum.
    filtered = np.maximum(intpWF, thres)
    atFloor = filtered==thres
    numFiltered = np.sum(atFloor)
    intWFtest[eventIdx] = np.sum(filtered)-numFiltered*thres
    # 1-based position of each waveform's maximum sample, as a column vector.
    maxWF = np.argmax(arr, axis=1).reshape(-1,1) + 1
    # NOTE(review): maxWF has shape (n, 1) while filtered has shape (n,), so this
    # division broadcasts to an (n, n) matrix and the sum below runs over every
    # (peak position, waveform) pair instead of pairing each waveform with its own
    # peak. It is preserved unchanged because the saved GBM models were presumably
    # trained on features built the same way -- confirm against the training code
    # before altering it.
    filtdR = filtered/maxWF*500*(1-atFloor)
    intWFdRtest[eventIdx] = np.sum(filtdR)

Xtest = np.hstack((numPEWtest.reshape(-1, 1), intWFtest.reshape(-1, 1), intWFdRtest.reshape(-1, 1)))
# saveData(Xtest, np.array([0]), savetestpath)
print('testset shape: ', Xtest.shape)

100%|██████████| 4000/4000 [09:19<00:00,  7.15it/s]

testset shape:  (4000, 3)





In [6]:
# Load the three saved LightGBM boosters: the two intermediate stages (gbm1, gbm2)
# and the final model (gbm), in that order.
gbm1, gbm2, gbm = (
    lgb.Booster(model_file=modelFile)
    for modelFile in ('./FINALGBM1.txt', './FINALGBM2.txt', './FINALGBM.txt')
)

In [7]:
# Stacked prediction: each stage's output is appended as an extra feature column
# for the next stage, and the final booster produces the saved answer.
expname = 'FINAL'
Xt = Xtest

stage1Pred = gbm1.predict(Xt, num_iteration=gbm1.best_iteration)
Xt1 = np.column_stack((Xt, stage1Pred))

stage2Pred = gbm2.predict(Xt1, num_iteration=gbm2.best_iteration)
Xt2 = np.column_stack((Xt1, stage2Pred))

ans_gbm = gbm.predict(Xt2, num_iteration=gbm.best_iteration)
saveans(ans_gbm, './ans/'+expname+'.h5')