In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,TensorDataset
from pytorch_model_summary import summary
from tsnecuda import TSNE
#from sklearn.preprocessing import StandardScaler

In [None]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

In [None]:
# The .npy inputs were extracted from the file:
#   /global/project/projectdirs/m2676/data/lngs/pgt/raw/geds/LPGTA_r0030_20200718T161311Z_phys_geds_raw.lh5
# The raw file has to be processed with raw_to_dsp.py from pygama first; the waveforms and
# other parameters are then saved into .npy files.

# Load the data set and print the dimensions of each array
import matplotlib.pyplot as plt
import numpy as np

waveforms, psd_sum10, bl_slope = (
    np.load(npy_path)
    for npy_path in (
        'g032_waveforms_phys.npy',
        'g032_psd_sum10_phys.npy',
        'g032_bl_slope_phys.npy',
    )
)

# Rows 0..7649 are used for training, rows 7650..9599 are held out.
# Original note on the held-out slice: "lose 6 observations for validation".
train_data = waveforms[0:7650, :]
test_data, test_psd10, test_bl_slope = (
    waveforms[7650:9600, :],
    psd_sum10[7650:9600],
    bl_slope[7650:9600],
)

print(
    *(arr.shape for arr in (waveforms, train_data, test_data, test_psd10, test_bl_slope)),
    sep='\n',
)

# Waveform 44 is the sample the original author marked as anomalous
plt.plot(np.arange(waveforms.shape[1]), waveforms[44, :])
plt.show()

In [None]:
#Preprocess and convert data to tensors

#scaler = StandardScaler()
#scaled_data = scaler.fit_transform(train_data,y=None) #scales train data 
#print(scaled_data.shape)

# Cast explicitly to float32: torch.tensor() would inherit whatever dtype the .npy file
# holds, while the model's layers are created with float32 weights. as_tensor also
# passes an existing float32 tensor through unchanged, so re-running this cell is harmless.
train_data = torch.as_tensor(train_data, dtype=torch.float32)
test_data = torch.as_tensor(test_data, dtype=torch.float32)

# batch size 
bs = 50 

#train data loader (no batch shuffling: batches follow the row order of the array)
train_ds = TensorDataset(train_data)
train_dl = DataLoader(train_ds, batch_size=bs)

#test data loader
valid_ds = TensorDataset(test_data)
valid_dl = DataLoader(valid_ds, batch_size=bs)

In [None]:
#Send data to the selected device
def preprocess(x):
    """Return ``x`` moved to the notebook-level ``DEVICE`` via ``x.to(DEVICE)``."""
    on_device = x.to(DEVICE)
    return on_device

class WrappedDataLoader:
    """Iterable wrapper that passes every batch of ``dl`` through ``func``.

    Each batch produced by ``dl`` is unpacked into positional arguments
    (``func(*batch)``); iteration yields whatever ``func`` returns.
    """

    def __init__(self, dl, func):
        self.dl, self.func = dl, func

    def __len__(self):
        # Same number of batches as the wrapped loader
        return len(self.dl)

    def __iter__(self):
        for batch in self.dl:
            yield self.func(*batch)
            
# Build the device-moving loaders from the datasets instead of from the current
# train_dl/valid_dl. Rebinding train_dl from itself made this cell fail on a re-run:
# the second wrapper would receive a bare batch tensor and unpack it row by row
# into preprocess().
train_dl = WrappedDataLoader(DataLoader(train_ds, batch_size=bs), preprocess)
valid_dl = WrappedDataLoader(DataLoader(valid_ds, batch_size=bs), preprocess)

In [None]:
#Define Convolutional Autoencoder

class waveform_cae_aobo(nn.Module):
    """1-D convolutional autoencoder for waveforms of 3404 samples.

    The encoder reduces each waveform to a 100-element feature vector and the
    decoder maps that vector back to a 3404-sample waveform. ``forward``
    returns both, so the feature vector can be reused downstream.
    """

    def __init__(self):
        super().__init__()

        # A negative slope of 0 makes this LeakyReLU act as a plain ReLU
        self.lrelu = nn.LeakyReLU(0)

        #Encoder
        self.conv1 = nn.Conv1d(1, 10, kernel_size=20, stride=1)
        self.conv2 = nn.Conv1d(10, 10, kernel_size=20, stride=1)
        self.conv3 = nn.Conv1d(10, 10, kernel_size=20, stride=1)
        self.batchnorm1 = nn.BatchNorm1d(10)
        self.maxpool = nn.MaxPool1d(5)
        self.fc1 = nn.Linear(6390, 980)  # 6390 = 10 channels * 639 samples after conv3
        self.fc2 = nn.Linear(980, 100)
        self.do1 = nn.Dropout(p=0.5)
        self.do2 = nn.Dropout(p=0.5)

        #Decoder
        self.deconv1 = nn.ConvTranspose1d(10, 10, kernel_size=20, stride=1)
        self.deconv2 = nn.ConvTranspose1d(10, 10, kernel_size=20, stride=1)
        self.deconv3 = nn.ConvTranspose1d(10, 1, kernel_size=20, stride=1)
        self.batchnorm2 = nn.BatchNorm1d(10)
        self.fc3 = nn.Linear(100, 980)
        self.fc4 = nn.Linear(980, 6390)
        self.upsample = nn.Upsample(3404)  # stretch the decoder output back to the input length
        self.do3 = nn.Dropout(p=0.5)
        self.do4 = nn.Dropout(p=0.5)

    def forward(self, x):
        """Encode and reconstruct a batch of waveforms.

        Args:
            x: tensor holding ``batch * 3404`` values with the batch on dim 0,
                e.g. shape ``(batch, 3404)``.

        Returns:
            ``(recon, fvec)`` where ``recon`` has shape ``(batch, 3404)`` and
            ``fvec`` has shape ``(batch, 100)``.
        """
        # Take the batch size from the input itself rather than from the notebook-level
        # `bs`, so partial batches and other batch sizes work.
        batch = x.shape[0]

        #Encoding part
        x = x.view(batch, 1, 3404)
        x = self.lrelu(self.conv1(x))   # length 3404 -> 3385
        x = self.maxpool(x)             # 3385 -> 677
        x = self.batchnorm1(x)
        x = self.lrelu(self.conv2(x))   # 677 -> 658
        x = self.lrelu(self.conv3(x))   # 658 -> 639
        x = self.do1(x)
        x = torch.flatten(x, 1)         # (batch, 10 * 639)
        x = self.lrelu(self.fc1(x))
        x = self.do2(x)
        x = self.lrelu(self.fc2(x))
        fvec = x                        # bottleneck feature vector

        #Decoding part
        x = self.lrelu(self.fc3(x))
        x = self.do3(x)
        x = self.lrelu(self.fc4(x))
        x = self.do4(x)
        x = x.view(batch, 10, 639)
        x = self.lrelu(self.deconv1(x))  # 639 -> 658
        x = self.lrelu(self.deconv2(x))  # 658 -> 677
        x = self.batchnorm2(x)
        x = self.lrelu(self.deconv3(x))  # 677 -> 696, single channel
        x = self.upsample(x)             # 696 -> 3404
        x = torch.flatten(x, 1)
        return x, fvec

In [None]:
#Summarize the model on a dummy batch to check that dimensions match between layers

model = waveform_cae_aobo()
dummy_batch = torch.zeros(bs, 3404)
print(summary(model, dummy_batch, show_hierarchical=True))

In [None]:
#training function 
def train(model, num_epochs, batch_size, learning_rate, filename, dataloader=None):
    """Train ``model`` as an autoencoder with MSE loss and save its weights.

    Args:
        model: module whose forward pass returns ``(reconstruction, feature_vector)``.
        num_epochs: number of passes over the training batches.
        batch_size: not used by this function (batching is done by the data
            loader); kept so existing calls keep working.
        learning_rate: learning rate for the Adam optimizer.
        filename: path the model's ``state_dict`` is written to after training.
        dataloader: iterable yielding input batches. Defaults to the
            notebook-level ``train_dl``.

    Returns:
        ``(epochs, losses, wfs, recons)``:
        ``epochs`` is the list of epoch indices, ``losses`` the loss of the last
        batch of each epoch as Python floats, and ``wfs`` / ``recons`` the CPU
        input and reconstruction tensors of every batch of the final epoch.

    Raises:
        ValueError: if ``dataloader`` yields no batches during an epoch.
    """
    if dataloader is None:
        dataloader = train_dl

    torch.manual_seed(42)
    # MSELoss holds no parameters or buffers, so it needs no device placement
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

    epochs = []
    losses = []
    wfs = []
    recons = []

    model.train()
    final_epoch = num_epochs - 1
    for epoch in range(num_epochs):
        loss = None
        for wf in dataloader:
            # Clear gradients first so nothing left over from earlier calls leaks into this step
            optimizer.zero_grad()
            recon, _ = model(wf)
            loss = criterion(recon, wf)
            loss.backward()
            optimizer.step()

            if epoch == final_epoch:  # save waveforms for last epoch only
                wfs.append(wf.detach().cpu())
                recons.append(recon.detach().cpu())

        if loss is None:
            raise ValueError("training dataloader yielded no batches")

        # Store a plain float: keeping the tensor would keep its autograd graph alive
        # and the list could not be converted with np.asarray afterwards.
        epoch_loss = float(loss)
        epochs.append(epoch)
        losses.append(epoch_loss)
        print('Epoch:{}, Loss:{:.4f}'.format(epoch+1, epoch_loss))

    torch.save(model.state_dict(), filename)

    return epochs, losses, wfs, recons

#validating function 
def validate(model, num_epochs, dataloader=None):
    """Run ``model`` in eval mode over the validation batches and collect feature vectors.

    Args:
        model: module whose forward pass returns ``(reconstruction, feature_vector)``.
        num_epochs: number of passes over the batches. The model is in eval mode
            and gradients are disabled, so nothing is updated between passes;
            only the final pass contributes to the returned list.
        dataloader: iterable yielding input batches. Defaults to the
            notebook-level ``valid_dl``.

    Returns:
        List with one CPU feature-vector tensor per batch of the final pass.

    Raises:
        ValueError: if ``dataloader`` yields no batches during a pass.
    """
    if dataloader is None:
        dataloader = valid_dl

    torch.manual_seed(42)
    # MSELoss holds no parameters or buffers, so it needs no device placement
    criterion = nn.MSELoss()

    fvecs = []
    model.eval()
    final_epoch = num_epochs - 1
    with torch.no_grad():
        for epoch in range(num_epochs):
            loss = None
            for wf in dataloader:
                recon, fvec = model(wf)
                loss = criterion(recon, wf)
                if epoch == final_epoch:  # save fvecs for last epoch only
                    fvecs.append(fvec.cpu())

            if loss is None:
                raise ValueError("validation dataloader yielded no batches")

            # The reported value is the loss of the last batch of the pass
            print('Epoch:{}, Loss:{:.4f}'.format(epoch+1, float(loss)))

    return fvecs

                

In [None]:
#Train the model and report the wall-clock time taken
import time

max_epochs, lr = 30, 1e-4

model.to(DEVICE)
start_time = time.time()
epochs, losses, wfs, recons = train(
    model=model,
    num_epochs=max_epochs,
    batch_size=bs,
    learning_rate=lr,
    filename='mse_lr0001_lrelu0_30epochs.pt',
)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
#Join the per-batch outputs saved during the final training epoch into single arrays
train_wfs, train_recons = (
    np.concatenate(batch_list, axis=0) for batch_list in (wfs, recons)
)
print(train_wfs.shape, train_recons.shape, sep='\n')

In [None]:
#plot input and corresponding reconstructed waveform

def plot_reconstruction(wf, recon):
    """Overlay one input waveform and its reconstruction, then show the plot.

    Args:
        wf: 1-D sequence with the input waveform.
        recon: 1-D sequence with the reconstructed waveform.
    """
    #plt.suptitle("Sample input and reconstructed waveform", fontsize=18)
    plt.plot(np.arange(len(wf)), wf, label='input')
    plt.plot(np.arange(len(recon)), recon, label='reconstructed')
    plt.ylabel('ADC', fontsize=20)
    # NOTE(review): the x values are sample indices; the "ns" label assumes
    # one sample per nanosecond -- confirm against the digitizer settings.
    plt.xlabel("ns", fontsize=20)
    plt.legend()
    plt.show()


# Same four training samples as before, drawn by one helper instead of four copied stanzas
for sample_idx in (8, 86, 2840, 4811):
    plot_reconstruction(train_wfs[sample_idx, :], train_recons[sample_idx, :])


In [None]:
#plot loss function of CAE
epochs = np.asarray(epochs)
# Convert each entry with float(): it accepts plain numbers as well as 0-d tensors
# that track gradients or live on the GPU, which np.asarray cannot convert directly.
losses = np.asarray([float(loss_value) for loss_value in losses])
plt.plot(epochs,losses)
plt.title('Reconstruction Learning Curve (Background)', fontsize=16)
plt.ylabel('Loss Criterion (MSE)', fontsize=14)
plt.xlabel("Epoch", fontsize=14)
plt.show()

In [None]:
#Run the model over the held-out set and obtain feature vectors to feed into TSNE
#(optional: load previously saved weights first)
#import time
#model.load_state_dict(torch.load('mse_lr0001_lrelu01_30epochs.pt'))
#model.to(DEVICE)

max_epochs = 20
start_time = time.time()
fvecs = validate(model=model, num_epochs=max_epochs)
print("--- %s seconds ---" % (time.time() - start_time))


In [None]:
#Join the per-batch feature vectors from the validation step into one array
test_fvecs = np.concatenate(fvecs, 0)
test_fvecs.shape

In [None]:
#Embed the feature vectors in two dimensions with TSNE and time the fit

start_time = time.time()
tsne_model = TSNE(verbose=0, n_components=2, learning_rate=200)
test_embedded = tsne_model.fit_transform(test_fvecs)
print("--- %s seconds ---" % (time.time() - start_time))
print(test_embedded.shape)

In [None]:
import matplotlib
from matplotlib.lines import Line2D
# Switch to the non-interactive Agg backend: the plotting helpers defined below
# write their figures to files with savefig.
matplotlib.use('Agg')

#Define t-SNE plotting functions
def plot_all_clusters(embedded_fvecs, filename):
    """Scatter-plot a 2-D embedding as small blue dots and save it to ``filename``.

    Args:
        embedded_fvecs: 2-D array; column 0 is drawn on x and column 1 on y.
        filename: target passed to ``fig.savefig``.
    """
    fig, ax = plt.subplots()
    try:
        ax.plot(embedded_fvecs[:, 0], embedded_fvecs[:, 1], 'b.', markersize=1)
        fig.suptitle("t-SNE Classification of Waveforms", fontsize=18)
        fig.savefig(filename)
    finally:
        # The figure is only needed for the saved file; close it so figures
        # do not accumulate in pyplot's registry across calls.
        plt.close(fig)
    
def plot_all_clusters_psd(embedded_fvecs, tags, filename, threshold=0.65e6):
    """Scatter-plot a 2-D embedding coloured by a per-point tag and save it.

    Points whose tag is ``>= threshold`` are drawn as large red dots, all
    others as small blue dots.

    Args:
        embedded_fvecs: 2-D array; column 0 is drawn on x and column 1 on y.
        tags: one value per point (the caller passes psd_sum10 values).
        filename: target passed to ``fig.savefig``.
        threshold: high-frequency cut; defaults to the previously hard-coded 0.65e6.
    """
    tags = np.ravel(np.asarray(tags))
    # Only the first len(tags) points are drawn, as with the former per-index loop
    points = np.asarray(embedded_fvecs)[:len(tags)]
    high = tags >= threshold

    fig, ax = plt.subplots()
    try:
        # Two vectorised calls instead of one ax.plot per point; red is drawn last
        # so the highlighted points stay on top.
        ax.plot(points[~high, 0], points[~high, 1], 'b.', markersize=1)
        ax.plot(points[high, 0], points[high, 1], 'r.', markersize=10)

        fig.suptitle("t-SNE with PSD Tags", fontsize=18)
        legend_elements = [
            Line2D([0], [0], marker='.', color='w', label='low psd_sum10',
                   markerfacecolor='b', markersize=15),
            Line2D([0], [0], marker='.', color='w', label='high psd_sum10',
                   markerfacecolor='r', markersize=15),
        ]
        ax.legend(handles=legend_elements, fancybox=True, loc='upper right')
        fig.savefig(filename)
    finally:
        # Close the figure so figures do not accumulate across calls
        plt.close(fig)
    

def plot_all_clusters_bl_slope(embedded_fvecs, tags, filename, cut=0.1):
    """Scatter-plot a 2-D embedding with outlier tags highlighted and save it.

    Points whose tag is ``>= cut`` or ``< -cut`` are drawn as large red dots,
    all others as small blue dots.

    Args:
        embedded_fvecs: 2-D array; column 0 is drawn on x and column 1 on y.
        tags: one value per point (the caller passes baseline-slope values).
        filename: target passed to ``fig.savefig``.
        cut: outlier cut; defaults to the previously hard-coded 0.1.
    """
    tags = np.ravel(np.asarray(tags))
    # Only the first len(tags) points are drawn, as with the former per-index loop
    points = np.asarray(embedded_fvecs)[:len(tags)]
    # Same comparisons as before: inclusive on the upper side, exclusive on the lower
    outlier = (tags >= cut) | (tags < -cut)

    fig, ax = plt.subplots()
    try:
        # Two vectorised calls instead of one ax.plot per point; red is drawn last
        # so the highlighted points stay on top.
        ax.plot(points[~outlier, 0], points[~outlier, 1], 'b.', markersize=1)
        ax.plot(points[outlier, 0], points[outlier, 1], 'r.', markersize=10)

        fig.suptitle("t-SNE with Baseline Slope Tags", fontsize=18)
        legend_elements = [
            Line2D([0], [0], marker='.', color='w', label='central bl_slope',
                   markerfacecolor='b', markersize=15),
            Line2D([0], [0], marker='.', color='w', label='outlier bl_slope',
                   markerfacecolor='r', markersize=15),
        ]
        ax.legend(handles=legend_elements, fancybox=True, loc='upper right')
        fig.savefig(filename)
    finally:
        # Close the figure so figures do not accumulate across calls
        plt.close(fig)
    
def plot_2Dhist(embedded_fvecs, filename):
    """Save a normalised 50x50 2-D histogram of a 2-D embedding with a colour bar.

    Args:
        embedded_fvecs: 2-D array; column 0 is binned on x and column 1 on y.
        filename: target passed to ``fig.savefig``.
    """
    fig, ax = plt.subplots()
    try:
        # hist2d returns (counts, xedges, yedges, image); the image feeds the colour bar
        _, _, _, image = ax.hist2d(
            embedded_fvecs[:, 0], embedded_fvecs[:, 1],
            bins=50, density=True, cmap='hot',
        )
        fig.colorbar(image, ax=ax)
        fig.suptitle("Density Plot of t-SNE Results", fontsize=18)
        fig.savefig(filename)
    finally:
        # Close the figure so figures do not accumulate across calls
        plt.close(fig)


In [None]:
#Save the TSNE plots: plain scatter, 2-D density, and scatters tagged by psd_sum10 / bl_slope

plot_all_clusters(embedded_fvecs=test_embedded, filename="tsne_mse_lr0001_lrelu0")
plot_2Dhist(embedded_fvecs=test_embedded, filename="2Dhist_mse_lr0001_lrelu0")
plot_all_clusters_psd(
    embedded_fvecs=test_embedded,
    tags=test_psd10,
    filename="tsne_mse_lr0001_lrelu0_psd",
)
plot_all_clusters_bl_slope(
    embedded_fvecs=test_embedded,
    tags=test_bl_slope,
    filename="tsne_mse_lr0001_lrelu0_bl_slope",
)
    