In [1]:
import numpy as np
import copy
import pickle
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
from collections import defaultdict
import os 
from scipy.optimize import curve_fit

# Default matplotlib figure size (width, height) used for every plot in this notebook.
plt.rcParams['figure.figsize'] = (40.0, 20.0)
# Smaller alternative figure size:
# plt.rcParams['figure.figsize'] = (8, 4.0)

# When True, correcting_4_cosmic_ray_noies plots every flagged spike that its Gaussian-fit check
# decides is NOT cosmic ray noise (i.e. a spike that was wrongly flagged by the detectors).
PLOT_ERRORS = False


In [2]:
# Directory holding the raw spectra; swap the commented line to switch dataset.
# file_location = "../data/Raman_Mouse/raw_npy/"
file_location = "../data/Green_excitation/raw_npy/"

# Prefer the stored list of file names; fall back to one known file when the list is missing.
try:
    filenames = np.load(f"{file_location}FileNames.npy")
except FileNotFoundError:
    filenames = ['ML-MoS2-532-LP10-2-3-50X.npy']

# Load every file and stack the results into a single array (one entry per file).
data = np.array([np.load(f"{file_location}{f}") for f in filenames])
print(data.shape)

(1, 12, 32, 5019)


In the Raman mouse dataset, at least 96 pixels contain an anomaly.

In [3]:
# Output directory: a "cosmic_ray_removed" folder next to the raw-data folder.
# The split/[:-2] drops the last path component and relies on file_location ending in "/".
parent_dir = '/'.join(file_location.split('/')[:-2])
file_location2 = f"{parent_dir}/cosmic_ray_removed/"
os.makedirs(file_location2, exist_ok=True)

# Copy the wavelength axis of every file into the output directory.
# NOTE: `wavelength` stays defined after this loop (holding the axis of the last file) and is
# read as a notebook-level variable by correcting_4_cosmic_ray_noies.
for f in filenames:
    stem_in = f.split('.')[0]
    stem_out = f.split("/")[-1].split(".")[0]
    wavelength = np.load(f"{file_location}{stem_in}_wavelength.npy")
    np.save(f'{file_location2}{stem_out}_wavelength.npy', wavelength)

# Store the list of file names alongside the copied wavelength files.
np.save(f'{file_location2}FileNames', filenames)



In [4]:
def _find_gradient_outliers(img, smooth_sigma):
    """
    Shared implementation of find_cosmic_ray_noise and find_cosmic_ray_noise_neighbourhood.

    Steps:
      1. smooth `img` with a Gaussian of width `smooth_sigma` (one sigma per axis),
      2. subtract the smoothed image from the original,
      3. take a Gaussian first-derivative (sigma 0.5) of that difference along the last axis
         (axes with sigma 0 are left untouched by scipy's gaussian_filter),
      4. sort the absolute gradient values and look for the "elbow" where the sorted curve starts
         to rise steeply; the value at the elbow is used as threshold.

    img          : 3D array indexed as img[x, y, z]; the last axis is presumably the
                   wavelength axis (TODO confirm against the data loader).
    smooth_sigma : tuple of three sigmas passed to gaussian_filter for step 1.

    Returns a defaultdict(list) mapping (x, y) -> list of z indices whose gradient exceeds the
    threshold. The dict is empty when no elbow can be found (e.g. a perfectly flat image).

    NOTE(review): gaussian_filter writes its result in the dtype of its input, so an integer
    `img` gives truncated intermediate results - confirm that the spectra are stored as floats.
    NOTE(review): the threshold is derived from |grad| but compared against the signed gradient,
    so only positive gradients are flagged - confirm that this is intended.
    """
    smooth = gaussian_filter(img, sigma=smooth_sigma, order=0, mode='nearest')
    diff = img - smooth
    grad = gaussian_filter(diff, sigma=(0, 0, 0.5), order=1, mode='nearest')

    # find the elbow in the sorted absolute gradient
    data = np.sort(np.abs(grad.flatten()))
    sec = gaussian_filter(data[1:] - data[:-1], sigma=3)
    m_sec, s_std = np.mean(sec), np.std(sec)
    below = np.flatnonzero(sec < m_sec + 0.5*s_std)

    # make a dict where per pixel the problem areas are in.
    tmp = defaultdict(list)
    if below.size == 0:
        # No step of the sorted curve lies below the cut-off (e.g. every step is identical), so
        # there is no elbow and nothing stands out. The previous np.max(np.where(...)) raised an
        # opaque ValueError here.
        return tmp

    # last position that still belongs to the flat part of the sorted curve
    threshold = data[below[-1]]
    for x, y, z in zip(*np.where(grad > threshold)):
        tmp[(x, y)].append(z)

    return tmp

def find_cosmic_ray_noise(img):
    """
    find cosmic ray noise based on the wavelengths within one pixel (a large difference of intensities)

    The image is only smoothed along the last axis (sigma=(0,0,1)), so every pixel is judged on
    its own spectrum. Returns a defaultdict(list) mapping (x, y) -> list of flagged z indices.
    """
    return _find_gradient_outliers(img, (0, 0, 1))

def find_cosmic_ray_noise_neighbourhood(img):
    """
    find cosmic ray noise based also on the neighbourhood (a large difference of intensities)
    This returns false positives if in the neighbourhood there is a large spike but not in the pixel it self.

    The image is smoothed along all three axes (sigma=(1,1,1)). Returns a defaultdict(list)
    mapping (x, y) -> list of flagged z indices.
    """
    return _find_gradient_outliers(img, (1, 1, 1))

def find_region(lst, max_gap=5):
    """
    Group indices into contiguous regions.

    The indices are sorted first. Two consecutive (sorted) indices belong to the same region
    when they differ by less than `max_gap`; otherwise a new region starts.

    lst     : iterable of indices (any order, may be empty).
    max_gap : smallest difference between two consecutive indices that splits a region
              (default 5, the value that used to be hard-coded).

    Returns a list of (start, stop) tuples, both inclusive, in increasing order.
    An empty input gives an empty list (it used to raise an IndexError).
    """
    ordered = sorted(lst)
    if not ordered:
        return []

    regions = []
    start = previous = ordered[0]
    for index in ordered[1:]:
        if index - previous >= max_gap:
            # gap is too large: close the current region and open a new one
            regions.append((start, previous))
            start = index
        previous = index
    regions.append((start, previous))
    return regions

def gaussian(base):
    """
    Build a sum-of-Gaussians model with a fixed baseline `base`.

    The returned function has the signature model(x, *params), where `params` is a flat sequence
    of (mu, scale, sigma) triplets, one triplet per Gaussian, and `x` is a numpy array.
    It returns, for every element of x, the sum of all Gaussians plus `base`.
    """
    def model(x, *params):
        centres = np.array(params[0::3])
        amplitudes = np.array(params[1::3])
        widths = np.array(params[2::3])
        # column vector so that every x value is broadcast against every Gaussian
        column = x.reshape(-1, 1)
        peaks = amplitudes * np.exp(-0.5 * ((column - centres) / widths) ** 2)
        return np.sum(peaks, 1) + base
    return model


def correcting_4_cosmic_ray_noies(img, cosmic_ray_noise, func = gaussian, wavelength = None):
    """
    Replace cosmic ray spikes by a linear interpolation, unless the spike looks like a real peak.

    For every pixel the flagged indices are grouped into regions (find_region). Each region is
    padded and a single Gaussian (on top of the window minimum) is fitted to it. When the fit is
    good (NRMSE < 0.1) and wide (FWHM > 5, in the units of `wavelength`) the region is regarded
    as a Raman peak and left untouched; otherwise the window is replaced by a straight line
    between its two end points.

    img              : 3D array indexed as img[x, y, z]; it is not modified.
    cosmic_ray_noise : dict mapping (x, y) -> list of flagged z indices.
    func             : factory that takes the baseline and returns a model f(x, mu, scale, sigma)
                       usable by curve_fit (default: gaussian).
    wavelength       : 1D sequence giving the wavelength of every z index. When None, the
                       notebook-level variable `wavelength` is used, as before.

    Returns (img2, new_cosmic_ray_noise): the corrected copy of `img` and a defaultdict(list)
    mapping (x, y) -> list with the centre index of every region that was interpolated.

    Reads the notebook-level flag PLOT_ERRORS to decide whether rejected regions are plotted.
    """
    if wavelength is None:
        # backward compatible: fall back on the notebook-level variable; a missing variable is
        # only reported at the point where the wavelength axis is actually needed.
        wavelength = globals().get('wavelength')

    img2 = copy.copy(img)

    # find cosmic ray noise indices for each pixel
    new_cosmic_ray_noise = defaultdict(list)
    for (x,y), lst in cosmic_ray_noise.items():
        # collect all the indices and turn them into separate windows/ranges with appropriate spacing such that
        # interpolation can be used to estimate the "true" value of the affected pixel wavelength combination
        for Range in find_region(lst):
            # determine the range of the region
            size = Range[1]-Range[0]
            padding = int(4 + (size)/2)
            X = np.arange(max(0,Range[0]-padding), min(img2.shape[2], Range[1]+padding+1), dtype=int)
            window = slice(X[0], X[-1]+1)

            # fit a gaussian curve to check for incorrectly classified cosmic ray noise
            mu, base = X[np.argmax(img2[x,y,window])], np.min(img2[x,y,window])
            sigma, scale = size/2 if size != 0 else 1, max(1, img2[x,y][mu])
            base_adjusted_func = func(base)
            try:
                popt, _ = curve_fit(base_adjusted_func, X, img2[x,y,window], p0=[mu, scale, sigma])
            except RuntimeError:
                # definitely not a gaussian
                popt = None

            # Non-finite fit parameters cannot describe a peak either (and would crash int() below).
            if popt is not None and np.all(np.isfinite(popt)):
                if wavelength is None:
                    raise NameError("name 'wavelength' is not defined")

                fit = base_adjusted_func(X, *popt)
                mu_fit, scale_fit, sigma_fit = popt
                HM = scale_fit / 2

                # Half width at half maximum. The model only uses sigma squared, so the
                # unconstrained fit may return a negative sigma: use its absolute value.
                HW = abs(sigma_fit) * np.sqrt(2*np.log(2))
                left, right, appr_left = mu_fit - HW, mu_fit + HW, max(0,min(len(wavelength)-1, int(mu_fit - HW)))
                # Normalise by the absolute amplitude, otherwise a negative amplitude gives a
                # negative NRMSE that always passes the check below.
                NRMSE = np.sqrt(np.mean((fit - img2[x,y,window])**2))/abs(scale_fit)
                # abs() keeps the width positive when the wavelength axis is stored in decreasing order.
                FWHM = abs(wavelength[min(len(wavelength)-1,appr_left + int(HW*2))] - wavelength[appr_left])

                # if the NRMSE is below 0.1 and the full width (FW) is larger than 5, the found spike is Raman.
                if NRMSE < 0.1 and FWHM > 5:
                    print("REMOVED", x,y, Range, NRMSE, FWHM)
                    if PLOT_ERRORS:
                        print("------------ WRONG ---------------")
                        print('MSE:', NRMSE, popt, [mu, scale, sigma], ',base:', base, ',HM:', HM, ',FWHM:', FWHM)
                        plt.axhline(y=HM + base, color='g')
                        plt.axvline(x=left, color='g')
                        plt.axvline(x=mu_fit, color='y')
                        plt.axvline(x=right, color='g')

                        plt.plot(X, fit , 'r-', label='raman appr')
                        plt.plot(X, img2[x,y,window], label='raw')
                        plt.legend()
                        plt.show()

                        # mark the start and stop of the flagged region on the full spectrum
                        for z in Range:
                            plt.plot([z,z],[-1000,3000], alpha=0.1, color='k')
                        plt.plot(img[x,y], alpha=0.4)
                        plt.grid(True, which='both')
                        plt.xlim([0,len(wavelength)])
                        locs, _ = plt.xticks()
                        plt.xticks(locs, [wavelength[int(i)] if i < len(wavelength) else "" for i in locs])
                        plt.xlim([0,len(wavelength)])
                        plt.show()
                        print("------------ END WRONG ---------------")
                    continue

            # cosmic ray noise: replace the window by a straight line between its end points
            # (taken from the untouched input image)
            img2[x,y,window] = np.interp(X, [X[0], X[-1]],
                                         [img[x,y,X[0]],
                                          img[x,y,X[-1]]])
            new_cosmic_ray_noise[(x,y)].append((Range[1] + Range[0]) / 2)
    return img2, new_cosmic_ray_noise

In [5]:
# Detect and correct cosmic ray noise for every loaded file.
for i, img in enumerate(data):
    print(filenames[i])

    # Run both detectors: one looks at each pixel on its own, the other also uses the neighbourhood.
    tmp = find_cosmic_ray_noise(img)
    tmp2 = find_cosmic_ray_noise_neighbourhood(img)
    
    # Only pixels flagged by BOTH detectors are kept; for those pixels the flagged indices of the
    # two detectors are merged (set union).
    cosmic_ray_noise = {pixel:list(set(z).union(set(tmp2[pixel]))) for pixel, z in tmp.items() if pixel in tmp2}
    img2, cosmic_ray_noise = correcting_4_cosmic_ray_noies(img, cosmic_ray_noise)
    
    # number of pixels in which at least one region was interpolated
    print(len(cosmic_ray_noise))

    # optional debug plot of the problem points (disabled)
#     for (x,y), lst in cosmic_ray_noise.items():
#         print(f"x,y = {x, y} cosmic ray wavelengths {[wavelength[int(i)] for i in lst]}")
#         for z in lst:
#             plt.plot([z,z],[-1000,3000], alpha=0.1, color='k')
#         plt.plot(img2[x,y])
#         plt.plot(img[x,y], alpha=0.4)
#         plt.grid(True, which='both')
#         plt.xlim([0,len(wavelength)])
#         locs, _ = plt.xticks()
#         plt.xticks(locs, [wavelength[int(i)] if i < len(wavelength) else "" for i in locs])
#         plt.xlim([0,len(wavelength)])
#         plt.show()
#     break

#     plot all points
#     for x in img2:
#         for y in x:
#             plt.plot(y)
#     plt.show()

    # NOTE: saving is disabled; while the line below stays commented out the corrected data
    # is not written to file_location2.
#     np.save(f'{file_location2}{filenames[i].split("/")[-1].split(".")[0]}', img2) 


ML-MoS2-532-LP10-2-3-50X.npy
REMOVED 0 0 (122, 130) 0.022076053221455647 7.0049999999999955
REMOVED 0 1 (122, 130) 0.02230307087240372 7.0049999999999955
REMOVED 0 2 (122, 130) 0.023686794554968863 7.0049999999999955
REMOVED 0 3 (122, 130) 0.023817965551464135 7.0049999999999955
REMOVED 0 4 (121, 130) 0.02304367271233442 7.0090000000000146
REMOVED 0 5 (121, 130) 0.023867514864524934 7.0090000000000146
REMOVED 0 6 (122, 130) 0.021855853064920834 7.0049999999999955
REMOVED 0 7 (121, 123) 0.02814615201286357 7.0090000000000146
REMOVED 0 7 (128, 130) 0.026445244773413142 7.0090000000000146
REMOVED 0 8 (122, 130) 0.021167974764770725 7.0049999999999955
REMOVED 0 9 (121, 123) 0.02683437962788629 7.0090000000000146
REMOVED 0 9 (128, 130) 0.025187732780325156 7.0090000000000146
REMOVED 0 10 (121, 123) 0.026425283210828023 7.0090000000000146
REMOVED 0 10 (128, 130) 0.024953359415602117 7.0090000000000146
REMOVED 0 11 (121, 123) 0.02631699564004169 7.0090000000000146
REMOVED 0 11 (128, 129) 0.01

REMOVED 2 16 (45, 46) 0.03746221936388169 5.335999999999956
REMOVED 2 16 (56, 57) 0.02396563252958626 7.100999999999999
REMOVED 2 16 (121, 123) 0.02208914209328348 7.0090000000000146
REMOVED 2 16 (128, 129) 0.018430471381099583 5.2549999999999955
REMOVED 2 17 (45, 46) 0.04017906491324339 5.335999999999956
REMOVED 2 17 (56, 57) 0.01717299451330671 7.099999999999966
REMOVED 2 17 (121, 129) 0.01867273199979843 7.0090000000000146
REMOVED 2 18 (45, 46) 0.049132273886858466 5.335999999999956
REMOVED 2 18 (56, 57) 0.011799537181934569 7.100999999999999
REMOVED 2 18 (121, 123) 0.01657564354201429 7.0090000000000146
REMOVED 2 18 (128, 129) 0.019926434329290088 5.2549999999999955
REMOVED 2 19 (45, 45) 0.036880581147058125 5.335999999999956
REMOVED 2 19 (56, 57) 0.024533773094118687 7.100999999999999
REMOVED 2 19 (121, 123) 0.016916483351993847 7.0090000000000146
REMOVED 2 19 (128, 129) 0.01843386756850094 5.255999999999972
REMOVED 2 20 (45, 46) 0.04754657731080584 5.335999999999956
REMOVED 2 20 

REMOVED 4 15 (56, 57) 0.08275353964689444 5.324999999999989
REMOVED 4 15 (121, 123) 0.01721245288499854 5.255999999999972
REMOVED 4 15 (128, 129) 0.017008019067427462 5.2549999999999955
REMOVED 4 16 (45, 46) 0.04191911418059241 5.335999999999956
REMOVED 4 16 (56, 57) 0.024655104098123055 5.324999999999989
REMOVED 4 16 (121, 123) 0.015762941849291685 5.255999999999972
REMOVED 4 16 (128, 129) 0.018473286067714135 7.0090000000000146
REMOVED 4 17 (45, 46) 0.03666820109777219 5.335999999999956
REMOVED 4 17 (56, 57) 0.009684191809611534 7.099999999999966
REMOVED 4 17 (121, 129) 0.02092635895497785 7.0090000000000146
REMOVED 4 18 (45, 46) 0.03874737020723344 5.335999999999956
REMOVED 4 18 (56, 57) 0.01310953700946942 7.099999999999966
REMOVED 4 18 (121, 123) 0.017749076827635997 5.255999999999972
REMOVED 4 18 (128, 129) 0.01928151781191842 5.2549999999999955
REMOVED 4 19 (45, 46) 0.028932350056412676 5.335999999999956
REMOVED 4 19 (56, 57) 0.011451029849610161 7.100999999999999
REMOVED 4 19 (

REMOVED 7 12 (121, 129) 0.017996373701590358 5.255999999999972
REMOVED 7 13 (121, 129) 0.018657154834869925 5.255999999999972
REMOVED 7 14 (121, 129) 0.01956117650755625 5.255999999999972
REMOVED 7 15 (121, 129) 0.020544783494235625 5.255999999999972
REMOVED 7 16 (121, 128) 0.0190891386144068 5.255999999999972
REMOVED 7 17 (121, 129) 0.019622397484226818 5.255999999999972
REMOVED 7 18 (121, 129) 0.020507780846677485 5.255999999999972
REMOVED 7 19 (121, 129) 0.019506249439164545 5.255999999999972
REMOVED 7 20 (121, 129) 0.02080524493331387 5.255999999999972
REMOVED 7 21 (121, 129) 0.018717803079942186 5.255999999999972
REMOVED 7 22 (121, 129) 0.019056663715073637 5.255999999999972
REMOVED 7 23 (121, 129) 0.019722646565753187 5.255999999999972
REMOVED 7 24 (121, 129) 0.019639783947636647 5.255999999999972
REMOVED 7 25 (121, 129) 0.0185887113794028 5.255999999999972
REMOVED 7 26 (121, 129) 0.019653279544209652 5.255999999999972
REMOVED 7 27 (121, 129) 0.01879765731872259 5.255999999999972

REMOVED 10 18 (121, 129) 0.0180737403739704 5.255999999999972
REMOVED 10 19 (121, 129) 0.020212955264956214 5.255999999999972
REMOVED 10 20 (121, 129) 0.018971060896971356 5.255999999999972
REMOVED 10 21 (121, 129) 0.01987018980217851 5.255999999999972
REMOVED 10 22 (121, 129) 0.018885611936360312 5.255999999999972
REMOVED 10 23 (121, 129) 0.02036523739711917 5.255999999999972
REMOVED 10 24 (121, 129) 0.019324334229878778 5.255999999999972
REMOVED 10 25 (121, 129) 0.020264117649923117 5.255999999999972
REMOVED 10 26 (121, 129) 0.019678785381499895 5.255999999999972
REMOVED 10 27 (121, 129) 0.01991585414546195 5.255999999999972
REMOVED 10 28 (121, 129) 0.01963380927123949 5.255999999999972
REMOVED 10 29 (58, 58) 0.04576967611096096 5.324000000000012
REMOVED 10 29 (121, 128) 0.02005747503088675 5.255999999999972
REMOVED 10 30 (44, 44) 0.04257174820097339 5.336999999999989
REMOVED 10 30 (58, 59) 0.025097448075927106 5.324000000000012
REMOVED 10 30 (122, 127) 0.019805094074202223 5.2559999