In [1]:
import numpy as np
import copy
import pickle
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
from collections import defaultdict
import os 
from scipy.optimize import curve_fit

# Default matplotlib figure size (width, height) used by every plot in this notebook.
plt.rcParams['figure.figsize'] = (40.0, 20.0)
# Smaller alternative figure size:
# plt.rcParams['figure.figsize'] = (8, 4.0)

# Set to True to plot every flagged spike that the Gaussian-fit check in
# correcting_4_cosmic_ray_noies decides to keep as a Raman peak, i.e. a spike that
# was wrongly flagged as cosmic ray noise by the detection step.
PLOT_ERRORS = False


In [25]:
# Directory holding the raw .npy measurements (swap the two lines to change data set).
# file_location = "../data/Raman_Mouse/raw_npy/"
file_location = "../data/Green_excitation/raw_npy/"

# Use the stored list of file names when it exists, otherwise fall back to the
# single known measurement of this data set.
try:
    filenames = np.load(f"{file_location}FileNames.npy")
except FileNotFoundError:
    filenames = ['ML-MoS2-532-LP10-2-3-50X.npy']

# Load every measurement and stack them into one array (first axis = file).
data = []
for f in filenames:
    measurement = np.load(file_location + f)
    data.append(measurement)
data = np.array(data)
print(data.shape)

(1, 12, 32, 5019)


In the Raman mouse dataset, at least 96 pixels have an anomaly.

In [14]:
# The output directory is a sibling of the raw-data directory: drop the last path
# component (and the empty string after the trailing '/') and append the new name.
parent_dir = '/'.join(file_location.split('/')[:-2])
file_location2 = f"{parent_dir}/cosmic_ray_removed/"
os.makedirs(file_location2, exist_ok=True)

# Copy the wavelength axis of every measurement next to the corrected data.
for f in filenames:
    source_stem = f.split('.')[0]
    target_stem = f.split("/")[-1].split(".")[0]
    wavelength = np.load(f"{file_location}{source_stem}_wavelength.npy")
    np.save(f'{file_location2}{target_stem}_wavelength.npy', wavelength)

# Store the list of file names alongside the copied wavelength files.
np.save(f'{file_location2}FileNames', filenames)



In [15]:
def _find_gradient_outliers(img, smooth_sigma, std_factor=0.5):
    """
    Shared implementation of the two cosmic ray detectors.

    Steps:
      1. smooth `img` with a Gaussian of width `smooth_sigma` (one sigma per axis),
      2. take the residual `img - smooth`,
      3. take the first-order Gaussian derivative of the residual along the last axis,
      4. derive a threshold from the "elbow" of the sorted absolute gradients,
      5. report every position whose (signed) gradient exceeds that threshold.

    Note that the threshold is derived from the absolute gradient, while only
    positive gradients are compared against it.

    :param img: 3-D array indexed as (x, y, spectral index).
    :param smooth_sigma: tuple of three sigmas passed to the smoothing filter.
    :param std_factor: number of standard deviations above the mean of the smoothed
        increments that still counts as "before the elbow".
    :return: defaultdict(list) mapping (x, y) -> list of flagged spectral indices.
        Empty when no threshold can be determined (e.g. a completely flat image).
    """
    smooth = gaussian_filter(img, sigma=smooth_sigma, order=0, mode='nearest')
    diff = img - smooth
    grad = gaussian_filter(diff, sigma=(0,0,0.5), order=1, mode='nearest')

    flagged = defaultdict(list)

    # find the elbow in the sorted absolute gradient of the data
    magnitudes = np.sort(np.abs(grad.flatten()))
    if magnitudes.size < 2:
        # not enough values to compute increments, so nothing can be flagged
        return flagged
    increments = gaussian_filter(magnitudes[1:] - magnitudes[:-1], sigma=3)
    limit = np.mean(increments) + std_factor*np.std(increments)
    candidates = np.where(increments < limit)[0]
    if candidates.size == 0:
        # Happens when all increments are identical (e.g. a constant image): there is
        # no elbow and therefore no outlier; previously this raised a ValueError.
        return flagged
    threshold = magnitudes[candidates.max()]

    # make a dict where per pixel the problem areas are in.
    for x, y, z in zip(*np.where(grad > threshold)):
        flagged[(x, y)].append(z)

    return flagged


def find_cosmic_ray_noise(img):
    """
    Find cosmic ray noise based on the wavelengths within one pixel
    (a large difference of intensities).

    The image is smoothed along the spectral (last) axis only, so each pixel is
    judged purely on its own spectrum.

    :param img: 3-D array indexed as (x, y, spectral index).
    :return: defaultdict(list) mapping (x, y) -> list of flagged spectral indices.
    """
    return _find_gradient_outliers(img, smooth_sigma=(0,0,1))


def find_cosmic_ray_noise_neighbourhood(img):
    """
    Find cosmic ray noise based also on the neighbourhood (a large difference of intensities).

    The image is smoothed along all three axes, so neighbouring pixels influence the
    residual. This returns false positives if in the neighbourhood there is a large
    spike but not in the pixel itself.

    :param img: 3-D array indexed as (x, y, spectral index).
    :return: defaultdict(list) mapping (x, y) -> list of flagged spectral indices.
    """
    return _find_gradient_outliers(img, smooth_sigma=(1,1,1))

def find_region(lst, max_gap=5):
    """
    Group indices into regions of nearby values.

    The indices are sorted; consecutive indices that are less than `max_gap` apart
    end up in the same region.

    :param lst: iterable of indices (may be unsorted, may be empty).
    :param max_gap: a difference of `max_gap` or more between two consecutive sorted
        indices starts a new region.
    :return: list of (start, stop) tuples, both ends inclusive; [] for empty input.
    """
    ordered = sorted(lst)
    if not ordered:
        # nothing flagged, nothing to group (previously an IndexError)
        return []

    regions = []
    start = previous = ordered[0]
    for index in ordered[1:]:
        if index - previous >= max_gap:
            # gap is too large: close the running region and open a new one
            regions.append((start, previous))
            start = index
        previous = index
    regions.append((start, previous))
    return regions

def gaussian(base):
    """
    Build a model function: a sum of Gaussian peaks on top of the constant `base`.

    The returned function takes `x` (a numpy array) followed by a flat parameter
    list (mu_0, scale_0, sigma_0, mu_1, scale_1, sigma_1, ...) and returns one
    value per element of `x`.
    """
    def model(x, *params):
        # the flat parameter list is interleaved per peak: every third entry belongs together
        centres = np.array(params[0::3])
        heights = np.array(params[1::3])
        widths = np.array(params[2::3])
        # column vector against the per-peak parameters -> one row per x, one column per peak
        column = x.reshape(-1, 1)
        peaks = heights * np.exp(-0.5 * ((column - centres) / widths) ** 2)
        return peaks.sum(axis=1) + base
    return model


def _interpolate_window(img, img2, x, y, X):
    """Replace img2[x, y, X[0]..X[-1]] by a straight line between the window's end points taken from img."""
    img2[x,y,X[0]:X[-1]+1] = np.interp(X, [X[0], X[-1]], [img[x,y,X[0]], img[x,y,X[-1]]])


def _plot_kept_raman_peak(img, img2, x, y, X, Range, fit, mu_fit, left, right, HM, base, wavelengths):
    """Plot the fitted window and the full spectrum of a flagged spike that is kept as a Raman peak."""
    # zoomed view: half maximum, fitted centre and half-width borders
    plt.axhline(y=HM + base, color='g')
    plt.axvline(x=left, color='g')
    plt.axvline(x=mu_fit, color='y')
    plt.axvline(x=right, color='g')

    plt.plot(X, fit , 'r-', label='raman appr')
    plt.plot(X, img2[x,y,X[0]:X[-1]+1], label='raw')
    plt.legend()
    plt.show()

    # full spectrum with the borders of the flagged region marked
    for z in Range:
        plt.plot([z,z],[-1000,3000], alpha=0.1, color='k')
    plt.plot(img[x,y], alpha=0.4)
    plt.grid(True, which='both')
    plt.xlim([0,len(wavelengths)])
    locs, _ = plt.xticks()
    # label ticks with wavelengths; ticks outside the axis get no label
    # (the lower bound prevents negative tick positions from wrapping around)
    plt.xticks(locs, [wavelengths[int(i)] if 0 <= i < len(wavelengths) else "" for i in locs])
    plt.xlim([0,len(wavelengths)])
    plt.show()


def correcting_4_cosmic_ray_noies(img, cosmic_ray_noise, func = gaussian, wavelengths = None):
    """
    Remove cosmic ray spikes from `img` by linear interpolation.

    For every pixel the flagged indices are grouped into regions (find_region). Around
    each region a padded window is taken and a single peak (`func`) is fitted to it.
    If the fit is good (NRMSE < 0.1) and wide (FWHM > 5, in units of `wavelengths`),
    the spike is considered a Raman peak and left untouched. Otherwise the window is
    replaced by a straight line between its two end points.

    :param img: 3-D array indexed as (x, y, spectral index); it is copied, the
        correction is applied to the copy.
    :param cosmic_ray_noise: mapping (x, y) -> list of flagged spectral indices.
    :param func: factory that takes the baseline and returns the model function
        handed to curve_fit with three parameters (mu, scale, sigma).
    :param wavelengths: spectral axis used to convert the fitted width to FWHM.
        When None, the notebook-level variable `wavelength` is used (old behaviour).
    :return: (corrected copy of img, defaultdict (x, y) -> list of region centres
        that were interpolated).
    """
    if wavelengths is None:
        # backward-compatible fallback on the notebook-level global
        wavelengths = wavelength
    last_index = len(wavelengths) - 1

    img2 = copy.copy(img)
    # half width at half maximum of a Gaussian = |sigma| * sqrt(2 ln 2)
    hwhm_factor = np.sqrt(2*np.log(2))

    # find cosmic ray noise indices for each pixel
    new_cosmic_ray_noise = defaultdict(list)
    for (x,y), lst in cosmic_ray_noise.items():
        if len(lst) == 0:
            # nothing flagged for this pixel
            continue
        # collect all the indices and turn them into separate windows/ranges with appropriate spacing such that
        # interpolation can be used to estimate the "true" value of the affected pixel wavelength combination
        for Range in find_region(lst):
            # determine the range of the region; the padding grows with the region size
            size = Range[1]-Range[0]
            padding = int(4 + (size)/2)
            X = np.arange(max(0,Range[0]-padding), min(img2.shape[2], Range[1]+padding+1), dtype=int)
            spectrum = img2[x,y,X[0]:X[-1]+1]
            centre = (Range[1] + Range[0]) / 2

            # fit a gaussian curve to check for incorrectly classified cosmic ray noise
            mu, base = X[np.argmax(spectrum)], np.min(spectrum)
            sigma, scale = size/2 if size != 0 else 1, max(1, img2[x,y][mu])
            base_adjusted_func = func(base)
            try:
                popt, pcov = curve_fit(base_adjusted_func, X, spectrum, p0=[mu, scale, sigma])
            except RuntimeError:
                popt = None

            if popt is None or not np.all(np.isfinite(popt)):
                # definitely not a gaussian (no convergence or a degenerate fit)
                _interpolate_window(img, img2, x, y, X)
                new_cosmic_ray_noise[(x,y)].append(centre)
                continue

            fit = base_adjusted_func(X, *popt)
            mu_fit, scale_fit, sigma_fit = popt
            HM = scale_fit / 2

            # The model is symmetric in the sign of sigma, so the optimiser may return a
            # negative sigma; the width must be based on its magnitude.
            HW = abs(sigma_fit) * hwhm_factor
            left, right = mu_fit - HW, mu_fit + HW
            appr_left = max(0, min(last_index, int(left)))
            appr_right = min(last_index, appr_left + int(HW*2))
            RMSE = np.sqrt(np.mean((fit - spectrum)**2))
            # normalise by the magnitude of the peak height so the error is never negative;
            # a zero height cannot be normalised and is never accepted as Raman
            NRMSE = RMSE / abs(scale_fit) if scale_fit != 0 else np.inf
            FWHM = wavelengths[appr_right] - wavelengths[appr_left]

            # if the NRMSE is below 0.1 and the full width (FW) is larger than 5, the found spike is Raman.
            if NRMSE < 0.1 and FWHM > 5:
                print("REMOVED", x,y, Range, NRMSE, FWHM)
                if PLOT_ERRORS:
                    print("------------ WRONG ---------------")
                    print('MSE:', NRMSE, popt, [mu, scale, sigma], ',base:', base, ',HM:', HM, ',FWHM:', FWHM)
                    _plot_kept_raman_peak(img, img2, x, y, X, Range, fit, mu_fit, left, right, HM, base, wavelengths)
                    print("------------ END WRONG ---------------")
                continue

            _interpolate_window(img, img2, x, y, X)
            new_cosmic_ray_noise[(x,y)].append(centre)
    return img2, new_cosmic_ray_noise

In [16]:
for i, img in enumerate(data):
    print(filenames[i])

    # The FWHM check inside correcting_4_cosmic_ray_noies reads the notebook-level
    # `wavelength`. Load the axis that belongs to this file so the check does not
    # silently use whichever file an earlier cell happened to load last.
    wavelength = np.load(f"{file_location}{filenames[i].split('.')[0]}_wavelength.npy")

    per_pixel_hits = find_cosmic_ray_noise(img)
    neighbourhood_hits = find_cosmic_ray_noise_neighbourhood(img)

    # A pixel is only treated as containing cosmic ray noise when both detectors flag it;
    # for such a pixel the flagged indices of both detectors are combined.
    cosmic_ray_noise = {
        pixel: list(set(z).union(set(neighbourhood_hits[pixel])))
        for pixel, z in per_pixel_hits.items()
        if pixel in neighbourhood_hits
    }
    img2, cosmic_ray_noise = correcting_4_cosmic_ray_noies(img, cosmic_ray_noise)

    # number of pixels in which at least one region was interpolated
    print(len(cosmic_ray_noise))

    # Optional diagnostics: plot problem points
#     for (x,y), lst in cosmic_ray_noise.items():
#         print(f"x,y = {x, y} cosmic ray wavelengths {[wavelength[int(i)] for i in lst]}")
#         for z in lst:
#             plt.plot([z,z],[-1000,3000], alpha=0.1, color='k')
#         plt.plot(img2[x,y])
#         plt.plot(img[x,y], alpha=0.4)
#         plt.grid(True, which='both')
#         plt.xlim([0,len(wavelength)])
#         locs, _ = plt.xticks()
#         plt.xticks(locs, [wavelength[int(i)] if i < len(wavelength) else "" for i in locs])
#         plt.xlim([0,len(wavelength)])
#         plt.show()
#     break

    # Optional diagnostics: plot all points
#     for x in img2:
#         for y in x:
#             plt.plot(y)
#     plt.show()

    # Uncomment to store the corrected measurement next to the copied wavelength files
#     np.save(f'{file_location2}{filenames[i].split("/")[-1].split(".")[0]}', img2) 


ML-MoS2-532-LP10-2-3-50X.npy
REMOVED 0 0 (122, 130) 0.022076053224124634 7.0049999999999955
REMOVED 0 1 (122, 130) 0.02230307087240372 7.0049999999999955
REMOVED 0 2 (122, 130) 0.023686794554968863 7.0049999999999955
REMOVED 0 3 (122, 130) 0.02381796554989852 7.0049999999999955
REMOVED 0 4 (121, 130) 0.023043672712614854 7.0090000000000146
REMOVED 0 5 (121, 130) 0.023867514864524934 7.0090000000000146
REMOVED 0 6 (122, 130) 0.02185585306343253 7.0049999999999955
REMOVED 0 7 (121, 123) 0.02814615201286357 7.0090000000000146
REMOVED 0 7 (128, 130) 0.026445244773413142 7.0090000000000146
REMOVED 0 8 (122, 130) 0.021167974764770725 7.0049999999999955
REMOVED 0 9 (121, 123) 0.02683437962788629 7.0090000000000146
REMOVED 0 9 (128, 130) 0.025187732780325156 7.0090000000000146
REMOVED 0 10 (121, 123) 0.026425283210828023 7.0090000000000146
REMOVED 0 10 (128, 130) 0.024953359415713066 7.0090000000000146
REMOVED 0 11 (121, 123) 0.02631699564004169 7.0090000000000146
REMOVED 0 11 (128, 129) 0.016

REMOVED 2 23 (45, 46) 0.0464036053433236 5.335999999999956
REMOVED 2 23 (56, 57) 0.00808918698600576 7.100000000000023
REMOVED 2 23 (121, 123) 0.016014688788351536 7.0090000000000146
REMOVED 2 23 (128, 129) 0.016707130077736134 5.2549999999999955
REMOVED 2 24 (45, 46) 0.05828930403850079 5.335999999999956
REMOVED 2 24 (56, 57) 0.0835438926641976 5.324999999999989
REMOVED 2 24 (121, 123) 0.01609714984639165 7.0090000000000146
REMOVED 2 24 (128, 129) 0.01840997815756087 5.2560000000000855
REMOVED 2 25 (45, 46) 0.032596053370233274 5.335999999999956
REMOVED 2 25 (56, 57) 0.01830355249454695 7.100000000000023
REMOVED 2 25 (121, 123) 0.02099652307190315 7.0090000000000146
REMOVED 2 25 (128, 129) 0.01706919491866686 5.2549999999999955
REMOVED 2 26 (45, 46) 0.038521016722197245 5.335999999999956
REMOVED 2 26 (56, 57) 0.01613865207134327 7.100999999999942
REMOVED 2 26 (121, 123) 0.01852698529859346 7.0090000000000146
REMOVED 2 26 (128, 129) 0.017914212221382533 5.2560000000000855
REMOVED 2 27 

REMOVED 4 5 (121, 129) 0.018535149517308325 7.0090000000000146
REMOVED 4 6 (46, 46) 0.0428731509050339 5.335999999999956
REMOVED 4 6 (56, 56) 0.02796541486519299 7.100000000000023
REMOVED 4 6 (121, 123) 0.01732789896214821 5.2560000000000855
REMOVED 4 6 (128, 129) 0.018519198864772844 5.2549999999999955
REMOVED 4 7 (45, 46) 0.03133923527139652 5.335999999999956
REMOVED 4 7 (56, 57) 0.012002909825664211 7.100000000000023
REMOVED 4 7 (121, 123) 0.017564547924964015 7.0090000000000146
REMOVED 4 7 (128, 129) 0.019403853088354742 5.2549999999999955
REMOVED 4 8 (45, 46) 0.044969131609420065 5.335999999999956
REMOVED 4 8 (56, 57) 0.019610737984540074 7.100000000000023
REMOVED 4 8 (121, 129) 0.019731861669063346 7.0090000000000146
REMOVED 4 9 (45, 46) 0.035224245792198246 5.335999999999956
REMOVED 4 9 (56, 57) 0.012208685547504159 7.100999999999942
REMOVED 4 9 (121, 129) 0.02137213290427362 7.0090000000000146
REMOVED 4 10 (45, 46) 0.033316708859396166 5.335999999999956
REMOVED 4 10 (56, 57) 0.

REMOVED 5 20 (56, 57) 0.007031127340492547 7.100000000000023
REMOVED 5 20 (121, 123) 0.014348489196829746 5.2560000000000855
REMOVED 5 20 (128, 129) 0.016713950446618494 5.2549999999999955
REMOVED 5 21 (45, 46) 0.026352336290138053 5.336999999999989
REMOVED 5 21 (56, 58) 0.013808434841029869 7.100000000000023
REMOVED 5 21 (121, 129) 0.018400742568633235 5.2560000000000855
REMOVED 5 22 (44, 46) 0.03128016962384596 5.336999999999989
REMOVED 5 22 (56, 58) 0.006433977517380859 7.100000000000023
REMOVED 5 22 (121, 129) 0.019519859811608652 5.2560000000000855
REMOVED 5 23 (45, 46) 0.03900689271728784 5.336999999999989
REMOVED 5 23 (56, 58) 0.010704430607888943 7.100000000000023
REMOVED 5 23 (121, 129) 0.020744591193697298 7.0090000000000146
REMOVED 5 24 (45, 46) 0.03603739025016817 5.336999999999989
REMOVED 5 24 (57, 57) 0.00732342774553379 7.100000000000023
REMOVED 5 24 (121, 129) 0.01772233770405576 5.2560000000000855
REMOVED 5 25 (57, 58) 0.015868642921046033 7.100000000000023
REMOVED 5 2

REMOVED 9 0 (57, 57) 0.009159784882622136 7.100000000000023
REMOVED 9 0 (121, 128) 0.01817416630008405 5.2560000000000855
REMOVED 9 1 (44, 46) 0.02905132260422462 5.336999999999989
REMOVED 9 1 (55, 57) 0.017777155039065233 7.100000000000023
REMOVED 9 1 (121, 128) 0.022912025499556118 5.2560000000000855
REMOVED 9 2 (44, 45) 0.020770883796532896 5.336999999999989
REMOVED 9 2 (50, 50) 0.03914375661984257 5.336999999999989
REMOVED 9 2 (55, 58) 0.01158767267007261 7.100000000000023
REMOVED 9 2 (121, 128) 0.01837357330466604 5.2560000000000855
REMOVED 9 3 (121, 129) 0.01985920378709617 5.2560000000000855
REMOVED 9 4 (61, 61) 0.06594302974858732 7.100000000000023
REMOVED 9 4 (121, 129) 0.018744001867310345 5.2560000000000855
REMOVED 9 5 (121, 129) 0.02006354310599956 5.2560000000000855
REMOVED 9 6 (121, 129) 0.019996685644959565 5.2560000000000855
REMOVED 9 7 (121, 129) 0.02053843969972919 5.2560000000000855
REMOVED 9 8 (121, 129) 0.01942555796176728 5.2560000000000855
REMOVED 9 9 (121, 129) 