In [1]:
import rpy2.robjects as robjects

# Define the R-side preprocessing routine once and expose it to Python as
# `read_bruker(path, out_path)`.
#
# `library()` is used instead of `require()` so that a missing R package
# raises immediately when this cell runs, rather than surfacing later as a
# "could not find function" error on the first call.
robjects.r('''
library("readBrukerFlexData")
library("MALDIquant")

readBruker <- function(path_, out_path_) {
    sample <- readBrukerFlexDir(path_, removeCalibrationScans = TRUE,
                                removeMetaData = FALSE, useHpc = TRUE, useSpectraNames = TRUE,
                                filterZeroIntensities = FALSE, verbose = FALSE)

    # Only the first entry returned by readBrukerFlexDir is used.
    m <- sample[[1]][[1]]$mass
    i <- sample[[1]][[1]]$intensity

    # Basic preprocessing, all done here in R:
    #   sqrt intensity transform -> Savitzky-Golay smoothing -> SNIP baseline
    #   removal -> TIC intensity calibration -> trim to the 2000-20000 mass range.
    spectra <- createMassSpectrum(mass = m, intensity = i)
    spectra <- transformIntensity(spectra, method = "sqrt")
    spectra <- smoothIntensity(spectra, method = "SavitzkyGolay", halfWindowSize = 10)
    spectra <- removeBaseline(spectra, method = "SNIP", iterations = 20)
    spectra <- calibrateIntensity(spectra, method = "TIC")
    spectra <- trim(spectra, range = c(2000, 20000))

    # Peak detection is currently disabled; the full processed spectrum is exported.
    #pks <- detectPeaks(spectra, method="MAD", halfWindowSize=20, SNR=4)
    pks <- spectra

    mass <- mass(pks)
    intensity <- intensity(pks)
    small_dataframe <- data.frame(mass, intensity, stringsAsFactors = FALSE)

    # Two space-separated columns (mass, intensity), no header and no row names.
    write.table(small_dataframe, out_path_, row.names = FALSE, col.names = FALSE)
}
''')
read_bruker = robjects.globalenv['readBruker']

R[write to console]: Loading required package: readBrukerFlexData

R[write to console]: Loading required package: MALDIquant

R[write to console]: 
This is MALDIquant version 1.22.3
Quantitative Analysis of Mass Spectrometry Data
 See ‘?MALDIquant’ for more information about this package.




In [26]:
import glob
import os
import pandas as pd
import numpy as np

def folder_scan(raw_dir: str) -> dict:
    """Index the two-level directory layout below ``raw_dir``.

    Every path matching ``raw_dir/*/*`` contributes its last path component
    to a set stored under the name of its parent directory.

    Args:
        raw_dir: Directory whose entries two levels down are listed.

    Returns:
        Mapping of first-level directory name to the set of second-level
        entry names found beneath it. Empty when nothing matches.
    """
    entries_by_species = {}
    pattern = os.path.join(raw_dir, '*', '*')
    for match in glob.glob(pattern):
        # The two trailing path components are <species>/<sample>.
        species_name, sample_number = match.split(os.sep)[-2:]
        entries_by_species.setdefault(species_name, set()).add(sample_number)
    print('File scan done.')

    return entries_by_species


def preprocessing(raw_dir: str, preprocessed_dir: str, file_exist_dic: dict) -> None:
    """Run the R preprocessing on every raw sample that has no output yet.

    For each ``species -> sample names`` entry, the raw sample at
    ``raw_dir/<species>/<sample>`` is passed to ``read_bruker`` and written to
    ``preprocessed_dir/<species>/<sample>.txt``. Samples whose output file
    already exists are skipped, so the function can be re-run incrementally.

    Args:
        raw_dir: Root directory of the raw samples.
        preprocessed_dir: Root directory for the preprocessed text files;
            per-species sub-directories are created as needed.
        file_exist_dic: Mapping of species name to an iterable of sample
            names, as produced by ``folder_scan``.

    Returns:
        None. A failure on one sample is reported on stdout and does not
        stop the remaining samples from being processed.
    """
    for species, sample_numbers in file_exist_dic.items():
        raw_path_species = os.path.join(raw_dir, species)
        preprocessed_path = os.path.join(preprocessed_dir, species)
        os.makedirs(preprocessed_path, exist_ok=True)
        for sample_number in sample_numbers:
            raw_path = os.path.join(raw_path_species, sample_number)
            preprocessed_filepath = os.path.join(preprocessed_path, f'{sample_number}.txt')

            if os.path.exists(preprocessed_filepath):
                continue

            print(f'New raw file: {raw_path} found.')
            # Exactly one guarded call per sample: a failing sample must not
            # abort the batch, and successful samples must not be run twice.
            try:
                read_bruker(raw_path, preprocessed_filepath)
            except Exception as exc:
                print(f'Preprocessing of {raw_path} fail: {exc!r}')
            else:
                print(f'Preprocessing {preprocessed_filepath} done.')


def bin_vectorize(preprocessed_file: str, binned_file: str, bin_size: int) -> None:
    """Bin a two-column (mass, intensity) spectrum into a fixed-length vector.

    The mass axis is divided into ``bin_size`` equal-width bins spanning at
    least 2000-20000 (widened if the data extends beyond that range). The
    intensities falling into each bin are summed and the resulting vector is
    written to ``binned_file``, one value per line.

    Args:
        preprocessed_file: Space-separated text file without header; column 0
            is the mass, column 1 the intensity.
        binned_file: Destination path for the binned vector.
        bin_size: Number of bins, i.e. the length of the output vector.

    Returns:
        None. The result is written to ``binned_file``.
    """
    spectra = pd.read_csv(preprocessed_file, sep=' ', index_col=False, header=None).to_numpy()
    masses = spectra[:, 0]
    min_range = min(2000, np.min(masses))
    max_range = max(20000, np.max(masses))

    # Equal-width edges; the same edges np.histogram would produce for this range.
    bin_edges = np.linspace(min_range, max_range, bin_size + 1)

    # With right=True a point belongs to bin i when edges[i-1] < mass <= edges[i];
    # a mass exactly on the lowest edge gets index 0 and is dropped below.
    indices = np.digitize(masses, bin_edges, right=True)
    valid = (indices >= 1) & (indices <= bin_size)

    # Sum the intensities of the *retained* points per bin. Using the filtered
    # rows keeps indices and weights aligned when points are dropped, and
    # bincount avoids materialising an (n_points x bin_size) one-hot matrix.
    vec = np.bincount(indices[valid] - 1, weights=spectra[valid, 1], minlength=bin_size)
    np.savetxt(binned_file, vec, delimiter=",")


def binning(preprocessed_dir: str, binned_dir: str, file_todo_dic: dict, bin_size: int) -> None:
    """Bin every preprocessed sample that has no binned output yet.

    For each ``species -> file names`` entry, the file at
    ``preprocessed_dir/<species>/<name>`` is passed to ``bin_vectorize`` and
    the result is written to ``binned_dir/<species>/<name>``. Existing output
    files are skipped, so the function can be re-run incrementally.

    Args:
        preprocessed_dir: Root directory of the preprocessed text files.
        binned_dir: Root directory for the binned vectors; per-species
            sub-directories are created as needed.
        file_todo_dic: Mapping of species name to an iterable of file names,
            as produced by ``folder_scan``.
        bin_size: Number of bins forwarded to ``bin_vectorize``.

    Returns:
        None. A failure on one file is reported on stdout and does not stop
        the remaining files from being processed.
    """
    for species, sample_numbers in file_todo_dic.items():
        preprocessed_path_species = os.path.join(preprocessed_dir, species)
        binned_path = os.path.join(binned_dir, species)
        os.makedirs(binned_path, exist_ok=True)
        for sample_number in sample_numbers:
            preprocessed_file_path = os.path.join(preprocessed_path_species, sample_number)
            sample_outpath = os.path.join(binned_path, sample_number)

            if os.path.exists(sample_outpath):
                print(f'Binned file {sample_outpath} already exists.')
                continue

            print(f'New preprocessed file: {preprocessed_file_path} found.')
            # Exactly one guarded call per file: a failing file must not abort
            # the batch, and successful files must not be binned twice.
            try:
                bin_vectorize(preprocessed_file_path, sample_outpath, bin_size)
            except Exception as exc:
                print(f'Binning of {preprocessed_file_path} fail: {exc!r}')
            else:
                print(f'Binning {sample_outpath} done.')


    
def scan_preprocessing(bin_size: int, data_dir: str = os.path.join('..', 'data')) -> None:
    """Run the full raw -> preprocessed -> binned pipeline incrementally.

    The working directories ``raw``, ``preprocessed`` and ``binned_<bin_size>``
    are created under ``data_dir`` if missing. The raw tree is scanned and
    preprocessed, then the preprocessed tree is scanned and binned.

    Args:
        bin_size: Number of bins per spectrum; also part of the name of the
            output directory (``binned_<bin_size>``).
        data_dir: Root directory holding the three working directories.
            Defaults to ``../data``, the location that was previously
            hard-coded.

    Returns:
        None. Results are written to disk; the scan results are printed.
    """
    raw_dir = os.path.join(data_dir, 'raw')
    preprocessed_dir = os.path.join(data_dir, 'preprocessed')
    binned_dir = os.path.join(data_dir, f'binned_{bin_size}')
    for directory in (raw_dir, preprocessed_dir, binned_dir):
        os.makedirs(directory, exist_ok=True)

    file_exist_dic = folder_scan(raw_dir)
    print(file_exist_dic)
    preprocessing(raw_dir, preprocessed_dir, file_exist_dic)

    # Re-scan the output tree so that binning also covers files preprocessed
    # in earlier runs, not only the ones produced just now.
    preprocessed_exist_dic = folder_scan(preprocessed_dir)
    print(preprocessed_exist_dic)
    binning(preprocessed_dir, binned_dir, preprocessed_exist_dic, bin_size)

In [27]:
scan_preprocessing(6000)

Raw file scan done.
{'Escherichia_coli': {'example12', 'example1'}}
Raw file scan done.
{'Escherichia_coli': {'example1.txt', 'example12.txt'}}
New raw file: ../data/preprocessed/Escherichia_coli/example1.txt found.
Binning ../data/binned_6000/Escherichia_coli/example1.txt done.
New raw file: ../data/preprocessed/Escherichia_coli/example12.txt found.
Binning ../data/binned_6000/Escherichia_coli/example12.txt done.
