In [1]:
import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
import torch

from maldi_learn.vectorization import BinningVectorizer


class DRIAMS(Dataset):
    def __init__(self):
        self.selected_antibiotics = ['Penicillin', 'Ceftriaxone', 'Vancomycin', 'Piperacillin-Tazobactam',
       'Ciprofloxacin', 'Cefepime', 'Cotrimoxazole', 'Meropenem']

        self.meta = pd.read_csv("../data/Driams/DRIAMS-A/id/2018/2018_clean.csv")

        self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics][self.meta[self.selected_antibiotics].isin(["S","I","R","-"])]
        self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics].replace({"-": np.nan, "R": 0,"I": 0, "S": 1})
        self.meta.dropna(inplace=True,ignore_index=True)
        
        self.dir = "../data/Driams/DRIAMS-A/preprocessed/2018/"
        self.transform = BinningVectorizer(18000, min_bin=2000, max_bin=20000)


    def __len__(self):
        return len(self.meta.index)

    def __getitem__(self, idx):
        id = self.meta.loc[idx, "code"]

        path = os.path.join(f"{self.dir}{id}.txt")
        spectrum = pd.read_csv(path, sep='\s+', comment= "#")
        
        labels = self.meta.loc[idx, self.selected_antibiotics]
        labels = labels.astype("float32")

        min_range = min(spectrum["mass.spectra."])
        min_range = min(min_range, self.transform.min_bin)
        max_range = max(spectrum["mass.spectra."])
        max_range = max(max_range, self.transform.max_bin)
        bin_edges_ = np.linspace(min_range, max_range, self.transform.n_bins + 1)

        times = spectrum["mass.spectra."]
        valid = (times > bin_edges_[0]) & (times <= bin_edges_[-1])
        vec = np.histogram(spectrum["mass.spectra."], bins=bin_edges_, weights=spectrum["intensity.spectra."])[0]
        tensor = torch.from_numpy(np.float32(vec))

        return tensor, torch.from_numpy(labels.values)

In [2]:
from torch.utils.data import DataLoader

driams = DRIAMS()
loader = DataLoader(driams, batch_size=64, shuffle=True, drop_last= True)

  self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics].replace({"-": np.nan, "R": 0,"I": 0, "S": 1})


In [3]:
def loadalldata():
    for _ in loader:
        continue

In [4]:
import timeit

result_pandas = []

for _ in range(10):
    result_pandas.append(timeit.Timer(loadalldata).timeit(number=1))
# calculate the average
result_pandas

[38.54298223501246,
 34.05232154800615,
 33.62475699199422,
 33.622133554003085,
 33.81009316499694,
 33.41008228500141,
 33.35115543300344,
 33.31514028899255,
 33.70088799200312,
 34.985334288998274]

In [ ]:
result_pandas = [38.54298223501246,
 34.05232154800615,
 33.62475699199422,
 33.622133554003085,
 33.81009316499694,
 33.41008228500141,
 33.35115543300344,
 33.31514028899255,
 33.70088799200312,
 34.985334288998274]

In [5]:
import polars

class DRIAMS(Dataset):
    def __init__(self):
        self.selected_antibiotics = ['Penicillin', 'Ceftriaxone', 'Vancomycin', 'Piperacillin-Tazobactam',
       'Ciprofloxacin', 'Cefepime', 'Cotrimoxazole', 'Meropenem']

        self.meta = pd.read_csv("../data/Driams/DRIAMS-A/id/2018/2018_clean.csv")

        self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics][self.meta[self.selected_antibiotics].isin(["S","I","R","-"])]
        self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics].replace({"-": np.nan, "R": 0,"I": 0, "S": 1})
        self.meta.dropna(inplace=True,ignore_index=True)
        
        self.dir = "../data/Driams/DRIAMS-A/preprocessed/2018/"
        self.transform = BinningVectorizer(18000, min_bin=2000, max_bin=20000)


    def __len__(self):
        return len(self.meta.index)

    def __getitem__(self, idx):
        id = self.meta.loc[idx, "code"]

        path = os.path.join(f"{self.dir}{id}.txt")
        spectrum = polars.read_csv(path, separator=' ', comment_prefix = "#")
        
        labels = self.meta.loc[idx, self.selected_antibiotics]
        labels = labels.astype("float32")

        min_range = min(spectrum["mass.spectra."])
        min_range = min(min_range, self.transform.min_bin)
        max_range = max(spectrum["mass.spectra."])
        max_range = max(max_range, self.transform.max_bin)
        bin_edges_ = np.linspace(min_range, max_range, self.transform.n_bins + 1)

        times = spectrum["mass.spectra."]
        valid = (times > bin_edges_[0]) & (times <= bin_edges_[-1])
        vec = np.histogram(spectrum["mass.spectra."], bins=bin_edges_, weights=spectrum["intensity.spectra."])[0]
        tensor = torch.from_numpy(np.float32(vec))

        return tensor, torch.from_numpy(labels.values)

In [6]:
from torch.utils.data import DataLoader

driams = DRIAMS()
loader = DataLoader(driams, batch_size=64, shuffle=True, drop_last= True)

  self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics].replace({"-": np.nan, "R": 0,"I": 0, "S": 1})


In [7]:
def loadalldata():
    for _ in loader:
        continue

In [8]:
import timeit

result_polars = []

for _ in range(10):
    result_polars.append(timeit.Timer(loadalldata).timeit(number=1))
    
result_polars

[15.820284066998283,
 15.291565140010789,
 15.71043348700914,
 15.782293251992087,
 16.970831714992528,
 16.107691868994152,
 17.22435426499578,
 18.340394825994736,
 17.494993004002026,
 17.49683939300303]

In [ ]:
result_polars = [15.820284066998283,
 15.291565140010789,
 15.71043348700914,
 15.782293251992087,
 16.970831714992528,
 16.107691868994152,
 17.22435426499578,
 18.340394825994736,
 17.494993004002026,
 17.49683939300303]

In [9]:
class DRIAMS(Dataset):
    def __init__(self):
        self.selected_antibiotics = ['Penicillin', 'Ceftriaxone', 'Vancomycin', 'Piperacillin-Tazobactam',
       'Ciprofloxacin', 'Cefepime', 'Cotrimoxazole', 'Meropenem']

        self.meta = pd.read_csv("../data/Driams/DRIAMS-A/id/2018/2018_clean.csv")

        self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics][self.meta[self.selected_antibiotics].isin(["S","I","R","-"])]
        self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics].replace({"-": np.nan, "R": 0,"I": 0, "S": 1})
        self.meta.dropna(inplace=True,ignore_index=True)
        
        self.dir = "../data/Driams/DRIAMS-A/preprocessed/2018/"
        self.transform = BinningVectorizer(18000, min_bin=2000, max_bin=20000)


    def __len__(self):
        return len(self.meta.index)

    def __getitem__(self, idx):
        id = self.meta.loc[idx, "code"]

        path = os.path.join(f"{self.dir}{id}.txt")
        spectrum = pd.read_csv(path, sep=' ', engine= "pyarrow", skiprows = 2, header = 2)
        
        labels = self.meta.loc[idx, self.selected_antibiotics]
        labels = labels.astype("float32")

        min_range = min(spectrum["mass.spectra."])
        min_range = min(min_range, self.transform.min_bin)
        max_range = max(spectrum["mass.spectra."])
        max_range = max(max_range, self.transform.max_bin)
        bin_edges_ = np.linspace(min_range, max_range, self.transform.n_bins + 1)

        times = spectrum["mass.spectra."]
        valid = (times > bin_edges_[0]) & (times <= bin_edges_[-1])
        vec = np.histogram(spectrum["mass.spectra."], bins=bin_edges_, weights=spectrum["intensity.spectra."])[0]
        tensor = torch.from_numpy(np.float32(vec))

        return tensor, torch.from_numpy(labels.values)

In [10]:
from torch.utils.data import DataLoader

driams = DRIAMS()
loader = DataLoader(driams, batch_size=64, shuffle=True, drop_last= True)

def loadalldata():
    for _ in loader:
        continue
        
import timeit

result_pyarrow = []

for _ in range(10):
    result_pyarrow.append(timeit.Timer(loadalldata).timeit(number=1))
    
result_pyarrow

  self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics].replace({"-": np.nan, "R": 0,"I": 0, "S": 1})


[26.16419681799016,
 27.45790529200167,
 26.395152655008133,
 27.5912802870007,
 25.30875223201292,
 25.00190786800522,
 24.433556490999763,
 24.81148859199311,
 25.11939408699982,
 24.20830324299459]

In [ ]:
result_pyarrow = [26.16419681799016,
 27.45790529200167,
 26.395152655008133,
 27.5912802870007,
 25.30875223201292,
 25.00190786800522,
 24.433556490999763,
 24.81148859199311,
 25.11939408699982,
 24.20830324299459]

In [11]:
from torch.utils.data import Dataset
import pandas as pd
import numpy as np

class DRIAMS(Dataset):
    def __init__(self):
        self.selected_antibiotics = ['Penicillin', 'Ceftriaxone', 'Vancomycin', 'Piperacillin-Tazobactam',
       'Ciprofloxacin', 'Cefepime', 'Cotrimoxazole', 'Meropenem']

        self.meta = pd.read_csv("../data/Driams/DRIAMS-A/id/2018/2018_clean.csv")

        self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics][self.meta[self.selected_antibiotics].isin(["S","I","R","-"])]
        self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics].replace({"-": np.nan, "R": 0,"I": 0, "S": 1})
        self.meta.dropna(inplace=True,ignore_index=True)
        
        self.dir = "../data/Driams/DRIAMS-A/preprocessed/2018/"
        self.transform = BinningVectorizer(18000, min_bin=2000, max_bin=20000)


    def __len__(self):
        return len(self.meta.index)

    def __getitem__(self, idx):
        id = self.meta.loc[idx, "code"]

        path = os.path.join(f"{self.dir}{id}.txt")
        spectrum = np.genfromtxt(path,skip_header = 3,names = ["massspectra", "intensityspectra"], dtype = np.float32)
        
        labels = self.meta.loc[idx, self.selected_antibiotics]
        labels = labels.astype("float32")

        min_range = min(spectrum["massspectra"])
        min_range = min(min_range, self.transform.min_bin)
        max_range = max(spectrum["massspectra"])
        max_range = max(max_range, self.transform.max_bin)
        bin_edges_ = np.linspace(min_range, max_range, self.transform.n_bins + 1)

        times = spectrum["massspectra"]
        valid = (times > bin_edges_[0]) & (times <= bin_edges_[-1])
        vec = np.histogram(spectrum["massspectra"], bins=bin_edges_, weights=spectrum["intensityspectra"])[0]
        tensor = torch.from_numpy(np.float32(vec))

        return tensor, torch.from_numpy(labels.values)

In [12]:
from torch.utils.data import DataLoader

driams = DRIAMS()
loader = DataLoader(driams, batch_size=64, shuffle=True, drop_last= True)

def loadalldata():
    for _ in loader:
        continue
        
import timeit

result_numpy = []

for _ in range(10):
    result_numpy.append(timeit.Timer(loadalldata).timeit(number=1))
    
result_numpy

  self.meta[self.selected_antibiotics] = self.meta[self.selected_antibiotics].replace({"-": np.nan, "R": 0,"I": 0, "S": 1})


[99.67195300399908,
 100.24804145999951,
 105.51859697699547,
 108.03765478699643,
 114.84337878899532,
 105.67098381801043,
 106.0676439650124,
 107.46306798100704,
 103.34276368799328,
 104.50932237898814]

In [ ]:
result_numpy = [99.67195300399908,
 100.24804145999951,
 105.51859697699547,
 108.03765478699643,
 114.84337878899532,
 105.67098381801043,
 106.0676439650124,
 107.46306798100704,
 103.34276368799328,
 104.50932237898814]

In [2]:
from src.maldi2resistance.data.driams import Driams

driams = Driams(
    root_dir="../data/Driams",
    sites=["DRIAMS-A"],
    years=[2018],
    antibiotics= ['Penicillin', 'Ceftriaxone', 'Vancomycin', 'Piperacillin-Tazobactam','Ciprofloxacin', 'Cefepime', 'Cotrimoxazole', 'Meropenem']
)

  selected_columns.replace(replace_dict, inplace = True)


In [None]:
from torch.utils.data import DataLoader

loader = DataLoader(driams, batch_size=64, shuffle=True, drop_last= True)

def loadalldata():
    for _ in loader:
        continue
        
import timeit

result_implemented = []

for _ in range(10):
    result_implemented.append(timeit.Timer(loadalldata).timeit(number=1))
    
result_implemented

In [ ]:
result_implemented

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
  
fig, ax = plt.subplots()
 
# Creating plot

ax.boxplot([result_pandas, result_pyarrow,result_polars, result_numpy])
ax.set_xticklabels(["Pandas", "Pyarrow", "Polars", "Numpy"])
ax.set_title("Test runtime of pytorch Dataset implementations")
ax.set_xlabel('Implementation')
ax.set_ylabel('Run time in seconds')
 
# show plot
#plt.savefig("../data/created_figures/runtime_test.png")
plt.show()

In [None]:
from maldi_nn.scripts.process_DRIAMS import DRIAMS_raw_spectra_to_h5torch

#DRIAMS_raw_spectra_to_h5torch("/home/jan/Uni/master/data/Driams/","/home/jan/Uni/master/data/Driams_complete")