In [1]:
import numpy
from matchms.importing import load_from_msp
from matchms.filtering import reduce_to_number_of_peaks
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def filter_none(iterable):
    """Return a new list with every item of ``iterable`` that is not None.

    Order is preserved, and other falsy values (0, "", False) are kept;
    only the ``None`` object itself is dropped.
    """
    return [item for item in iterable if item is not None]

In [6]:
def safe_convert_float(x):
    """Convert ``x`` to a float, falling back to ``0.0`` when that fails.

    Parameters
    ----------
    x : object
        Value handed to the built-in ``float()``.

    Returns
    -------
    float
        ``float(x)`` on success. ``0.0`` when the conversion raises
        ``TypeError`` (e.g. ``None``), ``ValueError`` (e.g. a non-numeric
        string) or ``OverflowError`` (e.g. an integer too large for a float).
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures are swallowed. A bare ``except``
        # would also hide KeyboardInterrupt and SystemExit, making a long
        # conversion loop impossible to interrupt.
        return 0.0

In [None]:
# Read every spectrum from the MSP library file into memory.
spectra = list(load_from_msp("GCMS DB-Public-KovatsRI-VS3.msp"))

# Minimum number of peaks passed to the matchms filter as `n_required`.
k = 5

# Run the filter on each spectrum, then drop the None entries it produces
# (presumably spectra with fewer than `k` peaks -- confirm against the
# matchms documentation for reduce_to_number_of_peaks).
spectra = filter_none(
    [
        reduce_to_number_of_peaks(spectrum, n_required=k)
        for spectrum in spectra
    ]
)


In [3]:
# Pull the per-spectrum fields into parallel lists (same order as `spectra`).
smiles = [spectrum.get("smiles") for spectrum in spectra]

# Numeric metadata goes through safe_convert_float, so values that cannot be
# converted end up as 0.
exact_mass = [
    safe_convert_float(spectrum.get("exactmass")) for spectrum in spectra
]
ri = [
    safe_convert_float(spectrum.get("retentionindex")) for spectrum in spectra
]

# NOTE(review): `to_numpy` is read as an attribute, not called -- presumably
# it is a property on the matchms peaks object; confirm for the installed version.
peaks = [spectrum.peaks.to_numpy for spectrum in spectra]

In [None]:
# numpy.nonzero returns one index array per dimension; for these flat lists
# element 0 holds the positions of the non-zero entries.
exact_mass_idx = [*numpy.nonzero(exact_mass)[0]]
ri_idx = [*numpy.nonzero(ri)[0]]

# Keep only the positions where BOTH exact mass and retention index are non-zero.
indices = numpy.intersect1d(ar1=exact_mass_idx, ar2=ri_idx)

In [None]:
# Select the entries at `indices`: exact mass becomes the model input `x`,
# retention index becomes the target `y`.
x = numpy.take(a=exact_mass, indices=indices)
y = numpy.take(a=ri, indices=indices)

In [None]:
def normalize(data):
    """Center ``data`` on its mean and scale it by the largest absolute deviation.

    Parameters
    ----------
    data : array_like
        Numeric values to normalize.

    Returns
    -------
    numpy.ndarray
        ``(data - mean(data)) / max(abs(data - mean(data)))``, so every value
        lies in ``[-1, 1]``. If all values are identical the result is all
        zeros, and an empty input yields an empty float array.
    """
    values = numpy.asarray(data)

    # Empty input: numpy.mean would warn and numpy.max would raise.
    if values.size == 0:
        return values.astype(float)

    demeaned = values - numpy.mean(values)
    quotient = numpy.max(numpy.abs(demeaned))

    # Constant input: every deviation is 0, so dividing would give 0/0 = NaN.
    # The centered values (all zeros) are already the sensible result.
    if quotient == 0:
        return demeaned

    return demeaned / quotient

In [None]:
# Apply the same normalization to inputs and targets, each scaled
# independently by its own mean and maximum absolute deviation.
x_normalized, y_normalized = normalize(x), normalize(y)

In [None]:
def make_dataset(data, batch, shuffle_buffer=2048):
    """Build a shuffled, batched ``tf.data.Dataset`` from in-memory data.

    Parameters
    ----------
    data
        Anything accepted by ``tf.data.Dataset.from_tensor_slices``. Pass
        features and targets together as a tuple ``(x, y)`` so that they are
        shuffled as pairs.
    batch : int
        Number of consecutive elements combined into each batch.
    shuffle_buffer : int, optional
        Buffer size handed to ``Dataset.shuffle``. Defaults to 2048, the value
        this function previously hard-coded.

    Returns
    -------
    tf.data.Dataset
        The sliced dataset, shuffled and then batched.
    """
    sliced = tf.data.Dataset.from_tensor_slices(data)
    shuffled = sliced.shuffle(shuffle_buffer)
    return shuffled.batch(batch)

In [None]:
batch_size = 64

# Slice features and targets together as (x, y) pairs BEFORE shuffling.
# Shuffling them in two separate datasets and zipping afterwards pairs each
# input with an unrelated target, because each shuffle draws its own order.
ds = make_dataset((x_normalized, y_normalized), batch_size)

# Feature-only and target-only views of `ds`, kept so these names stay defined.
x_train = ds.map(lambda features, targets: features)
y_train = ds.map(lambda features, targets: targets)

In [None]:
# Fully connected network mapping one scalar input to one scalar output.
inputs = tf.keras.Input(shape=(1,))

# ReLU with threshold=-1 applied directly to the input tensor.
# NOTE(review): presumably chosen so normalized inputs (which lie in [-1, 1])
# pass through unchanged -- confirm the intent.
act0 = tf.keras.activations.relu(inputs, threshold=-1)

# First hidden layer; no `activation` argument is passed here.
dense0 = tf.keras.layers.Dense(
    units=1000,
    kernel_initializer='normal',
)(act0)

# Four ReLU hidden layers of widths 2000, 2000, 1000, 1000.
dense1 = tf.keras.layers.Dense(
    units=2000,
    activation=tf.nn.relu,
    kernel_initializer='normal',
)(dense0)
dense2 = tf.keras.layers.Dense(
    units=2000,
    activation=tf.nn.relu,
    kernel_initializer='normal',
)(dense1)
dense3 = tf.keras.layers.Dense(
    units=1000,
    activation=tf.nn.relu,
    kernel_initializer='normal',
)(dense2)
dense4 = tf.keras.layers.Dense(
    units=1000,
    activation=tf.nn.relu,
    kernel_initializer='normal',
)(dense3)

# tanh keeps the prediction inside (-1, 1).
outputs = tf.keras.layers.Dense(
    units=1,
    activation=tf.nn.tanh,
    kernel_initializer='normal',
)(dense4)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Configure training for a regression target: mean absolute error as the
# loss, optimized with Adam.
model.compile(
    loss=tf.keras.losses.MeanAbsoluteError(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    # "accuracy" is a classification metric and is not meaningful for a
    # continuous target; track mean squared error alongside the MAE loss.
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

In [None]:
# Train on the batched tf.data pipeline `ds` for five epochs and keep the
# object returned by fit() in `history`.
history = model.fit(
    x=ds,
    epochs=5,
)

In [None]:
# Train again, this time directly on the in-memory arrays. This continues
# from the weights `model` already has and overwrites `history`.
#
# `workers`, `use_multiprocessing` and `max_queue_size` were removed: Keras
# documents them as applying to generator / Sequence input only, so they had
# no effect on array input, and newer Keras versions reject them.
# Arguments that merely restated fit()'s defaults were dropped as well.
history = model.fit(
    x=x_normalized,
    y=y_normalized,
    batch_size=batch_size,
    epochs=10,
    verbose="auto",
    # Keras takes the validation samples from the END of the arrays, before
    # shuffling. NOTE(review): if the library file is ordered, the held-out
    # 20% may not be representative -- consider shuffling the arrays first.
    validation_split=0.2,
    shuffle=True,
)

In [None]:
# Display the recorded per-epoch loss and metric values. Printing the History
# object itself only shows its repr, not the numbers.
history.history