In [None]:
import pandas as pd


def read_tab_separated_file(filename):
    """Load a tab-separated spectrum file as a two-column DataFrame.

    The file must contain exactly three columns. They are relabelled by
    position as ``ppm``, ``intensity`` and ``none``, after which the third
    column is discarded. The first line of the file is consumed as the
    header row (pandas default), so it never appears in the data.

    Parameters
    ----------
    filename : str or path-like
        Location of the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``ppm`` and ``intensity``.

    Raises
    ------
    ValueError
        If the file does not have exactly three columns.
    """
    raw_table = pd.read_csv(filename, sep='\t')
    labelled = raw_table.set_axis(['ppm', 'intensity', 'none'], axis=1)
    return labelled.drop(columns=['none'])

def read_molecule(filename):
    """Load both spectrum files belonging to one molecule.

    Parameters
    ----------
    filename : str
        Common file prefix; the suffixes ``_correct_cnmr.csv`` and
        ``_correct_hnmr.csv`` are appended to it.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cnmr, hnmr)`` frames as produced by ``read_tab_separated_file``.
    """
    cnmr_path = filename + "_correct_cnmr.csv"
    hnmr_path = filename + "_correct_hnmr.csv"
    cnmr_table = read_tab_separated_file(cnmr_path)
    hnmr_table = read_tab_separated_file(hnmr_path)
    return cnmr_table, hnmr_table

# Raw spectra per molecule, keyed by molecular formula: (cnmr, hnmr) frames.
dict_save = {}
for molecule in ["C23H30O5", "C13H14O4", "C12H15NOS", "C20H18O6", "C19H29NO3"]:
    cnmr_raw, hnmr_raw = read_molecule(molecule)
    dict_save[molecule] = (cnmr_raw, hnmr_raw)
    # Report how many rows each spectrum contains as a quick sanity check.
    print(molecule, len(cnmr_raw), len(hnmr_raw))

In [None]:
import numpy as np
from scipy.interpolate import interp1d


def interpolate_data(data, num_points, ppm_min=-1.9981, ppm_max=10):
    """Resample a spectrum onto an evenly spaced ppm grid.

    Each grid point takes the intensity of the next data point at or after
    it (``interp1d`` with ``kind="next"``).

    Parameters
    ----------
    data : pandas.DataFrame
        Spectrum with the columns ``ppm`` and ``intensity``.
    num_points : int
        Number of grid points; both grid end points are included.
    ppm_min, ppm_max : float, optional
        Limits of the output grid. The defaults reproduce the window that
        was previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        Frame with ``num_points`` rows and the columns ``ppm`` and
        ``intensity``.

    Raises
    ------
    ValueError
        If the requested grid extends beyond the ppm range covered by
        ``data`` (``interp1d`` does not extrapolate by default).
    """
    f = interp1d(data['ppm'], data['intensity'], kind="next")
    new_ppm = np.linspace(ppm_min, ppm_max, num_points)
    # Plain arrays are enough here; DataFrame builds the same default index.
    return pd.DataFrame({'ppm': new_ppm, 'intensity': f(new_ppm)})

# Interpolated counterpart of dict_save: every stored spectrum of a molecule
# is resampled onto 10000 points, keeping the order of the stored tuple.
data_save_interpolated = {}

for molecule in ["C23H30O5", "C13H14O4", "C12H15NOS", "C20H18O6", "C19H29NO3"]:
    data_save_interpolated[molecule] = tuple(
        interpolate_data(spectrum, 10000) for spectrum in dict_save[molecule]
    )


def combine_peaks(peaks, threshold=0.59):
    """Merge consecutive peaks that lie closer together than ``threshold``.

    Peaks are visited in the order given. A peak whose position is less than
    ``threshold`` away from the most recently kept peak is merged into it:
    the merged entry takes the position of the newer peak and the sum of
    both intensities. Otherwise the peak starts a new entry.

    Parameters
    ----------
    peaks : iterable of 2-item sequences
        ``(position, intensity)`` pairs, e.g. rows of a two-column array.
    threshold : float, optional
        Maximum distance at which two neighbouring peaks are still merged.
        Defaults to 0.59, the value that was previously hard-coded.

    Returns
    -------
    list
        Kept peaks in input order. Unmerged peaks are returned as given;
        merged peaks are ``(position, intensity)`` tuples.
    """
    combined_peaks = []
    for peak in peaks:
        # Compare the absolute distance: with a signed difference, input in
        # descending order always yields a negative value, which would merge
        # every peak into a single entry.
        if combined_peaks and abs(peak[0] - combined_peaks[-1][0]) < threshold:
            previous = combined_peaks[-1]
            combined_peaks[-1] = (peak[0], peak[1] + previous[1])
        else:
            combined_peaks.append(peak)
    return combined_peaks

In [None]:
# Canonicalize the SMILES and assemble the spectra dataset
from rdkit import Chem


# SMILES string of every molecule, keyed by molecular formula. The key order
# is also the row order of the exported table.
SMILES_DICT = {
    "C23H30O5": "OC1=C(C=O)C(O)=C([C@H](CC(C)C)C[C@@]2(CC[C@@H]3C[C@@H]2C3(C)C)O4)C4=C1C=O",
    "C13H14O4": "CC(C(OC1=CC(C)=C2OC)=O)=C(C1=C2)OC",
    "C12H15NOS": "O=C(NCCSC)/C=C/C1=CC=CC=C1",
    "C20H18O6": "O[C@H](C1=COC2=C1[C@@]3(C)C(C=CC4=C5CCC4=O)=C5C2=O)[C@H]([C@H]3O)OC",
    "C19H29NO3": "CCCCCCCC[C@H]1CCC(C2=C1C(C(OC)=C(C)N2)=O)=O"
}

dataframe_spectra = pd.DataFrame(columns=["smiles", "h_nmr_cnn", "c_nmr"])

# One row per molecule: canonical SMILES, the interpolated H-NMR intensities
# in reversed order, and the positions of the merged C-NMR peaks.
for index, molecule in enumerate(SMILES_DICT):
    print(molecule)

    # Round-trip through RDKit to obtain its canonical SMILES form.
    mol = Chem.MolFromSmiles(SMILES_DICT[molecule])
    canon_smi = Chem.MolToSmiles(mol)

    # Second tuple entry holds the frame read from the *_hnmr file.
    hnmr_interpolated = data_save_interpolated[molecule][1]
    spectra_hnmr = hnmr_interpolated["intensity"].to_numpy()[::-1]

    # First tuple entry holds the frame read from the *_cnmr file; only rows
    # with an intensity above 0.01 are treated as peaks.
    cnmr_raw = dict_save[molecule][0]
    above_cutoff = cnmr_raw['intensity'] > 0.01
    spectra_cnmr = cnmr_raw.loc[above_cutoff].to_numpy()
    merged_peaks = combine_peaks(spectra_cnmr)
    combine_peaks_ = np.array(merged_peaks)[:, 0]  # keep the positions only

    dataframe_spectra.loc[index] = [canon_smi, spectra_hnmr, combine_peaks_]

dataframe_spectra.to_parquet("angewandte_molecules.parquet")
