In [9]:
import csv
import hashlib
import os
import re
from urllib.parse import quote

import pandas as pd
import requests
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import MolToSmiles
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

In [None]:
#pip install requests

Orts: iNEXTG2-Plate-01 - iNEXTG2-Plate-09

In [None]:
# Function to generate molecule images from SMILES + save them with corresponding txt.row of the SMILES code
def generate_molecule_images(dataframe, output_folder):
    """Render every parseable SMILES of ``dataframe`` as a PNG image.

    For each row whose ``SMILES`` value can be parsed by RDKit, an image named
    ``molecule_<index>.png`` is written to ``output_folder`` and a line
    ``<index>, <smiles>`` is appended to ``molecule_info.txt`` in the same folder.

    Args:
        dataframe: pandas DataFrame with a ``SMILES`` column.
        output_folder: Target directory; created if it does not exist.

    Rows whose SMILES is not a string (e.g. an empty CSV cell read back as NaN)
    or that RDKit cannot parse are skipped silently.
    """
    os.makedirs(output_folder, exist_ok=True)

    with open(os.path.join(output_folder, 'molecule_info.txt'), 'w') as out_file:
        for index, smiles in dataframe['SMILES'].items():
            # pandas reads empty cells as NaN (a float); MolFromSmiles expects a str
            if not isinstance(smiles, str):
                continue
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            img = Draw.MolToImage(mol)
            img.save(os.path.join(output_folder, f'molecule_{index}.png'))
            out_file.write(f"{index}, {smiles}\n")

In [None]:
# Assuming all necessary imports are done earlier in the notebook
# Data extraction (e.g. Solvent and .mol) from SDF file, storing in CSV file
# Converting .mol from SDF to SMILES in CSV

# NOTE(review): the section heading mentions Plate-01 to Plate-09, but this loop
# only processes plates 01-08 — confirm which one is intended.
for i in range(1, 9):
    plate_number = f"{i:02d}"
    sdf_file_path = f"datasets/raw/orts/iNEXT_Lib_1D/iNEXTG2-Plate-{plate_number}.SDF"
    csv_file_path = f"datasets/processed/orts/iNEXT_Lib_1D/iNEXTG2-Plate-{plate_number}.csv"
    output_folder_path = f"datasets/processed/orts/images/images_iNEXTG2-Plate-{plate_number}"

    # Parse the SDF once. LoadSDF keeps the parsed molecule of each row in the
    # "ROMol" column, so the SMILES are derived from exactly the rows they belong to.
    sdf_df = PandasTools.LoadSDF(sdf_file_path)

    # Process DataFrame, Extraction of Information, .mol from SDF into SMILES for CSV
    sdf_df["Solvent"] = "DMSO"
    # .get() covers a file without any readable molecule (no "ROMol" column)
    sdf_df["SMILES"] = [Chem.MolToSmiles(mol) for mol in sdf_df.get("ROMol", [])]
    sdf_df["SDF present"] = "iNEXT_Lib_1D"
    sdf_df["SDF file path"] = sdf_file_path

    # Make sure the target directory exists before writing
    os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
    sdf_df.to_csv(csv_file_path, index=False, mode="w")

    # Re-read the CSV so the image numbering matches the CSV row index
    df = pd.read_csv(csv_file_path)

    # Generate molecule images for the SMILES of this plate
    generate_molecule_images(df, output_folder_path)

nmrshiftdb2: nmredata

In [None]:
# NOTE(review): `params` is computed here but never passed to the supplier or
# used anywhere else in the visible notebook.
params = Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_PROPERTIES
sdf_supplier = Chem.SDMolSupplier('datasets/raw/nmrshiftdb2/nmredata/nmrshiftdb2.nmredata.sd', sanitize=True, removeHs=False)

# Despite its name, `mols` collects the SMILES strings of the readable records.
mols = []
solvent_info = []

for mol in sdf_supplier:
    if mol is None:
        # Unreadable record: skipped for both lists so they stay aligned
        continue
    mols.append(Chem.MolToSmiles(mol))
    # Build the property dict once per molecule; a missing solvent becomes None
    solvent_info.append(mol.GetPropsAsDict().get('NMREDATA_SOLVENT'))

new_data = {
    "SMILES": mols,
    "SDF present": "nmredata",
    # Fixed typo in the recorded path ("nmrshfitdb2" -> "nmrshiftdb2")
    "SDF file path": "/data/shared/projects/nmr2structure/datasets/raw/nmrshiftdb2/nmredata/nmrshiftdb2.nmredata.sd",
    # Stored directly under its final column name instead of renaming afterwards
    "Solvent": solvent_info,
}
new_df = pd.DataFrame(new_data)

nmredata_csv_path = "datasets/processed/nmrshiftdb2/nmredata/nmrshiftdb2.nmredata.csv"
os.makedirs(os.path.dirname(nmredata_csv_path), exist_ok=True)
new_df.to_csv(nmredata_csv_path, index=False, mode="w")


In [None]:
csv_file_path = 'datasets/processed/nmrshiftdb2/nmredata/nmrshiftdb2.nmredata.csv'
df = pd.read_csv(csv_file_path)

# NOTE(review): generate_molecule_images is defined in several cells of this
# notebook; the definition executed last is the one in effect.
def generate_molecule_images(dataframe, output_folder):
    """Render every parseable SMILES of ``dataframe`` as a PNG image.

    For each row whose ``SMILES`` value can be parsed by RDKit, an image named
    ``molecule_<index>.png`` is written to ``output_folder`` and a line
    ``<index>, <smiles>`` is appended to ``molecule_info.txt`` in the same folder.

    Args:
        dataframe: pandas DataFrame with a ``SMILES`` column.
        output_folder: Target directory; created if it does not exist.

    Rows whose SMILES is not a string (e.g. an empty CSV cell read back as NaN)
    or that RDKit cannot parse are skipped silently.
    """
    os.makedirs(output_folder, exist_ok=True)

    with open(os.path.join(output_folder, 'molecule_info.txt'), 'w') as out_file:
        for index, smiles in dataframe['SMILES'].items():
            # pandas reads empty cells as NaN (a float); MolFromSmiles expects a str
            if not isinstance(smiles, str):
                continue
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            img = Draw.MolToImage(mol)
            img.save(os.path.join(output_folder, f'molecule_{index}.png'))
            out_file.write(f"{index}, {smiles}\n")

output_folder_path = 'datasets/processed/nmrshiftdb2/nmredata/images.nmredata/'
generate_molecule_images(df, output_folder_path)

In [47]:
# Add, for every CSV row, the path where its structure image is expected,
# then write the table back to the same CSV file.
csv_file_path = updated_csv_file_path = 'datasets/processed/nmrshiftdb2/nmredata/nmrshiftdb2.nmredata.csv'
image_folder = '/data/shared/projects/nmr2structure/datasets/processed/nmrshiftdb2/nmredata/images.nmredata/'

df = pd.read_csv(csv_file_path)

# Image files are named after the row index (molecule_<index>.png)
df['Image folder location'] = [
    os.path.join(image_folder, f'molecule_{row_label}.png') for row_label in df.index
]

df.to_csv(updated_csv_file_path, index=False)

nmrshiftdb2: withsignals

In [None]:
# Read the withsignals SDF twice: once as a list of RDKit molecules (unreadable
# records dropped) and once as a DataFrame of the record properties.
withsignals_sdf_path = "datasets/raw/nmrshiftdb2/withsignals/nmrshiftdb2withsignals.sd"

sdf_supplier = Chem.SDMolSupplier(withsignals_sdf_path)
mols = list(filter(lambda candidate: candidate is not None, sdf_supplier))

sdf_df = PandasTools.LoadSDF(withsignals_sdf_path)

In [3]:
# Columns appended to the withsignals table before it is exported as CSV.
# `mols` and `sdf_df` come from the preceding loading cell.
withsignals_columns = {
    "SMILES": list(map(Chem.MolToSmiles, mols)),
    "SDF present": "withsignals",
    "SDF file path": "/data/shared/projects/nmr2structure/datasets/raw/nmrshiftdb2/withsignals/nmrshiftdb2withsignals.sd",
}
for column_name, column_values in withsignals_columns.items():
    sdf_df[column_name] = column_values

sdf_df.to_csv(
    "datasets/processed/nmrshiftdb2/withsignals/nmrshiftdb2withsignals.csv",
    index=False,
    mode="w",
)

In [None]:
csv_file_path = 'datasets/processed/nmrshiftdb2/withsignals/nmrshiftdb2withsignals.csv'
df = pd.read_csv(csv_file_path)

# NOTE(review): generate_molecule_images is defined in several cells of this
# notebook; the definition executed last is the one in effect.
def generate_molecule_images(dataframe, output_folder):
    """Render every parseable SMILES of ``dataframe`` as a PNG image.

    For each row whose ``SMILES`` value can be parsed by RDKit, an image named
    ``molecule_<index>.png`` is written to ``output_folder`` and a line
    ``<index>, <smiles>`` is appended to ``molecule_info.txt`` in the same folder.

    Args:
        dataframe: pandas DataFrame with a ``SMILES`` column.
        output_folder: Target directory; created if it does not exist.

    Rows whose SMILES is not a string (e.g. an empty CSV cell read back as NaN)
    or that RDKit cannot parse are skipped silently.
    """
    os.makedirs(output_folder, exist_ok=True)

    with open(os.path.join(output_folder, 'molecule_info.txt'), 'w') as out_file:
        for index, smiles in dataframe['SMILES'].items():
            # pandas reads empty cells as NaN (a float); MolFromSmiles expects a str
            if not isinstance(smiles, str):
                continue
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            img = Draw.MolToImage(mol)
            img.save(os.path.join(output_folder, f'molecule_{index}.png'))
            out_file.write(f"{index}, {smiles}\n")

output_folder_path = 'datasets/processed/nmrshiftdb2/withsignals/images.withsignals/'

generate_molecule_images(df, output_folder_path)

In [None]:
# Add, for every CSV row, the path where its structure image is expected,
# then write the table back to the same CSV file.
csv_file_path = updated_csv_file_path = 'datasets/processed/nmrshiftdb2/withsignals/nmrshiftdb2withsignals.csv'
image_folder = '/data/shared/projects/nmr2structure/datasets/processed/nmrshiftdb2/withsignals/images.withsignals/'

df = pd.read_csv(csv_file_path)

# Image files are named after the row index (molecule_<index>.png)
df['Image folder location'] = [
    os.path.join(image_folder, f'molecule_{row_label}.png') for row_label in df.index
]

df.to_csv(updated_csv_file_path, index=False)

nmrshiftdb2: check for identical pairs between nmredata and withsignals

In [None]:
# Function to calculate hash of image file
def calculate_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in binary mode in chunks of 4096 bytes, so it never has to
    fit into memory as a whole.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: stream.read(4096), b""):
            digest.update(chunk)
    return digest.hexdigest()

folder1 = "datasets/processed/nmrshiftdb2/nmredata/images.nmredata"
folder2 = "datasets/processed/nmrshiftdb2/withsignals/images.withsignals"

IMAGE_EXTENSIONS = (".jpg", ".png", ".gif", ".bmp")


def _hash_images(folder):
    """Return a dict mapping content hash -> file path for all images below ``folder``.

    Files are visited in sorted order so the result is reproducible. If several
    files share the same hash only the one visited last is kept.
    """
    hashes = {}
    for root, _, files in os.walk(folder):
        for file in sorted(files):
            if file.endswith(IMAGE_EXTENSIONS):
                file_path = os.path.join(root, file)
                hashes[calculate_hash(file_path)] = file_path
    return hashes


# Dictionaries storing the image hashes of both folders
# NOTE(review): because each hash keeps a single path, images that are duplicated
# within one folder are represented by only one of their files — confirm this is intended.
hashes_folder1 = _hash_images(folder1)
hashes_folder2 = _hash_images(folder2)

# Find identical image hashes
identical_hashes = set(hashes_folder1.keys()) & set(hashes_folder2.keys())

csv_filename = "datasets/processed/nmrshiftdb2/identical_pairs_images.csv"
with open(csv_filename, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["nmredata", "withsignals"])

    # Sorted so the row order of the CSV does not change between runs
    for image_hash in sorted(identical_hashes):
        folder1_image = hashes_folder1[image_hash]
        folder2_image = hashes_folder2[image_hash]
        # Only the file name without extension (e.g. "molecule_12") is stored
        image_name1 = os.path.splitext(os.path.basename(folder1_image))[0]
        image_name2 = os.path.splitext(os.path.basename(folder2_image))[0]
        csv_writer.writerow([image_name1, image_name2])

print(f"Number of identical pairs found: {len(identical_hashes)}")
print(f"CSV file '{csv_filename}' created.")

In [7]:
# Load the image-hash matches; the file is written with the columns "nmredata" and "withsignals".
df = pd.read_csv("datasets/processed/nmrshiftdb2/identical_pairs_images.csv")

def extract_and_add_2(image_name):
    """Return the number embedded in an image name, increased by 2.

    Args:
        image_name: Name of the form ``<prefix>_<number>``, e.g. ``molecule_12``.

    Returns:
        The integer after the first underscore plus 2, or ``None`` when the
        value is not a string or does not contain such a number.
    """
    # NOTE(review): the offset of 2 presumably maps the 0-based index to the row
    # number in a spreadsheet view (header row + 1-based counting) — confirm.
    try:
        return int(image_name.split('_')[1]) + 2
    except (AttributeError, IndexError, TypeError, ValueError):
        # Not a string, no underscore, or no integer after it
        return None

# Derive the "Original row ..." column for both sources from the image names,
# then overwrite the pairs file with the extended table.
for target_column, source_column in (
    ("Original row nmredata", "nmredata"),
    ("Original row withsignals", "withsignals"),
):
    df[target_column] = df[source_column].apply(extract_and_add_2)

df.to_csv("datasets/processed/nmrshiftdb2/identical_pairs_images.csv", index=False)


In [None]:
csv_file1 = "datasets/processed/nmrshiftdb2//nmredata/nmrshiftdb2.nmredata.csv"
csv_file2 = "datasets/processed/nmrshiftdb2/withsignals/nmrshiftdb2withsignals.csv"

df1 = pd.read_csv(csv_file1)
df2 = pd.read_csv(csv_file2)

smiles1 = df1["SMILES"]
smiles2 = df2["SMILES"]

# Index the withsignals rows by SMILES once, instead of rescanning the whole
# column for every nmredata entry.
rows_by_smiles2 = {}
for row_label, smile2 in smiles2.items():
    if isinstance(smile2, str):  # empty cells are read as NaN and cannot match
        rows_by_smiles2.setdefault(smile2, []).append(row_label)

identical_pairs_count = 0
identical_pairs = []

for idx, smile1 in enumerate(smiles1):
    if not isinstance(smile1, str) or smile1 not in rows_by_smiles2:
        continue
    # Only SMILES that RDKit can parse are counted
    if Chem.MolFromSmiles(smile1):
        identical_pairs_count += 1
        # Kept as a list containing one list of row indices: this is written to
        # the CSV as text like "[[12]]", which the follow-up cell parses.
        identical_pairs.append((idx, [rows_by_smiles2[smile1]]))

print("Number of identical pairs:", identical_pairs_count)

identical_pairs_df = pd.DataFrame(identical_pairs, columns=["nmredata", "withsignals"])

identical_pairs_df.to_csv("datasets/processed/nmrshiftdb2/identical_pairs.csv", index=False)


In [13]:
# images_df: matches found via identical image files (incl. the "Original row ..." columns).
# pairs_df: matches found via identical SMILES strings.
images_df = pd.read_csv("datasets/processed/nmrshiftdb2/identical_pairs_images.csv")
pairs_df = pd.read_csv("datasets/processed/nmrshiftdb2/identical_pairs.csv")

def extract_number_withsignals(withsignals):
    """Return the integer from text of the form ``[[<digits>]]``.

    Only a single number inside the double brackets is recognised; text with
    several numbers (e.g. ``[[1, 2]]``) or without the pattern yields ``None``.
    """
    found = re.search(r'\[\[(\d+)]]', withsignals)
    return int(found.group(1)) if found else None

pairs_df["withsignals_number"] = pairs_df["withsignals"].apply(extract_number_withsignals)

# Pairs of (nmredata index, withsignals index) found via identical SMILES; both 0-based
unique_number_pairings = set(zip(pairs_df["nmredata"], pairs_df["withsignals_number"]))

# The "Original row ..." columns hold the image index plus 2, whereas the SMILES
# pairs use the plain index. The offset is removed for the comparison only; the
# output keeps the original row numbers.
ROW_OFFSET = 2

identical_pairs = []

for _, row in images_df.iterrows():
    nmredata = row["Original row nmredata"]
    withsignals = row["Original row withsignals"]
    if (nmredata - ROW_OFFSET, withsignals - ROW_OFFSET) in unique_number_pairings:
        identical_pairs.append({"nmredata": nmredata, "withsignals": withsignals})

# Explicit columns so the CSV has a header even when no pair is found
identical_pairs_df = pd.DataFrame(identical_pairs, columns=["nmredata", "withsignals"])

identical_pairs_df.to_csv("datasets/processed/nmrshiftdb2/identical_pairs_combined.csv", index=False)


Urban files: AG VB

Gekaufte Substanzen

In [None]:
# Extract the compound IDs (lines starting with "VB-" and four digits) and the
# compound names from the raw text export and store them as CSV.
input_file = "datasets/processed/urban/AG Battisti Verena P/Gekaufte Substanzen/Gekaufte Substanzen_raw.txt"
output_file = "datasets/processed/urban/AG Battisti Verena P/Gekaufte Substanzen/Gekaufte Substanzen_extracted.csv"

pattern = r"^VB-\d{4}"

with open(input_file, "r") as file:
    lines = list(file)

# Positions of all lines that start with an ID
id_positions = [position for position, text in enumerate(lines) if re.search(pattern, text)]

ids = [lines[position].strip() for position in id_positions]
# The compound name is taken from two lines above the ID line (empty if there is none)
compound_names = [
    lines[position - 2].strip() if position >= 2 else ""
    for position in id_positions
]

data = [
    {"ID": id_value, "compound name": comp_name}
    for id_value, comp_name in zip(ids, compound_names)
]

with open(output_file, "w", newline="") as csv_file:
    fieldnames = ["ID", "compound name"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for record in data:
        writer.writerow(record)

print(f"Extracted data saved to {output_file}")


VB

In [None]:
# Extract the compound IDs (lines starting with "VB-" and three digits) and the
# compound names from the raw text export and store them as CSV.
input_file = "datasets/processed/urban/AG Battisti Verena P/VB- Battisti Verena/VB_raw.txt"
output_file = "datasets/processed/urban/AG Battisti Verena P/VB- Battisti Verena/VB_extracted.csv"

pattern = r"^VB-\d{3}"

with open(input_file, "r") as file:
    lines = list(file)

# Positions of all lines that start with an ID
id_positions = [position for position, text in enumerate(lines) if re.search(pattern, text)]

ids = [lines[position].strip() for position in id_positions]
# The compound name is taken from two lines above the ID line (empty if there is none)
compound_names = [
    lines[position - 2].strip() if position >= 2 else ""
    for position in id_positions
]

data = [
    {"ID": id_value, "compound name": comp_name}
    for id_value, comp_name in zip(ids, compound_names)
]

with open(output_file, "w", newline="") as csv_file:
    fieldnames = ["ID", "compound name"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for record in data:
        writer.writerow(record)

print(f"Extracted data saved to {output_file}")


VB-JK

In [None]:
# Extract the compound IDs (lines starting with "VB-JK" and three digits) and the
# compound names from the raw text export and store them as CSV.
input_file = "datasets/processed/urban/AG Battisti Verena P/VB-JK - Kirchebner Julia/VB-JK_raw.txt"
output_file = "datasets/processed/urban/AG Battisti Verena P/VB-JK - Kirchebner Julia/VB-JK_extracted.csv"

pattern_id = r"^VB-JK\d{3}"

# Name followed by a parenthesised 1-2 digit number that may be prefixed by up to two "X"
pattern_compound = r"^(.*)\s+\(X{0,2}\d{1,2}\)$"

with open(input_file, "r") as file:
    lines = list(file)

# Positions of all lines that start with an ID
id_positions = [position for position, text in enumerate(lines) if re.search(pattern_id, text)]

ids = [lines[position].strip() for position in id_positions]

compound_names = []
for position in id_positions:
    # The name is expected two lines above the ID; without such a line, or when
    # the line does not have the expected shape, the name stays empty.
    found = re.match(pattern_compound, lines[position - 2].strip()) if position >= 2 else None
    compound_names.append(found.group(1) if found else "")

data = [
    {"ID": id_value, "compound name": comp_name}
    for id_value, comp_name in zip(ids, compound_names)
]

with open(output_file, "w", newline="") as csv_file:
    fieldnames = ["ID", "compound name"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for record in data:
        writer.writerow(record)

print(f"Extracted data saved to {output_file}")


In [None]:
def process_csv(input_file, output_file, pdf_path_template, solvent="d6DMSO", nmr_type="1H, 13C"):
    """Copy an ID/compound-name CSV and add solvent, NMR type and PDF path columns.

    Args:
        input_file: CSV with the columns ``ID`` and ``compound name``.
        output_file: CSV to write; gets the columns ``ID``, ``compound name``,
            ``Solvent``, ``NMR-Type`` and ``PDF file path``.
        pdf_path_template: Path in which every ``XXX`` is replaced by the part of
            the ID between its first and second hyphen (``VB-0012`` -> ``0012``).
        solvent: Value written to the ``Solvent`` column of every row.
        nmr_type: Value written to the ``NMR-Type`` column of every row.

    Raises:
        KeyError: If the input lacks the ``ID`` or ``compound name`` column.
        IndexError: If an ID contains no hyphen.
    """
    # newline="" lets the csv module handle line endings inside quoted fields itself
    with open(input_file, "r", newline="") as csv_file:
        entries = [(row["ID"], row["compound name"]) for row in csv.DictReader(csv_file)]

    updated_data = [
        {
            "ID": compound_id,
            "compound name": compound_name,
            "Solvent": solvent,
            "NMR-Type": nmr_type,
            "PDF file path": pdf_path_template.replace("XXX", compound_id.split('-')[1]),
        }
        for compound_id, compound_name in entries
    ]

    with open(output_file, "w", newline="") as csv_file:
        fieldnames = ["ID", "compound name", "Solvent", "NMR-Type", "PDF file path"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(updated_data)

    print(f"Additional columns added to {output_file}")

# (input CSV, output CSV, PDF path template) for every folder of AG Battisti.
# Each entry reads the "<name>_extracted.csv" produced by the extraction cells and
# writes "<name>_processed.csv", which the SMILES and cleaning cells read later.
# The first two entries previously read "<name>_raw.csv" and wrote to
# "<name>_extracted.csv"; only "<name>_raw.txt" is used elsewhere in this notebook.
files_info = [
    ("datasets/processed/urban/AG Battisti Verena P/Gekaufte Substanzen/Gekaufte Substanzen_extracted.csv",
     "datasets/processed/urban/AG Battisti Verena P/Gekaufte Substanzen/Gekaufte Substanzen_processed.csv",
     "datasets/raw/urban/Sicherung Ausw 2023-08-24/AG Battisti Verena/Gekaufte Substanzen/VB-XXX.pdf"),
    ("datasets/processed/urban/AG Battisti Verena P/VB- Battisti Verena/VB_extracted.csv",
     "datasets/processed/urban/AG Battisti Verena P/VB- Battisti Verena/VB_processed.csv",
     "datasets/raw/urban/Sicherung Ausw 2023-08-24/AG Battisti Verena/VB- Battisti Verena/VB-XXX.pdf"),
    ("datasets/processed/urban/AG Battisti Verena P/VB-JK - Kirchebner Julia/VB-JK_extracted.csv",
     "datasets/processed/urban/AG Battisti Verena P/VB-JK - Kirchebner Julia/VB-JK_processed.csv",
     "datasets/raw/urban/Sicherung Ausw 2023-08-24/AG Battisti Verena/VB-JK - Kirchebner Julia/VB-XXX.pdf"),
]

for input_file, output_file, pdf_template in files_info:
    process_csv(input_file, output_file, pdf_template)

AG Lubec Gert

MK - Kirchhofer Michael, PN - Neill Philip, SB - Bittner Stefan 

In [None]:
# Extract the compound IDs (lines starting with "MK" and three digits) and the
# compound names from the raw text export and store them as CSV.
input_file = "datasets/processed/urban/AG Lubec Gert P/MK - Kirchhofer Michael/MK_raw.txt"
output_file = "datasets/processed/urban/AG Lubec Gert P/MK - Kirchhofer Michael/MK_extracted.csv"

ids = []
compound_names = []

pattern_id = r"^MK\d{3}[\w\d-]*"

pattern_compound = r"^(.*)$"

with open(input_file, "r") as file:
    lines = file.readlines()

for i, line in enumerate(lines):
    if not re.search(pattern_id, line):
        continue
    ids.append(line.strip())

    # Exactly one name is appended per ID (empty if none is available), so that
    # `ids` and `compound_names` stay aligned when they are zipped below.
    compound_name = ""
    if i >= 2:
        # The compound name is taken from two lines above the ID line
        match = re.match(pattern_compound, lines[i - 2].strip())
        if match:
            compound_name = match.group(1)
    compound_names.append(compound_name)

data = [{"ID": id_value, "compound name": comp_name} for id_value, comp_name in zip(ids, compound_names)]

with open(output_file, "w", newline="") as csv_file:
    fieldnames = ["ID", "compound name"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print(f"Extracted data saved to {output_file}")

In [None]:
# Extract the compound IDs (lines starting with "PN" and three digits) and the
# compound names from the raw text export and store them as CSV.
input_file = "datasets/processed/urban/AG Lubec Gert P/PN - Neill Philip/PN_raw.txt"
output_file = "datasets/processed/urban/AG Lubec Gert P/PN - Neill Philip/PN_extracted.csv"

ids = []
compound_names = []

pattern_id = r"^PN\d{3}[\w\d-]*"

pattern_compound = r"^(.*)$"

with open(input_file, "r") as file:
    lines = file.readlines()

for i, line in enumerate(lines):
    if not re.search(pattern_id, line):
        continue
    ids.append(line.strip())

    # Exactly one name is appended per ID (empty if none is available), so that
    # `ids` and `compound_names` stay aligned when they are zipped below.
    compound_name = ""
    if i >= 2:
        # The compound name is taken from two lines above the ID line
        match = re.match(pattern_compound, lines[i - 2].strip())
        if match:
            compound_name = match.group(1)
    compound_names.append(compound_name)

data = [{"ID": id_value, "compound name": comp_name} for id_value, comp_name in zip(ids, compound_names)]

with open(output_file, "w", newline="") as csv_file:
    fieldnames = ["ID", "compound name"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print(f"Extracted data saved to {output_file}")

In [None]:
# Extract the compound IDs (lines starting with "SB" and three digits) and the
# compound names from the raw text export and store them as CSV.
input_file = "datasets/processed/urban/AG Lubec Gert P/SB - Bittner Stefan/SB_raw.txt"
output_file = "datasets/processed/urban/AG Lubec Gert P/SB - Bittner Stefan/SB_extracted.csv"

ids = []
compound_names = []

pattern_id = r"^SB\d{3}[\w\d-]*"

pattern_compound = r"^(.*)$"

with open(input_file, "r") as file:
    lines = file.readlines()

for i, line in enumerate(lines):
    if not re.search(pattern_id, line):
        continue
    ids.append(line.strip())

    # Exactly one name is appended per ID (empty if none is available), so that
    # `ids` and `compound_names` stay aligned when they are zipped below.
    compound_name = ""
    if i >= 2:
        # The compound name is taken from two lines above the ID line
        match = re.match(pattern_compound, lines[i - 2].strip())
        if match:
            compound_name = match.group(1)
    compound_names.append(compound_name)

data = [{"ID": id_value, "compound name": comp_name} for id_value, comp_name in zip(ids, compound_names)]

with open(output_file, "w", newline="") as csv_file:
    fieldnames = ["ID", "compound name"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print(f"Extracted data saved to {output_file}")

In [None]:
# NOTE(review): a function with the same name but a different default solvent and
# template syntax is defined in an earlier cell; this definition replaces it.
def process_csv(input_file, output_file, pdf_path_template, solvent="cdcl3", nmr_type="1H, 13C"):
    """Copy an ID/compound-name CSV and add solvent, NMR type and PDF path columns.

    Args:
        input_file: CSV with the columns ``ID`` and ``compound name``.
        output_file: CSV to write; gets the columns ``ID``, ``compound name``,
            ``Solvent``, ``NMR-Type`` and ``PDF file path``.
        pdf_path_template: ``str.format`` template with an ``{ID}`` placeholder
            that is filled with the full ID of each row.
        solvent: Value written to the ``Solvent`` column of every row.
        nmr_type: Value written to the ``NMR-Type`` column of every row.

    Raises:
        KeyError: If the input lacks the ``ID`` or ``compound name`` column.
    """
    # Read the input CSV file
    # newline="" lets the csv module handle line endings inside quoted fields itself
    with open(input_file, "r", newline="") as csv_file:
        entries = [(row["ID"], row["compound name"]) for row in csv.DictReader(csv_file)]

    # Prepare updated data with additional columns
    updated_data = [
        {
            "ID": compound_id,
            "compound name": compound_name,
            "Solvent": solvent,
            "NMR-Type": nmr_type,
            "PDF file path": pdf_path_template.format(ID=compound_id),
        }
        for compound_id, compound_name in entries
    ]

    # Write the updated data to the output CSV file
    with open(output_file, "w", newline="") as csv_file:
        fieldnames = ["ID", "compound name", "Solvent", "NMR-Type", "PDF file path"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(updated_data)

    print(f"Additional columns added to {output_file}")

# Define file paths and templates: (input CSV, output CSV, PDF path template)
files_info = [
    ("datasets/processed/urban/AG Lubec Gert P/MK - Kirchhofer Michael/MK_extracted.csv",
     "datasets/processed/urban/AG Lubec Gert P/MK - Kirchhofer Michael/MK_processed.csv",
     "datasets/raw/urban/Sicherung Ausw 2023-08-24/AG Lubec Gert/MK - Kirchhofer Michael/{ID}.pdf"),
    ("datasets/processed/urban/AG Lubec Gert P/PN - Neill Philip/PN_extracted.csv",
     "datasets/processed/urban/AG Lubec Gert P/PN - Neill Philip/PN_processed.csv",
     "datasets/raw/urban/Sicherung Ausw 2023-08-24/AG Lubec Gert/PN - Neill Philip/{ID}.pdf"),
    ("datasets/processed/urban/AG Lubec Gert P/SB - Bittner Stefan/SB_extracted.csv",
     "datasets/processed/urban/AG Lubec Gert P/SB - Bittner Stefan/SB_processed.csv",
     "datasets/raw/urban/Sicherung Ausw 2023-08-24/AG Lubec Gert/SB - Bittner Stefan/{ID}.pdf"),
]

# Process each pair of files
for input_file, output_file, pdf_template in files_info:
    process_csv(input_file, output_file, pdf_template)


Generating SMILES strings from IUPAC names in CSV files

In [None]:
#Code for generating the SMILES strings from IUPAC names

CIR_BASE_URL = "https://cactus.nci.nih.gov/chemical/structure"

# Placeholder stored in the SMILES column when no structure could be retrieved
SMILES_UNAVAILABLE = "Unable to retrieve SMILES"

# List of CSV files containing IUPAC names in column "compound name"
INPUT_CSV_FILES = [
    "datasets/processed/urban/AG Battisti Verena P/Gekaufte Substanzen/Gekaufte Substanzen_processed.csv",
    "datasets/processed/urban/AG Battisti Verena P/VB- Battisti Verena/VB_processed.csv",
    "datasets/processed/urban/AG Battisti Verena P/VB-JK - Kirchebner Julia/VB-JK_processed.csv",
    "datasets/processed/urban/AG Lubec Gert P/MK - Kirchhofer Michael/MK_processed.csv",
    "datasets/processed/urban/AG Lubec Gert P/PN - Neill Philip/PN_processed.csv",
    "datasets/processed/urban/AG Lubec Gert P/SB - Bittner Stefan/SB_processed.csv",
]

for input_csv_file in INPUT_CSV_FILES:
    df = pd.read_csv(input_csv_file)

    smiles_list = []

    for iupac_name in df["compound name"]:
        # Empty names are read back as NaN; there is nothing to look up
        if not isinstance(iupac_name, str) or not iupac_name.strip():
            smiles_list.append(SMILES_UNAVAILABLE)
            print(f"Failed to retrieve SMILES for {iupac_name}")
            continue

        # Percent-encode the name: characters such as "/", "#" or "?" would
        # otherwise be interpreted as URL syntax instead of part of the name.
        encoded_name = quote(iupac_name, safe="")
        try:
            response = requests.get(f"{CIR_BASE_URL}/{encoded_name}/smiles", timeout=300)
            if response.status_code == 200:
                # Strip surrounding whitespace/newlines from the response body
                smiles_list.append(response.text.strip())
            else:
                smiles_list.append(SMILES_UNAVAILABLE)
                print(f"Failed to retrieve SMILES for {iupac_name}")
        except requests.exceptions.RequestException as e:
            # In case CACTUS is down
            smiles_list.append(SMILES_UNAVAILABLE)
            print(f"Server down or connection unsuccessful for compound: {iupac_name}, Error: {e}")

    # The file is overwritten in place, so on a re-run the column already exists;
    # drop it first because DataFrame.insert refuses duplicate column names.
    if "SMILES" in df.columns:
        df = df.drop(columns="SMILES")
    df.insert(df.columns.get_loc("compound name") + 1, "SMILES", smiles_list)

    df.to_csv(input_csv_file, index=False)

    print(f"SMILES added to {input_csv_file}")


Cleaning: Urban files

In [None]:
# Report every row whose "PDF file path" does not exist on disk (nothing is modified).
csv_files = [
    "datasets/processed/urban/AG Battisti Verena P/Gekaufte Substanzen/Gekaufte Substanzen_processed.csv",
    "datasets/processed/urban/AG Battisti Verena P/VB- Battisti Verena/VB_processed.csv",
    "datasets/processed/urban/AG Battisti Verena P/VB-JK - Kirchebner Julia/VB-JK_processed.csv",
    "datasets/processed/urban/AG Lubec Gert P/MK - Kirchhofer Michael/MK_processed.csv",
    "datasets/processed/urban/AG Lubec Gert P/PN - Neill Philip/PN_processed.csv",
    "datasets/processed/urban/AG Lubec Gert P/SB - Bittner Stefan/SB_processed.csv",
]

for csv_file in csv_files:
    df = pd.read_csv(csv_file)

    missing_row_labels = []
    for index, row in df.iterrows():
        pdf_path = row['PDF file path']
        if os.path.exists(pdf_path):
            continue

        print(f"CSV File: {csv_file}")
        print(f"PDF file path does not exist: {pdf_path}")
        print(f"Row {index + 1}: {row}")
        missing_row_labels.append(index)

    nonexistent_pdf_count = len(missing_row_labels)
    print(f"Total rows with nonexistent PDFs in {csv_file}: {nonexistent_pdf_count}")


In [None]:
# Write a "<name>_cleaned.csv" copy of each file that keeps only the rows whose
# "PDF file path" exists on disk; dropped rows are reported.
csv_files = [
    "datasets/processed/urban/AG Battisti Verena P/Gekaufte Substanzen/Gekaufte Substanzen_processed.csv",
    "datasets/processed/urban/AG Battisti Verena P/VB- Battisti Verena/VB_processed.csv",
    "datasets/processed/urban/AG Battisti Verena P/VB-JK - Kirchebner Julia/VB-JK_processed.csv",
    "datasets/processed/urban/AG Lubec Gert P/PN - Neill Philip/PN_processed.csv",
]

for csv_file in csv_files:
    df = pd.read_csv(csv_file)

    nonexistent_pdf_count = 0
    cleaned_rows = []

    for index, row in df.iterrows():
        pdf_path = row['PDF file path']

        if os.path.exists(pdf_path):
            # PDF is present: keep the row
            cleaned_rows.append(row)
            continue

        print(f"CSV File: {csv_file}")
        print(f"PDF file path does not exist: {pdf_path}")
        print(f"Row {index + 1}: {row}")
        nonexistent_pdf_count += 1

    print(f"Total rows with nonexistent PDFs in {csv_file}: {nonexistent_pdf_count}")

    cleaned_csv_file = csv_file.replace('.csv', '_cleaned.csv')
    cleaned_df = pd.DataFrame(cleaned_rows, columns=df.columns)
    cleaned_df.to_csv(cleaned_csv_file, index=False)
    print(f"Cleaned CSV saved as {cleaned_csv_file}")


Combined CSVs

In [None]:
# All CSV files that are merged into the combined table:
# the eight iNEXT plates first, then the urban and nmrshiftdb2 files.
input_files = [
    f"datasets/processed/orts/iNEXT_Lib_1D/iNEXTG2-Plate-{plate:02d}.csv"
    for plate in range(1, 9)
] + [
    "datasets/processed/urban/AG Battisti Verena P/Gekaufte Substanzen/Gekaufte Substanzen_processed_cleaned.csv",
    "datasets/processed/urban/AG Battisti Verena P/VB- Battisti Verena/VB_processed_cleaned.csv",
    "datasets/processed/urban/AG Battisti Verena P/VB-JK - Kirchebner Julia/VB-JK_processed_cleaned.csv",
    "datasets/processed/urban/AG Lubec Gert P/MK - Kirchhofer Michael/MK_processed.csv",
    "datasets/processed/urban/AG Lubec Gert P/PN - Neill Philip/PN_processed_cleaned.csv",
    "datasets/processed/urban/AG Lubec Gert P/SB - Bittner Stefan/SB_processed.csv",
    "datasets/processed/nmrshiftdb2/nmredata/nmrshiftdb2.nmredata.csv",
    "datasets/processed/nmrshiftdb2/withsignals/nmrshiftdb2withsignals.csv",
]

output_folder = "datasets/combined/"

# Highest running number handed out so far, per ID prefix
last_assigned_id = dict()

# One DataFrame per input file, concatenated further down
dfs = list()

def generate_unique_id(row_index, folder_prefix):
    """Return the next ID for ``folder_prefix``: the prefix plus a 5-digit running number.

    The counter of each prefix is kept in the module-level dict
    ``last_assigned_id`` and starts at 1. ``row_index`` is accepted but not used.
    """
    next_number = last_assigned_id.get(folder_prefix, 0) + 1
    last_assigned_id[folder_prefix] = next_number
    return folder_prefix + str(next_number).zfill(5)

# NOTE(review): `folder_prefixes` (folder name -> ID prefix) is not defined in any
# visible cell of this notebook; it has to exist before this cell runs — confirm
# where it is supposed to come from.
for file_path in input_files:
    if not file_path.endswith(".csv"):
        continue

    folder, filename = os.path.split(file_path)
    # Normalise first so that "/"-separated paths are split correctly on every OS
    folder_parts = os.path.normpath(folder).split(os.path.sep)

    # Reset per file: a path without a known folder must not silently reuse the
    # prefix of the previously processed file.
    folder_prefix = None
    for part in folder_parts:
        if part in folder_prefixes:
            folder_prefix = folder_prefixes[part]
            break
    if folder_prefix is None:
        raise ValueError(f"No ID prefix configured for any folder in path: {file_path}")

    df = pd.read_csv(file_path)

    df['Unique ID'] = [generate_unique_id(i, folder_prefix) for i in range(1, len(df) + 1)]

    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True, sort=False)

# Move the 'Unique ID' column to the front
combined_df = combined_df[['Unique ID'] + [col for col in combined_df.columns if col != 'Unique ID']]

os.makedirs(output_folder, exist_ok=True)
combined_csv_path = os.path.join(output_folder, "combined_all.csv")
combined_df.to_csv(combined_csv_path, index=False)


In [13]:
# Reduce the combined table to the columns needed downstream and save it
# next to the full combined CSV.
columns_to_keep = [
    "Unique ID",
    "ID",
    "Solvent",
    "SMILES",
    "NMR-Type",
    "SDF file path",
    "PDF file path",
    "Image folder location",
]

extracted_csv_path = os.path.join(output_folder, "combined_extracted.csv")

extracted_df = combined_df.loc[:, columns_to_keep]
extracted_df.to_csv(extracted_csv_path, index=False)