In [1]:
import pandas as pd

In [2]:
# Read the tab-separated MassSpecGym data file into a DataFrame
massspecgym_tsv_path = "MassSpecGym_data.tsv"
df = pd.read_csv(massspecgym_tsv_path, sep="\t")

In [3]:
# Starting by removing all spectra which contain less than 3 peaks.
MIN_PEAKS = 3  # a spectrum needs at least this many m/z values to be kept


def _parse_peak_string(peak_string):
    """Turn a comma-separated string of numbers into a list of floats."""
    return [float(value.strip()) for value in str(peak_string).split(",")]


def _format_peak_list(peak_values):
    """Join a list of numbers back into a ", "-separated string."""
    return ", ".join(map(str, peak_values))


# Converting the string representation of the spectra to lists
df["mzs"] = df["mzs"].apply(_parse_peak_string)
df["intensities"] = df["intensities"].apply(_parse_peak_string)

print("Total entries:", len(df))
# .copy() makes the filtered frame independent of the original one. Without it,
# the column assignments below (and the columns added in later cells) are made
# on a slice of the unfiltered frame and trigger pandas' SettingWithCopyWarning.
df = df[df["mzs"].map(len) >= MIN_PEAKS].copy()
print("Total entries kept that has 3 or more peaks:", len(df))

# Converting the lists back to string representation for the spectra.
# Note: the str -> float -> str round trip also rewrites every number in
# Python's float formatting; later cells group and compare on these strings.
df["mzs"] = df["mzs"].apply(_format_peak_list)
df["intensities"] = df["intensities"].apply(_format_peak_list)

Total entries: 231104
Total entries kept that has 3 or more peaks: 221010


# Observations in MassSpecGym dataset
1. The dataset appears to contain some duplicated entries, where only the MassSpecGym identifier differs.
2. For some entries there seems to be a mismatch between the stated precursor formula and formula plus adduct, i.e. formula + adduct != precursor_formula
3. For some entries there seems to be a mismatch between the stated masses and the masses calculated from the chemical formula. \
    3.1 Parent masses\
    3.2 Precursor masses
4. In addition to 1, the dataset seems to contain entries where the spectra match exactly (to all decimals) but the metadata differs.
5. In addition to 1 and 4, the dataset seems to contain entries where the m/z values match (to all decimals), but the intensities are slightly different.

# 1. Duplicated entries

In [4]:
# Checking for duplicated entries
# Comparing based on all columns except the identifier column.
columns_to_compare = [
    'mzs', 'intensities', 'smiles', 'inchikey', 'formula',
    'precursor_formula', 'parent_mass', 'precursor_mz', 'adduct',
    'instrument_type', 'collision_energy',
]

# keep=False flags every member of a duplicate group, not only the repeats
df["duplicated"] = df.duplicated(subset=columns_to_compare, keep=False)

print(sum(df["duplicated"]))

3438


**Total: 3,438 entries which are non-unique** (disregarding the identifier column) 

# 2. Mismatch between precursor formula, formula and adduct
formula + adduct != stated_precursor_formula

In [5]:
from collections import defaultdict
import re

# Dictionary to map the stated adducts to the element that should be added to the formula to get precursor formula
adduct_dict = { "[M+H]+": "H",
                "[M+Na]+": "Na"}


def _count_elements(formula):
    """Return a dict mapping each element symbol in ``formula`` to its summed count.

    An element is one uppercase letter optionally followed by one lowercase
    letter; a missing count is treated as 1. Repeated symbols are added up.
    """
    counts = defaultdict(int)
    for element, count in re.findall(r'([A-Z][a-z]?)(\d*)', formula):
        counts[element] += int(count) if count else 1
    return counts


# Function to check if the calculated precursor formula based on the stated formula and adduct is the same as the stated precursor formula.
def sum_formula(adduct, formula, precursor_formula):
    """Check whether ``formula`` + adduct element disagrees with ``precursor_formula``.

    Parameters
    ----------
    adduct : str
        Adduct label; must be a key of ``adduct_dict``.
    formula : str
        Stated formula of the compound.
    precursor_formula : str
        Stated precursor formula to compare against.

    Returns
    -------
    bool
        True if the element counts differ (mismatch), False if they agree.

    Raises
    ------
    ValueError
        If ``adduct`` has no entry in ``adduct_dict``.
    """
    element_to_add = adduct_dict.get(adduct)
    if element_to_add is None:
        # Previously this fell through to ``formula + None`` and failed with an
        # unrelated-looking TypeError; report the actual problem instead.
        raise ValueError(f"Adduct {adduct} not found in dictionary.")

    # Appending the element symbol to the formula string is enough because
    # _count_elements sums repeated symbols (e.g. "C6H12O6" + "H" -> H: 13).
    counts_calc_precursor = _count_elements(formula + element_to_add)
    counts_stated_precursor = _count_elements(precursor_formula)

    return counts_calc_precursor != counts_stated_precursor

In [6]:
# Run sum_formula on every row to flag the entries whose formula + adduct does not match the stated precursor formula
df["precursor_formula_dont_match"] = [
    sum_formula(adduct, formula, precursor_formula)
    for adduct, formula, precursor_formula in zip(df["adduct"], df["formula"], df["precursor_formula"])
]
n_formula_mismatches = sum(df['precursor_formula_dont_match'])
print(f"Number of entries where stated formula + adduct != stated precursor formula: {n_formula_mismatches}")

Number of entries where stated formula + adduct != stated precursor formula: 51


- **51 entries where the formula+adduct does not seem to add up to the precursor formula.** This corresponds to 18 unique SMILES. 
- Examples:
    - MassSpecGymID0402338 and MassSpecGymID0397538, both having precursor formula corresponding to formula -11H.
    - MassSpecGymID0061506, the precursor formula and formula are the same. 
    - MassSpecGymID0019558, formula is charged (+) and precursor formula corresponds to +2H. 

# 3. Mismatch between calculated masses and stated masses

## 3.1 Parent. Stated parent mass and calculated from formula
- Comparing based on calculated mass by element mass summation
- Comparing based on calculated exact mass from SMILES using rdkit

### Element mass summation

In [7]:
# Extracting all unique elements from all formulas and printing
def extract_elements(formula):
    """Return the element symbols found in ``formula``, in order of appearance.

    A symbol is one uppercase letter optionally followed by one lowercase
    letter. Symbols that occur several times are returned once per occurrence.
    """
    symbol_matches = re.finditer(r'([A-Z][a-z]?)', formula)
    return [match.group(1) for match in symbol_matches]

# One list of element symbols per formula
all_elements = df["formula"].apply(extract_elements)

# Flatten the per-formula lists into a single list of symbols
flat_elements = []
for symbols_in_formula in all_elements:
    flat_elements.extend(symbols_in_formula)

# De-duplicate and sort alphabetically before printing
unique_elements = sorted(set(flat_elements))
print(unique_elements)


['As', 'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O', 'P', 'S', 'Se', 'Si']


In [8]:
# Creating a dictionary for exact masses for all elements in MassSpecGym dataset
# Masses taken from MW of element from Agilent FTMS Exact Mass Calculator
# Keys are element symbols, values are the masses used in the element mass summation.
exact_mass_dict = dict(
    As=74.92160,
    B=11.00931,
    Br=78.91834,
    C=12.00000,
    Cl=34.96885,
    F=18.99840,
    H=1.00783,
    I=126.90447,
    N=14.00307,
    O=15.99491,
    P=30.97376,
    S=31.97207,
    Se=79.91652,
    Si=27.97693,
    Na=22.98977,  # Include Na for the adducts
)


In [9]:
# Function to calculate exact mass from the stated formula
def calc_exact_mass_from_formula(formula):
    """Sum the element masses of ``formula`` using ``exact_mass_dict``.

    If the formula string contains "+", one electron mass is subtracted; if it
    contains "-" (and no "+"), one electron mass is added.

    Parameters
    ----------
    formula : str
        Formula such as "C6H12O6"; element counts default to 1 when omitted.

    Returns
    -------
    float
        The summed mass.

    Raises
    ------
    ValueError
        If an element of the formula has no entry in ``exact_mass_dict``.
    """
    electron_mass = 5.485799090441e-4 # mass of electron (u)  https://physics.nist.gov/cgi-bin/cuu/Value?meu

    # Tally how many atoms of each element the formula contains;
    # repeated symbols are accumulated into one entry.
    atoms_per_element = defaultdict(int)
    for symbol, digits in re.findall(r'([A-Z][a-z]?)(\d*)', formula):
        atoms_per_element[symbol] += int(digits) if digits else 1

    # Start from the charge correction (0.0 for formulas without a charge sign)
    if "+" in formula:
        total_mass = 0.0 - electron_mass
    elif "-" in formula:
        total_mass = 0.0 + electron_mass
    else:
        total_mass = 0.0

    # Add the contribution of every element
    for symbol, n_atoms in atoms_per_element.items():
        atom_mass = exact_mass_dict.get(symbol)
        if atom_mass is None:
            raise ValueError(f"Element {symbol} not found in exact mass dictionary.")
        total_mass += atom_mass * n_atoms

    return total_mass


In [10]:
# Derive the exact mass of every entry from its stated formula and store it as a new column
df["calc_exact_mass"] = df["formula"].apply(calc_exact_mass_from_formula)

In [11]:
# Function to calculate ppm difference between stated parent mass and calculated exact mass, and compare to ppm tolerance of instrument
def ppm_diff_instrument(parent_mass, exact_mass, instrument_type):
    """Return the ppm difference between two masses and whether it exceeds the instrument tolerance.

    The difference is expressed relative to ``exact_mass``:
    ``abs((exact_mass - parent_mass) / exact_mass) * 1e6``.

    Parameters
    ----------
    parent_mass : float
        Stated mass.
    exact_mass : float
        Calculated mass; used as the reference (denominator).
    instrument_type : str or missing value
        "QTOF" or a missing value (NaN/None) -> 10 ppm tolerance;
        "Orbitrap" -> 5 ppm tolerance.

    Returns
    -------
    tuple of (float, bool)
        The ppm difference and True if it is larger than the tolerance.

    Raises
    ------
    ValueError
        If ``instrument_type`` is neither missing, "QTOF" nor "Orbitrap".
        (Previously this case printed "missing" and then failed with an
        UnboundLocalError because no tolerance had been set.)
    """
    if pd.isna(instrument_type) or instrument_type == "QTOF": # if QTOF or instrument type not stated, set 10 ppm as tolerance
        tolerance_ppm = 10
    elif instrument_type == "Orbitrap": # if Orbitrap, set 5 ppm as tolerance
        tolerance_ppm = 5
    else:
        raise ValueError(f"No ppm tolerance defined for instrument type {instrument_type!r}.")

    ppm_error = abs(((exact_mass - parent_mass) / exact_mass) * 1e6) # calculating ppm difference

    # bool(...) keeps the flag a plain Python bool even for numpy float inputs
    return ppm_error, bool(ppm_error > tolerance_ppm)


In [12]:
# Compare the stated parent mass with the exact mass calculated from the formula, row by row,
# and check the ppm difference against the instrument tolerance
parent_formula_ppm = [
    ppm_diff_instrument(stated_mass, calculated_mass, instrument)
    for stated_mass, calculated_mass, instrument in zip(df["parent_mass"], df["calc_exact_mass"], df["instrument_type"])
]
# Each result is a (ppm difference, exceeds tolerance) pair; split it into two columns
df["ppm_diff_parent_from_formula"] = [ppm for ppm, _ in parent_formula_ppm]
df["ppm_parent_higher_than_instrument_from_formula"] = [exceeds for _, exceeds in parent_formula_ppm]

n_parent_formula_outliers = sum(df["ppm_parent_higher_than_instrument_from_formula"])
print("n spectra with parent mass ppm higher than instrument:", n_parent_formula_outliers)
print("Calculations based on element exact mass summation, elemental mass taken from exact mass calculator")

n spectra with parent mass ppm higher than instrument: 26884
Calculations based on element exact mass summation, elemental mass taken from exact mass calculator


### Exact mass calculation from SMILES with rdkit

In [13]:
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Descriptors import ExactMolWt

# Function to get exact mass from SMILES using RDKit
def get_exact_mass_from_smiles(smiles):
    """Return ``ExactMolWt`` of the molecule built from ``smiles``.

    Returns None when ``MolFromSmiles`` does not yield a molecule (i.e. it
    returns None for the given SMILES).
    """
    molecule = MolFromSmiles(smiles)
    if molecule is None:
        return None
    return ExactMolWt(molecule)


In [14]:
# Exact mass of every entry, calculated from its SMILES
df["exact_mass_from_smiles"] = df["smiles"].apply(get_exact_mass_from_smiles)

# Compare the stated parent mass with the SMILES-derived exact mass, row by row,
# and check the ppm difference against the instrument tolerance
parent_smiles_ppm = [
    ppm_diff_instrument(stated_mass, calculated_mass, instrument)
    for stated_mass, calculated_mass, instrument in zip(df["parent_mass"], df["exact_mass_from_smiles"], df["instrument_type"])
]
# Each result is a (ppm difference, exceeds tolerance) pair; split it into two columns
df["ppm_diff_parent_from_smiles"] = [ppm for ppm, _ in parent_smiles_ppm]
df["ppm_parent_higher_than_instrument_from_smiles"] = [exceeds for _, exceeds in parent_smiles_ppm]

n_parent_smiles_outliers = sum(df["ppm_parent_higher_than_instrument_from_smiles"])
print("n spectra with parent mass ppm higher than instrument:", n_parent_smiles_outliers)
print("Calculated masses based on SMILES using RDKit ExactMolWt")

n spectra with parent mass ppm higher than instrument: 26918
Calculated masses based on SMILES using RDKit ExactMolWt


**Summary**
- **26,884 entries have mass differences exceeding instrumental ppm threshold.** This is for calculated exact mass (elemental mass summation) and compared to parent mass. 
- **26,918 entries have mass differences exceeding instrumental ppm threshold.** This is for calculated exact mass (rdkit calc from SMILES) and compared to parent mass. 
- Examples
    - MassSpecGymID0092282, difference of ~319 ppm
    - MassSpecGymID0056320, difference of ~70 ppm


## 3.2 Precursor. Stated precursor mass and calculated precursor mass from formula + adduct.
- Comparing based on elemental mass summation
- Comparing based on calculated exact mass from SMILES and adding mass of adduct

### Element mass summation

In [15]:
# function to get the precursor mass from stated precursor formula
def precursor_mass_check(precursor_formula):
    """Sum the element masses of ``precursor_formula`` and subtract one electron mass.

    The electron mass is always subtracted, i.e. the precursor is treated as
    singly positively charged.

    Parameters
    ----------
    precursor_formula : str
        Precursor formula; element counts default to 1 when omitted.

    Returns
    -------
    float
        The summed mass minus one electron mass.

    Raises
    ------
    ValueError
        If an element of the formula has no entry in ``exact_mass_dict``.
    """
    electron_mass = 5.485799090441e-4 # mass of electron (u)  https://physics.nist.gov/cgi-bin/cuu/Value?meu

    # Tally how many atoms of each element the precursor formula contains
    atoms_per_element = defaultdict(int)
    for symbol, digits in re.findall(r'([A-Z][a-z]?)(\d*)', precursor_formula):
        atoms_per_element[symbol] += int(digits) if digits else 1

    # Start at minus one electron mass, since all precursors are positively charged
    total_mass = -electron_mass

    # Add the contribution of every element
    for symbol, n_atoms in atoms_per_element.items():
        atom_mass = exact_mass_dict.get(symbol)
        if atom_mass is None:
            raise ValueError(f"Element {symbol} not found in exact mass dictionary.")
        total_mass += atom_mass * n_atoms

    return total_mass


In [16]:
# Calculating precursor mass from the stated precursor formula
df["precursor_calc_mass"] = df["precursor_formula"].apply(precursor_mass_check)

# Calculating ppm difference, and checking against instrument tolerance.
# ppm_diff_instrument takes the stated mass first and the calculated mass second,
# and expresses the ppm difference relative to the second argument. The arguments
# were previously passed the other way round here, so the difference was
# normalised by the stated m/z instead of the calculated mass, unlike the
# parent-mass checks.
precursor_formula_ppm = [
    ppm_diff_instrument(stated_mz, calculated_mass, instrument)
    for stated_mz, calculated_mass, instrument in zip(df["precursor_mz"], df["precursor_calc_mass"], df["instrument_type"])
]
# Each result is a (ppm difference, exceeds tolerance) pair; split it into two columns
df["ppm_diff_precursor_from_formula"] = [ppm for ppm, _ in precursor_formula_ppm]
df["ppm_precursor_higher_than_instrument_from_formula"] = [exceeds for _, exceeds in precursor_formula_ppm]

print("n spectra with precursor mass ppm higher than instrument:", sum(df["ppm_precursor_higher_than_instrument_from_formula"]))
print("Calculations based on element exact mass summation, elemental mass taken from exact mass calculator")

n spectra with precursor mass ppm higher than instrument: 26548
Calculations based on element exact mass summation, elemental mass taken from exact mass calculator


### Exact mass from SMILES + adduct mass

In [17]:
# function to add the mass of the adduct to the exact mass calculated from SMILES, for comparison to the precursor mass
def apply_mass_adduct(adduct, exact_mass_from_smiles):
    """Return ``exact_mass_from_smiles`` plus the adduct element mass minus one electron mass.

    Parameters
    ----------
    adduct : str
        Adduct label; must be a key of ``adduct_dict``.
    exact_mass_from_smiles : float
        Exact mass of the neutral molecule.

    Returns
    -------
    float
        The calculated precursor mass.

    Raises
    ------
    ValueError
        If the adduct is not in ``adduct_dict``, if its element has no mass in
        ``exact_mass_dict``, or if ``exact_mass_from_smiles`` is None. Each
        case gets its own message; the earlier blanket ``except TypeError``
        reported all three as "Adduct ... not found".
    """
    me = 5.485799090441e-4 # mass of electron (u)  https://physics.nist.gov/cgi-bin/cuu/Value?meu

    adduct_element = adduct_dict.get(adduct)
    if adduct_element is None:
        raise ValueError(f"Adduct {adduct} not found in dictionary.")

    adduct_mass = exact_mass_dict.get(adduct_element)
    if adduct_mass is None:
        raise ValueError(f"Element {adduct_element} not found in exact mass dictionary.")

    if exact_mass_from_smiles is None:
        raise ValueError("No exact mass available: exact_mass_from_smiles is None.")

    # subtracting mass of electron for precursor mass, since all is positively charged
    return exact_mass_from_smiles + adduct_mass - me

In [18]:
# get the precursor mass, from parent mass from smiles + adduct
df["precursor_calc_mass_smiles"] = [
    apply_mass_adduct(adduct, smiles_mass)
    for adduct, smiles_mass in zip(df["adduct"], df["exact_mass_from_smiles"])
]

# Calculating ppm difference, and checking against instrument tolerance.
# ppm_diff_instrument takes the stated mass first and the calculated mass second,
# and expresses the ppm difference relative to the second argument. The arguments
# were previously passed the other way round here, so the difference was
# normalised by the stated m/z instead of the calculated mass, unlike the
# parent-mass checks.
precursor_smiles_ppm = [
    ppm_diff_instrument(stated_mz, calculated_mass, instrument)
    for stated_mz, calculated_mass, instrument in zip(df["precursor_mz"], df["precursor_calc_mass_smiles"], df["instrument_type"])
]
# Each result is a (ppm difference, exceeds tolerance) pair; split it into two columns
df["ppm_diff_precursor_from_smiles"] = [ppm for ppm, _ in precursor_smiles_ppm]
df["ppm_precursor_higher_than_instrument_from_smiles"] = [exceeds for _, exceeds in precursor_smiles_ppm]

print("n spectra with precursor mass ppm higher than instrument:", sum(df["ppm_precursor_higher_than_instrument_from_smiles"]))
print("Calculated masses based on SMILES using RDKit ExactMolWt")


n spectra with precursor mass ppm higher than instrument: 26614
Calculated masses based on SMILES using RDKit ExactMolWt


- **26,548 entries have mass differences exceeding instrumental ppm.** This is for calculated exact mass + adduct(summation) and compared to precursor mass. 
- **26,614 entries have mass differences exceeding instrumental ppm.** This is for calculated exact mass + adduct (rdkit calc) and compared to precursor mass. 

- Example:
    - MassSpecGymID0198039, difference of ~18 ppm


# 4. Duplicated spectra

In [19]:
# Checking for duplicated spectra based on identical mzs and intensities

# For every (mzs, intensities) combination, collect the distinct identifiers that share it
identifiers_per_spectrum = df.groupby(['mzs', 'intensities']).agg({'identifier': lambda ids: list(set(ids))})
group_mzs_intensity = identifiers_per_spectrum.reset_index()

# Keep only the combinations that belong to more than one identifier
shared_by_several = group_mzs_intensity['identifier'].map(len) > 1
group_mzs_intensity = group_mzs_intensity[shared_by_several]
print("Number of duplicated groups:", len(group_mzs_intensity))

# Pool the identifiers of all remaining groups into one set
unique_ids = set()
for identifier_list in group_mzs_intensity['identifier']:
    unique_ids.update(identifier_list)
print("Number of non-unique spectra", len(unique_ids))

# Mark the rows whose identifier is in that set
df["non_unique_spectra"] = df["identifier"].isin(unique_ids)

Number of duplicated groups: 2487
Number of non-unique spectra 5180


- **2,487 groups that have the exact same spectra.** 
- **5,180 spectra are non-unique.**

In [20]:
# Checking for duplicated spectra based on identical mzs and intensities but different SMILES

# For every (mzs, intensities) combination, collect the distinct identifiers and the distinct SMILES
per_spectrum_smiles = df.groupby(['mzs', 'intensities']).agg({
    'identifier': lambda ids: list(set(ids)),
    "smiles": lambda smiles_values: list(set(smiles_values)),
})
group_mzs_intensity_smiles = per_spectrum_smiles.reset_index()

# Keep only the combinations that come with more than one distinct SMILES
several_smiles = group_mzs_intensity_smiles['smiles'].map(len) > 1
group_mzs_intensity_smiles = group_mzs_intensity_smiles[several_smiles]
print("number of duplicated spectra groups but with different SMILES:", len(group_mzs_intensity_smiles))

# Pool the identifiers of all remaining groups into one set and mark the matching rows
unique_ids = set()
for identifier_list in group_mzs_intensity_smiles['identifier']:
    unique_ids.update(identifier_list)
print("number of duplicated spectra entries but have different SMILES", len(unique_ids))
df["non_unique_spectra_diff_smiles"] = df["identifier"].isin(unique_ids)

number of duplicated spectra groups but with different SMILES: 567
number of duplicated spectra entries but have different SMILES 1189


- **567 groups that have the exact same spectra but different SMILES.** 
- **1,189 entries that have non-unique spectra but different SMILES.**

In [21]:
# Checking for duplicated spectra based on identical mzs and intensities but different formula

# For every (mzs, intensities) combination, collect the distinct identifiers and the distinct formulas
per_spectrum_formula = df.groupby(['mzs', 'intensities']).agg({
    'identifier': lambda ids: list(set(ids)),
    "formula": lambda formulas: list(set(formulas)),
})
group_mzs_intensity_formula = per_spectrum_formula.reset_index()

# Keep only the combinations that come with more than one distinct formula
several_formulas = group_mzs_intensity_formula['formula'].map(len) > 1
group_mzs_intensity_formula = group_mzs_intensity_formula[several_formulas]
print("number of duplicated spectra groups but with different formula:",len(group_mzs_intensity_formula))

# Pool the identifiers of all remaining groups into one set and mark the matching rows
unique_ids = set()
for identifier_list in group_mzs_intensity_formula['identifier']:
    unique_ids.update(identifier_list)
print("number of duplicated spectra entries but have different formula", len(unique_ids))
df["non_unique_spectra_diff_formula"] = df["identifier"].isin(unique_ids)

number of duplicated spectra groups but with different formula: 35
number of duplicated spectra entries but have different formula 76


- **35 groups that have the exact same spectra but different formula.** 
- **76 entries that have non-unique spectra but different formula.**

# 5. Duplicated mzs

In [22]:
# Checking for duplicated spectra based on only the mzs

# For every distinct mzs string, collect the distinct identifiers that share it
identifiers_per_mzs = df.groupby(['mzs']).agg({'identifier': lambda ids: list(set(ids))})
group_mzs = identifiers_per_mzs.reset_index()

# Keep only the mzs strings that belong to more than one identifier
shared_by_several = group_mzs['identifier'].map(len) > 1
group_mzs = group_mzs[shared_by_several]
print("Number of duplicated mzs groups:",len(group_mzs))

# Pool the identifiers of all remaining groups into one set
unique_ids = set()
for identifier_list in group_mzs['identifier']:
    unique_ids.update(identifier_list)
print("Number of non-unique mzs entries", len(unique_ids))

# Mark the rows whose identifier is in that set
df["non_unique_mzs"] = df["identifier"].isin(unique_ids)

Number of duplicated mzs groups: 27430
Number of non-unique mzs entries 56442


- **27,430 groups that have the exact same mzs values.** 
- **56,442 entries that have non-unique mzs.**

- Example:
    - MassSpecGymID0195776 and MassSpecGymID0195777 have the exact same mzs, but their intensities differ by at most 10^-9

# Summary of observations

In [23]:
# Each True/False observation column paired with the label printed next to its count.
# The order here is both the column order of the summary table and the print order.
observation_labels = {
    'duplicated':
        "n duplicated entries:",
    'precursor_formula_dont_match':
        "n entries where parent formula + adduct does not match stated precursor formula:",
    'ppm_parent_higher_than_instrument_from_formula':
        "n entries with parent mass ppm higher than instrument, mass from formula:",
    'ppm_parent_higher_than_instrument_from_smiles':
        "n entries with parent mass ppm higher than instrument, mass from SMILES:",
    'ppm_precursor_higher_than_instrument_from_formula':
        "n entries with precursor mass ppm higher than instrument, mass from precursor formula:",
    'ppm_precursor_higher_than_instrument_from_smiles':
        "n entries with precursor mass ppm higher than instrument, mass from SMILES + adduct:",
    'non_unique_spectra':
        "n non-unique spectra based on identical mzs and intensities:",
    'non_unique_mzs':
        "n non-unique spectra based on identical mzs:",
    'non_unique_spectra_diff_smiles':
        "n non-unique spectra based on identical mzs and intensities but different SMILES:",
    'non_unique_spectra_diff_formula':
        "n non-unique spectra based on identical mzs and intensities but different formula:",
}
observation_columns = list(observation_labels)

df_filtered = df.copy()

print("Total entries reviewed in dataset:", len(df_filtered))
# Selecting only the id column and the added true/false columns
df_filtered = df_filtered[['identifier'] + observation_columns]

# One count per observation column
for column, label in observation_labels.items():
    print(label, sum(df_filtered[column]))

print("")
# A row has an observation if at least one flag equals True ...
any_flag_set = df_filtered[observation_columns].eq(True).any(axis=1)
print("n entries with at least one of above observations:", len(df_filtered[any_flag_set]))

# ... and is free of observations if every flag equals False
no_flag_set = df_filtered[observation_columns].eq(False).all(axis=1)
print("n entries without any of above observations:", len(df_filtered[no_flag_set]))





Total entries reviewed in dataset: 221010
n duplicated entries: 3438
n entries where parent formula + adduct does not match stated precursor formula: 51
n entries with parent mass ppm higher than instrument, mass from formula: 26884
n entries with parent mass ppm higher than instrument, mass from SMILES: 26918
n entries with precursor mass ppm higher than instrument, mass from precursor formula: 26548
n entries with precursor mass ppm higher than instrument, mass from SMILES + adduct: 26614
n non-unique spectra based on identical mzs and intensities: 5180
n non-unique spectra based on identical mzs: 56442
n non-unique spectra based on identical mzs and intensities but different SMILES: 1189
n non-unique spectra based on identical mzs and intensities but different formula: 76

n entries with at least one of above observations: 82668
n entries without any of above observations: 138342


In [24]:
# Write the identifier plus the True/False observation columns to a CSV file, without the index
summary_csv_path = "MassSpecGym_observations_summary.csv"
df_filtered.to_csv(summary_csv_path, index=False)