In [1]:
import pandas as pd

### Using SIRIUS predictions for custom database for matching in mzmine 



In [None]:
# Load the annotation table and the quantification table.
# ⚠️ Ensure the quant file matches the batch used in SIRIUS (i.e., no isotope grouper); otherwise feature IDs will not align correctly.
# Both tables are later filtered and merged on their shared 'id' column.
annotation_list = pd.read_csv('canopus_formula_summary.csv') 
quant_list = pd.read_csv('SIRIUS-quant-file.csv') 


In [4]:
# Keep only the quantified features whose 'id' also appears in the annotation table
filtered_feature_list = quant_list.loc[quant_list['id'].isin(annotation_list['id'])]


In [None]:
# Join the annotated quant rows with their annotations; 'id' is the shared key
merged_df = filtered_feature_list.merge(annotation_list, on='id')

# Persist the merged table so that the following cells can start from disk
merged_df.to_csv('SIRIUS-based-databse.csv', index=False)


#### Transform the list so it contains only rows classified as Terpenoid alkaloids within NPC_class

In [8]:
# Re-load the merged table from disk; the file name (including its spelling)
# must match the one written by the merge cell above.
filtered_database='SIRIUS-based-databse.csv'
data = pd.read_csv(filtered_database)


In [None]:
# Keep only the rows whose 'NPC_class' value is exactly 'Terpenoid alkaloids'
filtered_data = data.loc[data['NPC_class'].eq('Terpenoid alkaloids')]

# Write the class-restricted table; later cells read it back under this name
filtered_file_path = 'NPC_terpenoid_alkaloids_db.csv'
filtered_data.to_csv(filtered_file_path, index=False)


### Clean the column names from the mzmine output files

In [None]:
# Load data
annotated_alkaloids_features = pd.read_csv('NPC_terpenoid_alkaloids_db.csv')  # terpenoid-alkaloid subset written by the NPC_class filter above
mzmine_quant= pd.read_csv('mzmine-quant-file.csv')  # mzmine feature table; NOTE(review): the matching step later reads 'mz' and 'rt' from it — confirm these headers exist


In [None]:
# Function to clean column names so there wouldn't be issues in the final table, where headers can be replaced by numbers
def clean_column_names(df):
    """Normalise the column names of ``df`` so they are safe to use as identifiers.

    Steps, in order:
      1. strip leading/trailing whitespace,
      2. replace each of ``: . - ( ) [ ] { } / \\`` with ``_``,
      3. replace every run of remaining whitespace with a single ``_``.

    Stripping happens first on purpose: once whitespace has been turned into
    underscores there is nothing left for ``strip`` to remove, and padded
    headers would end up with stray leading/trailing underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are strings. The columns are renamed in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with cleaned column names.
    """
    df.columns = (
        df.columns
        .str.strip()  # must run before whitespace is converted to "_"
        .str.replace(r"[:.\-\(\)\[\]\{\}/\\]", "_", regex=True)  # special characters
        .str.replace(r"\s+", "_", regex=True)  # inner whitespace
    )
    return df

# Normalise the mzmine headers with the helper defined above
mzmine_quant = mzmine_quant.pipe(clean_column_names)

# Write the table with cleaned headers back to disk
mzmine_quant.to_csv("mzmine_quant_cleaned.csv", index=False)


Index(['id', 'area', 'rt', 'mz_range_min', 'mz_range_max', 'fragment_scans',
       'alignment_scores_rate', 'alignment_scores_aligned_features_n',
       'alignment_scores_align_extra_features',
       'alignment_scores_weighted_distance_score',
       ...
       'S6_EtNH2_1uL_mzML_rt_ms2_apex_distance', 'S6_EtNH2_1uL_mzML_fwhm',
       'S6_EtNH2_1uL_mzML_rt_range_min', 'S6_EtNH2_1uL_mzML_rt_range_max',
       'S6_EtNH2_1uL_mzML_mz', 'S6_EtNH2_1uL_mzML_intensity_range_min',
       'S6_EtNH2_1uL_mzML_intensity_range_max',
       'S6_EtNH2_1uL_mzML_asymmetry_factor',
       'S6_EtNH2_1uL_mzML_tailing_factor', 'S6_EtNH2_1uL_mzML_height'],
      dtype='object', length=164)


In [None]:
df = pd.read_csv("mzmine_quant_cleaned.csv")

# Write the header template only when it does not exist yet. The file is meant
# to be edited by hand (delete the columns that are not of interest); opening
# it with "w" on every run would silently overwrite that curated list.
# Delete column_names.txt to regenerate the full template.
try:
    with open("column_names.txt", "x") as f:
        f.writelines(f"{col}\n" for col in df.columns)
except FileExistsError:
    print("column_names.txt already exists; keeping the existing column list.")

In [None]:
### column_names.txt was edited by hand to keep only the mzmine_quant columns of
### interest; this cell applies that selection to create a cleaner output file.

# Re-load the cleaned table explicitly so the cell does not depend on whichever
# frame the shared name `df` was last bound to.
df = pd.read_csv("mzmine_quant_cleaned.csv")

# Load the column names from the text file, ignoring blank lines
# (a trailing empty line would otherwise be treated as a column called "").
with open("column_names.txt", "r") as f:
    selected_columns = [line.strip() for line in f if line.strip()]

# Fail with an explicit message when the list and the table are out of sync
missing_columns = [col for col in selected_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Columns listed in column_names.txt but missing from mzmine_quant_cleaned.csv: {missing_columns}"
    )

# Keep only the selected columns
filtered_df = df[selected_columns]

# Save the filtered DataFrame to a new CSV
filtered_df.to_csv("mzmine_quant_filtered-columns.csv", index=False)

In [None]:
## Do the cleaning for the database as well
df = pd.read_csv("NPC_terpenoid_alkaloids_db.csv")

# Write the header template only when it does not exist yet, so that a list
# already trimmed by hand is not overwritten when the notebook is re-run.
# Delete column_names-NPC-db.txt to regenerate the full template.
try:
    with open("column_names-NPC-db.txt", "x") as f:
        f.writelines(f"{col}\n" for col in df.columns)
except FileExistsError:
    print("column_names-NPC-db.txt already exists; keeping the existing column list.")

In [None]:
# Load the database and the list of columns to keep
df = pd.read_csv("NPC_terpenoid_alkaloids_db.csv")

# Ignore blank lines: a trailing empty line would otherwise be treated as a
# column called "" and make the selection below fail.
with open("column_names-NPC-db.txt", "r") as f:
    selected_columns = [line.strip() for line in f if line.strip()]

# Fail with an explicit message when the list and the table are out of sync
missing_columns = [col for col in selected_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Columns listed in column_names-NPC-db.txt but missing from NPC_terpenoid_alkaloids_db.csv: {missing_columns}"
    )

# Keep only the selected columns
filtered_df = df[selected_columns]

# Save the filtered DataFrame to a new CSV
filtered_df.to_csv("NPC_terpenoid_alkaloids_db_filtered-columns.csv", index=False)

### Filter the NPC terpenoid alkaloids database and remove rows that aren't present in at least one replicate from each group

In [None]:

annotated_alkaloids_features = pd.read_csv('NPC_terpenoid_alkaloids_db_filtered-columns.csv')  # column-reduced terpenoid-alkaloid database; must contain the per-sample area columns named in group_info


In [None]:
# Define group info: experimental group name -> the replicate peak-area columns
# that are inspected for that group by the presence filter below.
# NOTE(review): 'S10_Ctrl' is capitalised differently from 'S11_CTRL'/'S12_CTRL';
# these strings must match the CSV headers exactly — confirm against the file.
group_info = {
    'PBS': ['S10_Ctrl_1uL.mzML:area', 'S11_CTRL_1uL.mzML:area','S12_CTRL_1uL.mzML:area'],
    'Ethanolamine': ['S4_EtNH2_1uL.mzML:area', 'S5_EtNH2_1uL.mzML:area','S6_EtNH2_1uL.mzML:area'],
    'Ethylamine': ['S13_EthylNH2_1uL.mzML:area', 'S14_EthylNH2_1uL.mzML:area','S15_EthylNH2_1uL.mzML:area']
    }

# Function to check if a compound has a value greater than 0 in at least one sample per group
def is_present_in_all_groups(row, group_info):
    """Tell whether ``row`` has a positive value in every group.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Looked up with ``row[column]`` for each sample column.
    group_info : dict
        Maps a group name to the list of sample columns belonging to it.

    Returns
    -------
    bool
        True when, for each group, at least one of its columns holds a value
        greater than 0; False as soon as one group has none. A group with an
        empty column list therefore yields False.
    """
    return all(
        any(row[sample] > 0 for sample in samples)
        for samples in group_info.values()
    )

# Function to filter compounds based on presence in all groups and collect removed rows
def filter_alkaloids(annotated_alkaloids_features, group_info):
    """Split the features into those detected in every group and the rest.

    A feature counts as detected in a group when at least one of that group's
    sample columns holds a value greater than 0 (missing values do not count).

    The selection is done with a boolean mask instead of iterating over rows
    and rebuilding frames from Series. This keeps the original column dtypes
    (row-wise iteration upcasts e.g. integer ids to float) and keeps the
    column headers even when no row is selected, so the result can always be
    written to CSV and read back.

    Parameters
    ----------
    annotated_alkaloids_features : pandas.DataFrame
        Table containing every sample column named in ``group_info``.
    group_info : dict
        Maps a group name to the list of sample columns belonging to it.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(filtered_df, removed_df)``: rows present in all groups, and rows
        missing from at least one group. Both keep the original index,
        column order and dtypes.

    Raises
    ------
    KeyError
        If a column listed in ``group_info`` is absent from the table.
    """
    keep_mask = pd.Series(True, index=annotated_alkaloids_features.index, dtype=bool)

    for sample_columns in group_info.values():
        # `> 0` evaluates to False for NaN, so missing areas count as "not detected"
        group_values = annotated_alkaloids_features[list(sample_columns)]
        keep_mask = keep_mask & (group_values > 0).any(axis=1)

    # Index with the raw boolean array so the selection is purely positional
    keep = keep_mask.to_numpy()
    filtered_df = annotated_alkaloids_features.loc[keep].copy()
    removed_df = annotated_alkaloids_features.loc[~keep].copy()

    return filtered_df, removed_df

# Run the filtering process: split the database into features detected in every
# group of group_info and features missing from at least one group
filtered_annotated_alkaloids, removed_annotated_alkaloids = filter_alkaloids(annotated_alkaloids_features, group_info)

# Save the retained features; the next section reads this file back
filtered_annotated_alkaloids.to_csv('filtered_NPC-db.csv', index=False)

# Report how many features were discarded by the presence filter
print(f"Number of removed features: {len(removed_annotated_alkaloids)}")


Number of removed features: 72


#### Further filter the list based on the NPC class probability (equal or higher than 0.6)

In [None]:
# Read the presence-filtered database written by the previous section
file_path = 'filtered_NPC-db.csv'
data = pd.read_csv(file_path)

# Retain only the rows whose 'NPC_class_Probability' is at least 0.6
filtered_data = data.loc[data['NPC_class_Probability'].ge(0.6)]

# Write the probability-filtered table to its own CSV file
filtered_file_path = 'filtered_NPC-probability-db.csv'
filtered_data.to_csv(filtered_file_path, index=False)



### Now we have a database that should only contain features that are: 
1) found in at least one sample of each condition. This ensures that we only look at compounds that are present under all conditions, because incorporation of a labelled substrate into a compound can only be claimed if that compound is seen in all conditions and shows the specific isotope incorporation in only one of them;

2)classified based on the NPC class as Terpenoid alkaloids;

3) have an NPC class probability higher than or equal to 0.6


#### Add information about the label that should be seen in labelled features

In [44]:
# Deuterium mass difference: monoisotopic mass of deuterium (2.014102) minus
# that of hydrogen (1.007825), i.e. the m/z shift per H -> D exchange for a
# singly charged ion. Kept at six decimals (the previous 1.00627 was truncated).
deuterium_mass_diff = 1.006277

In [45]:
updated_database = pd.read_csv('filtered_NPC-probability-db.csv')  # probability-filtered database; its 'mz' column is used below to derive the labelled m/z values


In [46]:
# Calculate the expected m/z of the isotopologues carrying 3, 4 and 5 deuterium
# atoms and add them as the new columns 'M+3D', 'M+4D' and 'M+5D'
for n_deuterium in (3, 4, 5):
    updated_database[f'M+{n_deuterium}D'] = updated_database['mz'] + n_deuterium * deuterium_mass_diff

# index=False: without it the row index is written as an extra unnamed column,
# which would be read back as 'Unnamed: 0' and carried into the final table
updated_database.to_csv('updated_database.csv', index=False)

In [47]:
# Function to calculate ppm difference
def ppm_difference(mz1, mz2):
    """Return the absolute difference between two m/z values in ppm of ``mz1``.

    Parameters
    ----------
    mz1 : float
        Reference m/z; the difference is expressed relative to this value.
    mz2 : float
        m/z value compared against ``mz1``.

    Returns
    -------
    float
        ``|mz1 - mz2| / mz1 * 1e6``.
    """
    absolute_delta = abs(mz1 - mz2)
    return absolute_delta / mz1 * 1e6

In [48]:
# Load the CSV file
file_path = "updated_database.csv"
df = pd.read_csv(file_path)

# Drop a stray index column ('Unnamed: N', possibly already prefixed) that is
# present when the file was written without index=False
df = df.loc[:, ~df.columns.str.match(r"(?:SIRIUS_)?Unnamed: \d+$")]

# Add "SIRIUS_" as a prefix to all column names. Because this cell overwrites
# the file it reads, headers that already carry the prefix are left untouched,
# so re-running the cell does not produce 'SIRIUS_SIRIUS_...' names.
prefix = "SIRIUS_"
df.columns = [col if col.startswith(prefix) else prefix + col for col in df.columns]

# Save the modified DataFrame back to the same CSV file
output_path = "updated_database.csv"
df.to_csv(output_path, index=False)

print(f"Updated CSV saved to: {output_path}")

Updated CSV saved to: updated_database.csv


In [None]:
# Re-load the database so that `updated_database` carries the 'SIRIUS_'-prefixed headers used by the matching function
updated_database = pd.read_csv('updated_database.csv') 


In [None]:
# Column-reduced mzmine feature table; the matching function reads its 'mz' and 'rt' columns
mzmine_quant=pd.read_csv("mzmine_quant_filtered-columns.csv")

In [None]:
# Function to find matches
def find_deuterium_matches(updated_database, mzmine_quant, retention_tolerance=0.05, ppm_tolerance=5):
    """Pair database entries with mzmine features that sit at a labelled m/z.

    Every row of ``updated_database`` is compared with every row of
    ``mzmine_quant``. A pair is reported once per labelled-m/z column
    ('SIRIUS_M+3D', 'SIRIUS_M+4D', 'SIRIUS_M+5D') for which both hold:

    * the retention times differ by at most ``retention_tolerance``;
    * ``ppm_difference(feature m/z, labelled m/z)`` is at most ``ppm_tolerance``.

    Parameters
    ----------
    updated_database : pandas.DataFrame
        Needs 'SIRIUS_rt', 'SIRIUS_mz', 'SIRIUS_id' and the three labelled-m/z
        columns listed above.
    mzmine_quant : pandas.DataFrame
        Needs columns reachable as ``.mz`` and ``.rt`` on its ``itertuples()``
        rows.
    retention_tolerance : float, default 0.05
        Maximum absolute retention-time difference (same unit as the rt columns).
    ppm_tolerance : float, default 5
        Maximum m/z deviation in ppm.

    Returns
    -------
    pandas.DataFrame
        One row per match holding the summary fields, then all database
        columns, then all fields of the feature tuple (including 'Index').
        Empty when nothing matches.
    """
    labelled_mz_columns = ('SIRIUS_M+3D', 'SIRIUS_M+4D', 'SIRIUS_M+5D')
    collected = []

    for _, alkaloid in updated_database.iterrows():
        rt = alkaloid['SIRIUS_rt']
        original_mz = alkaloid['SIRIUS_mz']
        alkaloid_id = alkaloid['SIRIUS_id']

        for feature in mzmine_quant.itertuples():
            feature_mz = feature.mz
            feature_rt = feature.rt

            for mod_column in labelled_mz_columns:
                mod_mz = alkaloid[mod_column]

                # Guard clause: skip unless both the RT and the ppm criteria hold
                rt_ok = abs(rt - feature_rt) <= retention_tolerance
                if not (rt_ok and ppm_difference(feature_mz, mod_mz) <= ppm_tolerance):
                    continue

                # Summary fields first, then every database column, then every
                # feature field; later entries overwrite equal keys
                collected.append({
                    'Modification': mod_column,
                    'Original m/z': original_mz,
                    'Modified m/z': mod_mz,
                    'Feature m/z': feature_mz,
                    'Feature RT': feature_rt,
                    'Original SIRIUS ID' : alkaloid_id,
                    **alkaloid.to_dict(),
                    **feature._asdict(),
                })

    return pd.DataFrame(collected)


In [None]:
# Find matches using the function's default tolerances (retention_tolerance=0.05, ppm_tolerance=5)
result_df = find_deuterium_matches(updated_database, mzmine_quant)


In [None]:
# Final csv table with a list of terpenoid alkaloids that show deuterium incorporation.
# index=False keeps the output consistent with the other result files in this
# notebook and avoids writing the meaningless row counter as an unnamed column.
result_df.to_csv("labelled-metabolites-with-clean-columns.csv", index=False)
