# Packages

In [None]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import pandas as pd

# Setting up ElasticSearch Server

In [None]:
# Open an authenticated HTTPS connection to the local Elasticsearch node.
# The CA certificate is the one under the local install's config/certs folder.
es = Elasticsearch(
    hosts="https://localhost:9200",
    ca_certs="/Users/judepops/Documents/PathIntegrate/Code/Processing/semantic_search/elasticsearch-8.13.2/config/certs/http_ca.crt",
    basic_auth=("elastic", '{private key}'),
)

# Connectivity check; the result is shown as the cell output.
es.ping()

### Cleaning the index

In [None]:
# Best-effort removal of the 'compounds' index: any failure is reported
# and the notebook carries on.
try:
    response = es.indices.delete(index='compounds')
except Exception as e:
    print("An error occurred:", e)
else:
    print("Index deleted:", response)

# Clear the caches of every index on the node and show the server's reply.
response = es.indices.clear_cache(index='*')
print(response)

# Preparing the ChEBI database

In [None]:
# Load the ChEBI names table (tab-separated); this is the dataframe from ChEBI
# which contains 400,000 manual chebi IDs.
df = pd.read_csv('names.tsv', sep="\t")
df.head()

## Prepare the data
# Count how often each missing/non-missing pattern occurs across rows.
# `value_counts` must be *called*; without the parentheses the cell only shows
# the bound method instead of the counts.
df.isna().value_counts()

In [None]:
# Keep only the four columns used downstream.
df = df.loc[:, ['NAME', 'COMPOUND_ID', 'TYPE', 'SOURCE']]

# True when at least one cell anywhere in the frame is missing.
has_na = df.isna().any().any()

print("Has" if has_na else "None")

### Converting name to string and filling any missing with unknown

In [None]:
# Replace missing names with the literal 'unknown' and force the column to str
# so every NAME can be encoded and used as a document id later on.
df['NAME'] = df['NAME'].fillna('unknown').astype(str)

In [None]:
# Show the dtype of each remaining column as a sanity check.
print(df.dtypes)

### Making sure there are no duplicate entries: if there are, we keep the KEGG COMPOUND entry (LLM V2)

In [None]:
# creating a function to decide which entry to prioritise in the case of duplicates

def prioritise_duplicates(df):
    """Return one row per NAME, preferring KEGG COMPOUND, then ChEBI, then others.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'NAME' and 'SOURCE'. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by NAME (then source priority) holding the
        highest-priority row for each NAME. Original index labels are kept.
    """
    source_priority = {'KEGG COMPOUND': 1, 'ChEBI': 2}
    # Any source not listed above gets the lowest priority (3).
    priority = df['SOURCE'].map(source_priority).fillna(3)

    # `assign` works on a copy, so the caller's frame keeps its columns and
    # row order (the previous in-place version leaked a SOURCE_PRIORITY column
    # into the input and re-sorted it).
    ranked = df.assign(SOURCE_PRIORITY=priority).sort_values(
        by=['NAME', 'SOURCE_PRIORITY']
    )
    deduplicated = ranked.drop_duplicates(subset='NAME', keep='first')
    return deduplicated.drop(columns=['SOURCE_PRIORITY'])

# Apply the function: rebind df to the de-duplicated frame (one row per NAME).
df = prioritise_duplicates(df)

### Removing any compound names that are longer than 100 bytes - these are extremely long compound names that simply don't exist in our data, so there is no point indexing them

In [None]:
# UTF-8 encoded length of every name, in bytes (not characters).
byte_sizes = df['NAME'].map(lambda name: len(str(name).encode('utf-8')))
# Keep only names that fit in 100 bytes.
df = df.loc[byte_sizes.le(100)]

# Now, importing the pre-trained BERT model and converting the Name column into vectors

In [None]:
# Load the pre-trained sentence-embedding model by name. The index mapping in
# this notebook declares 768-dimensional vectors for its output.
model = SentenceTransformer("all-mpnet-base-v2")

### Encoding each name into vector representation and saving it in vector column - this takes around 7 hours

In [None]:
# Encode all names in one batched call instead of one model invocation per row;
# per-row `apply` is what made this step take hours. `encode` on a list returns
# one embedding per input, in input order, so `list(...)` yields one vector per
# row. Values may differ from per-row encoding only by floating-point noise.
name_vectors = model.encode(df['NAME'].tolist(), show_progress_bar=True)
df['NAME_VECTOR'] = list(name_vectors)

### saving file to pkl

In [None]:
# Persist the whole dataframe, including the NAME_VECTOR column (each cell is
# an array of numbers), as a pickle file in the working directory.
df.to_pickle('KEGG_ChEBI_vectors.pkl')

# Creating a new index in the ElasticSearch server

In [None]:
# Delete the current index so nothing is left over before it is re-created.
# Best-effort: a failure is printed and execution continues.
try:
    response = es.indices.delete(index='compounds')
except Exception as e:
    print(" error :", e)
else:
    print(" deleted:", response)

### An index mapping file is used for the index

In [None]:
# Create the 'compounds' index using the mapping defined in indexMapping.py,
# which must sit in the same directory as this notebook.
from indexMapping import indexMapping
es.indices.create(index="compounds", mappings=indexMapping)

In [None]:
# Index mapping (shown here for visualisation purposes).
# NAME_VECTOR is the only vector field: 768 dimensions, indexed, compared with
# the l2_norm similarity.
indexMapping = {
    "properties": {
        "COMPOUND_ID": {"type": "long"},
        "TYPE": {"type": "text"},
        "SOURCE": {"type": "text"},
        "NAME_VECTOR": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "l2_norm",
        },
    },
}


### converting the original dataframe to a dictionary file

In [None]:
df_dict = df.to_dict("records") # "records" orientation: a list with one {column: value} dict per row


In [None]:
# Display the list of row dictionaries as the cell output.
df_dict

### function to print the index mapping we added to Elasticsearch, just to make sure

In [None]:
def print_index_mapping(index_name):
    """Print the 'mappings' section Elasticsearch reports for ``index_name``.

    Uses the notebook-level ``es`` client. Any exception is caught and printed
    as ``error <exception>`` rather than raised. Returns None.
    """
    try:
        index_info = es.indices.get_mapping(index=index_name)[index_name]
        print(index_info['mappings'])
    except Exception as e:
        print("error", e)


print_index_mapping("compounds")

### now, ingest each df_dict entry into Elasticsearch, storing them under their name (compound names) - this takes around 8 hours

In [None]:
from elasticsearch import helpers


def _compound_actions(records, index_name="compounds"):
    """Yield one bulk 'index' action per record, using the compound NAME as document id."""
    for record in records:
        yield {"_index": index_name, "_id": record["NAME"], "_source": record}


# The records come from `df_dict` (built with df.to_dict("records")); the name
# `record_list` used previously was never defined and raised NameError.
# Documents are sent in batches through the bulk helper instead of one HTTP
# request per document. Failures are printed and ingestion continues, matching
# the earlier print-and-carry-on behaviour.
for ok, result in helpers.streaming_bulk(
    es,
    _compound_actions(df_dict),
    raise_on_error=False,
    raise_on_exception=False,
):
    if not ok:
        print(result)


# Using the elasticsearch index for an example compound spermidine

In [None]:
# Compound to look up, and its embedding from the same model used for indexing.
input_keyword = 'spermidine'
vector_of_input_keyword = model.encode(input_keyword)

# kNN query against the NAME_VECTOR field: return the 4 nearest neighbours,
# considering up to 10000 candidates during the search.
query = dict(
    field="NAME_VECTOR",
    query_vector=vector_of_input_keyword,
    k=4,
    num_candidates=10000,
)

res = es.knn_search(
    index='compounds',
    knn=query,
    source=['COMPOUND_ID', 'NAME', 'TYPE'],
)
# The matches live under hits -> hits in the response.
res["hits"]["hits"]

### Now we can run this on all covid compounds in the dataset

In [None]:
# Load the original, cleaned metabolomics data and index it by sample id.
metabolomics_data_original = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Processing/Processing_Cleaned/cleaned_metabolomics_data_covid.csv')
metabolomics_data_original.set_index('sample_id', inplace=True)

# Drop the last 7 columns and normalise the remaining column names
# (trim whitespace, lower-case).
metabolomics_data = metabolomics_data_original.iloc[:, :-7]
metabolomics_data.columns = metabolomics_data.columns.map(lambda name: name.strip().lower())

# One row per compound name, in column order.
column_names = list(metabolomics_data.columns)
covid_compounds = pd.DataFrame({'Compound Name': column_names})

### function that performs the mapping of input covid compounds into their ChEBI IDs - additional parameters are added for LLM v2: KEGG compound is preferred and scores must be above 0.75 for matches

In [None]:
# A hit must score strictly above this value to be accepted (added after
# threshold analysis to avoid poor matches).
MIN_MATCH_SCORE = 0.75

# Column order of the results table; also used when there are no matches so
# that downstream cells still find the columns they expect.
RESULT_COLUMNS = [
    'Input Compound Name',
    'Matched Compound Name',
    'Matched COMPOUND_ID',
    'Source',
]


def _select_best_hit(hits, min_score=MIN_MATCH_SCORE):
    """Return the best hit scoring above ``min_score``, or None if there is none.

    The highest ``_score`` wins. On an exact score tie a hit whose SOURCE is
    'KEGG COMPOUND' beats one that is not (important for llm v2); if still tied,
    the hit that appears first in ``hits`` is kept.
    """
    candidates = [hit for hit in hits if hit['_score'] > min_score]
    if not candidates:
        return None
    # max() returns the first maximal element, which preserves "earliest wins".
    return max(
        candidates,
        key=lambda hit: (
            hit['_score'],
            hit['_source'].get('SOURCE') == 'KEGG COMPOUND',
        ),
    )


# output list to store the results
output_records = []

# looping through each compound in the dataframe
for compound_name in covid_compounds['Compound Name']:
    # encoding the compound name to a vector
    vector_of_compound_name = model.encode(compound_name)

    # knn query against the NAME_VECTOR field: top 5 matches, with a maximum
    # of 10000 candidates checked each time
    query = {
        "field": "NAME_VECTOR",
        "query_vector": vector_of_compound_name,
        "k": 5,
        "num_candidates": 10000,
    }

    res = es.knn_search(index='compounds', knn=query, _source=['COMPOUND_ID', 'NAME', 'TYPE', 'SOURCE'])

    best_hit = _select_best_hit(res["hits"]["hits"])
    if best_hit is None:
        # no hit cleared the score threshold; this compound gets no output row
        continue

    matched = best_hit['_source']
    output_records.append({
        'Input Compound Name': compound_name,
        'Matched Compound Name': matched['NAME'],
        'Matched COMPOUND_ID': matched['COMPOUND_ID'],
        'Source': matched.get('SOURCE'),
    })

# create a dataframe from the output records and save it (the llm v2 file)
output_df = pd.DataFrame(output_records, columns=RESULT_COLUMNS)
output_df.to_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_B/0.75_KEGG_Elastic_automated.csv', index=False)



In [None]:
# Work on a copy of the results so output_df stays untouched.
scores = output_df.copy()

# Coerce the matched IDs to integers; anything non-numeric becomes 0.
scores['Matched COMPOUND_ID'] = (
    pd.to_numeric(scores['Matched COMPOUND_ID'], errors='coerce')
    .fillna(0)
    .astype(int)
)

### subsetting the results to IDs that are in the manual dataframe - have IDs

In [None]:
# Columns of the manual-annotation sheet that are not needed here.
columns_to_drop = ['Automated_Match', 'Automated_ChEBI', 'Confusion_Matrix', 'Unnamed: 4', 'Manual_Match', 'Input Compound Name']

# Load the manual annotations, drop the unused columns and expose the manual
# ChEBI ID under the shorter name 'ChEBI'.
correct_id = (
    pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Manual_Annotation/manual_annotations_raw_final_2.csv', index_col=0)
    .drop(columns=columns_to_drop)
    .rename(columns={'Manual_ChEBI': 'ChEBI'})
)

# Missing manual IDs become 0, then the column is cast to int.
correct_id['ChEBI'] = correct_id['ChEBI'].fillna(0).astype(int)

correct_id

In [None]:
# Give both frames the same join column name, 'Query'.
scores = scores.rename(columns={'Input Compound Name': 'Query'})
correct_id = correct_id.rename(columns={'Compound Name': 'Query'})

# Normalise the join key on both sides: trim whitespace, then lower-case.
scores['Query'] = scores['Query'].str.strip().str.lower()
correct_id['Query'] = correct_id['Query'].str.strip().str.lower()

# Left merge keeps every automated result; the manual ID is then exposed as
# 'Correct COMPOUND_ID'.
merged_df = pd.merge(scores, correct_id, on='Query', how='left').rename(
    columns={'ChEBI': 'Correct COMPOUND_ID'}
)

merged_df

In [None]:
# Make sure both ID columns are numeric (non-numeric values become NaN).
id_columns = ['Matched COMPOUND_ID', 'Correct COMPOUND_ID']
merged_df[id_columns] = merged_df[id_columns].apply(pd.to_numeric, errors='coerce')

# Split into compounds with a manually identified ChEBI ID (df) and compounds
# whose manual ID is 0, i.e. none was found (df_2).
# NOTE(review): rows whose 'Correct COMPOUND_ID' is NaN satisfy `!= 0`
# and therefore land in `df`, not `df_2` - confirm this is intended.
df = merged_df.loc[merged_df['Correct COMPOUND_ID'].ne(0)]
df_2 = merged_df.loc[merged_df['Correct COMPOUND_ID'].eq(0)]


In [None]:
# Save the subset with a non-zero 'Correct COMPOUND_ID' as the llm v2 results.
# The dataframe index is written too, because index=False is not passed.
df.to_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_B/llm_subset_v2.csv')
