In [1]:
import json
import os
from datasets import Dataset, load_from_disk
import pandas as pd
from itertools import count
from tqdm.auto import tqdm
from copy import deepcopy

from transformers import AutoTokenizer

import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


## Settings

In [6]:
# set your tokenizer: either local path of from hub
tokenizer_path = '/scratch/gpfs/JHA/mb5157/tokenizers/biomedbert_fast_tokenizer'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# set model_name to the one used to discover head entities
# it is used only for forming str paths
model_name = 'qwen3-32b'

# set your output file path
output_path_prefix = f'/scratch/gpfs/JHA/mb5157/large_data/diabetes_2025_6years/dataset/for_training_tmp'

# set MIN_SCORE_THRESHOLD (alpha hyperparameter): injections with cosine similarity 
# less than MIN_SCORE_THRESHOLD are discarded
MIN_SCORE_THRESHOLD = 0.55

# set the path to your input dataset with all necessary columns
injections_path = f'/scratch/gpfs/JHA/mb5157/large_data/diabetes_2025_6years/dataset/for_training_tmp/train_{model_name}_heads_all_result'

In [3]:
# if there are any relations that you need to exclude from the seed KG, put them here.

undesired_relations = [
    'acted_on_by_process',
    'active_ingredient_of',
    'associated_procedure_of',
    'basis_of_strength_substance_of',
    'component_of',
    'consider_from',
    'direct_device_of',
    'direct_substance_of',
    'has_associated_finding',
    'has_finding_context',
    'has_interpretation',
    'has_laterality',
    'has_realization',
    'has_scale_type',
    'has_specimen',
    'has_subject_relationship_context',
    'has_temporal_context',
    'inverse_was_a',
    'mapped_from',
    'mapped_to',
    'moved_to',
    'negatively_regulated_by',
    'positively_regulated_by',
    'possibly_replaces',
    'precise_active_ingredient_of',
    'realization_of',
    'regulated_by',
    'replaced_by',
    'replaces',
    'was_a',
    'has_intent',
    'referred_to_by',
    'refers_to',
    'characterizes',
    'substance_used_by',
    'specimen_source_topography_of',
    'specimen_substance_of',
    'has_active_ingredient',
    'has_property',
]

In [4]:
pd.set_option('display.max_colwidth', None)
tqdm.pandas()

# Contextual based injections

In [14]:
dataset = load_from_disk(injections_path)
dataset

Dataset({
    features: ['input_ids', 'leaf_node_ids', 'leaf_relationships', 'attention_mask', 'input_nodes', 'start_indices', 'special_tokens_mask', 'head_positions', 'id', 'top_k_relations_with_scores'],
    num_rows: 989666
})

In [15]:
def add_id(dset):
    return dset.map(lambda _, idx: {"id": idx}, with_indices=True)

if 'id' not in dataset.column_names:
    dataset = add_id(dataset)
    injections_path_new = injections_path + '_indexed'
    dataset.save_to_disk(injections_path_new)

In [16]:
def convert_to_pandas(dataset):
    required = ["id", "input_ids","top_k_relations_with_scores"]
    for r in required:
        assert r in dataset.column_names, f'{r} is not in column_names'
        
    ds = dataset.remove_columns([c for c in dataset.column_names 
                                 if c not in required])
    df = ds.to_pandas()
    print("decoding sequences")
    df["sequence"] = df["input_ids"].progress_apply(lambda ids: tokenizer.decode(ids, skip_special_tokens=True))
    df = df.drop(columns='input_ids')
    print("loading scores from json")
    df["relations_dict"] = df["top_k_relations_with_scores"].progress_apply(json.loads)
    df = df.drop(columns='top_k_relations_with_scores')
    return df
    

def make_pairs(d):
    out = []
    global global_head_id
    for head, matches in d.items():
        matched_head_id = next(global_head_id)
        if matches:
            for m in matches:
                out.append((matched_head_id, head, m))
        else:
            # preserve the head with a None match
            out.append((matched_head_id, head, None))
    return out


def explode_pairs(df):
    df = df.explode('pairs').reset_index(drop=True)
    df["pairs"] = df["pairs"].apply(lambda x: x if isinstance(x, tuple) else (None, None, None))
    
    # Now every row is a 3-tuple
    df[["matched_head_id", "head", "match_dict"]] = pd.DataFrame(
        df["pairs"].tolist(), index=df.index,
        columns=["matched_head_id","head","match_dict"]
    )
    df['matched_head_id'] = df['matched_head_id'].astype("Int64")
    df = df.drop(columns='pairs')
    return df


def unpack_match(d):
    if not isinstance(d, dict):
        return pd.Series({
            'relation_type': None,
            'tail':          None,
            'score':         None,
        })
    parts = d['relation'].split(' | ')
    unpacked = pd.Series({
        'relation_type': parts[1]  if len(parts) > 1 else None,
        'tail':          parts[2]  if len(parts) > 2 else None,
        'score':         d.get('score'),
    })
    return unpacked


def parse_triples(df):
    print("Parsing triples")
    df[['relation_type','tail','score']] = df['match_dict'].progress_apply(unpack_match)
    df = df.drop(columns='match_dict')
    return df


def exclude_relations_with_few_injections(df, thr=100):
    df_top_relations = get_realtion_df(df)
    relations_few_injections = df_top_relations[df_top_relations['total'] < thr].relation_type.to_list()
    
    df = df[~df['relation_type'].isin(relations_few_injections)]
    print('num relations excluded:', len(relations_few_injections))
    print('relations excluded:')
    print(relations_few_injections)
    return df

In [17]:
def get_realtion_df(df):
    df_relations = df[['relation_type', 'tail']].groupby(by='relation_type').agg(total=('tail','count')).reset_index()
    df_relations = df_relations.sort_values(by='total', ascending=False).reset_index(drop=True)
    return df_relations
def threshold_cut_off(df, thr):
    # exclude low score relations
    df = df[df['score'] > thr]
    return df

## Preprocessing

In [18]:
df = convert_to_pandas(dataset)

decoding sequences


100%|██████████| 989666/989666 [02:34<00:00, 6388.47it/s]


loading scores from json


100%|██████████| 989666/989666 [00:15<00:00, 62680.24it/s]


In [19]:
# don't parallelize it unless making globabl_head_id available to each process
global_head_id = count(0) # should start over with each cell execution
df["pairs"] = df["relations_dict"].apply(make_pairs)
df = df.drop(columns='relations_dict')

In [20]:
df_interim = explode_pairs(df)

In [21]:
# this cell runs really long, sometimes half an hour on large datasets. It also waits
# for a while after the progress bar reaches 100%.
# the code could be rewritten more efficiently

df = parse_triples(df_interim)
print('sorting...')
df = df.sort_values(by='score', ascending=False).reset_index()
print('sorted')

Parsing triples


100%|██████████| 11818434/11818434 [25:47<00:00, 7637.63it/s] 


sorting...
sorted


#### some stats

In [22]:
# heads matched with UMLS out of all
pd.unique(df[df['tail'].notnull()].matched_head_id).shape

(558570,)

In [23]:
df = df[df['tail'].notnull()]

RANDOM_SEED_KG_DROP = None
# set not None only for ablations
# set which 0 < ratio < 1 you want to drop, e.g. 0.75
# RANDOM_SEED_KG_DROP = 0.75

rng = np.random.default_rng(42)
if RANDOM_SEED_KG_DROP is not None:
    mask = rng.random(len(df)) >= RANDOM_SEED_KG_DROP
    df_partial = df[mask]
else:
    df_partial = df

In [24]:
df_before_cutoff = deepcopy(df_partial)

## Set threshold

In [25]:
df_cutted = threshold_cut_off(df_before_cutoff, thr=MIN_SCORE_THRESHOLD)
df_cutted.score.describe()

count    1.125777e+06
mean     5.840059e-01
std      3.029101e-02
min      5.500000e-01
25%      5.610223e-01
50%      5.756189e-01
75%      5.985042e-01
max      8.483794e-01
Name: score, dtype: float64

#### exclude relations:

In [26]:
df_cutted['relation_type'] = df_cutted['relation_type'].map(lambda s: '_'.join(s.strip().split(' ')))

# exclude undesired_relations relations
df_cutted = df_cutted[~df_cutted['relation_type'].isin(undesired_relations)]
df_cutted.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cutted['relation_type'] = df_cutted['relation_type'].map(lambda s: '_'.join(s.strip().split(' ')))


(1121757, 8)

# Selection algorithm

### leave only max_num_triples triples with the highest score

In [27]:
def select_triples_with_top_score(df, max_num_triples=1):
    """
    For each group of identical (head, relation_type, tail), keep the top
    `max_num_triples` rows by 'score'. Then returns all of them sorted
    descending by score.
    """
    df_sorted = df.sort_values("score", ascending=False)
    df_top_triples = (
        df_sorted.groupby(by=["head", "relation_type", "tail"],
            group_keys=False,
            sort=False,                   
        )
        .head(max_num_triples)
        .reset_index(drop=True)
    )
    return df_top_triples.sort_values("score", ascending=False).reset_index(drop=True)

In [28]:
df_top_triples = select_triples_with_top_score(df_cutted, 1)
df_top_triples = exclude_relations_with_few_injections(df_top_triples, 100)
df_top_triples.score.describe()

num relations excluded: 90
relations excluded:
['procedure_site_of', 'occurs_after', 'indirect_morphology_of', 'consider', 'device_used_by', 'process_acts_on', 'inheres_in', 'has_indirect_procedure_site', 'procedure_morphology_of', 'has_procedure_site', 'has_inherent_attribute', 'route_of_administration_of', 'subject_relationship_context_of', 'specimen_procedure_of', 'has_device_intended_site', 'temporally_related_to', 'device_intended_site_of', 'has_direct_morphology', 'after', 'uses_substance', 'has_part_anatomy_structure', 'has_direct_device', 'has_time_aspect', 'has_precondition', 'entire_anatomy_structure_of', 'energy_used_by', 'has_technique', 'role_played_by', 'characterized_by', 'procedure_context_of', 'uses_energy', 'uses_access_device', 'relative_to_part_of', 'inverse_during', 'compositional_material_of', 'regulates', 'has_route_of_administration', 'has_access', 'dose_form_intended_site_of', 'specimen_source_morphology_of', 'technique_of', 'property_of', 'direct_site_of', 'pr

count    112532.000000
mean          0.599352
std           0.040043
min           0.550000
25%           0.567677
50%           0.589680
75%           0.621437
max           0.848379
Name: score, dtype: float64

In [29]:
def maximize_score_then_diversity(df: pd.DataFrame, score_bucket_size=0.01, relation_bucket_size: int = 100) -> pd.DataFrame:
    """
    First, maximize score, then, within it, diversity.
    Output: df with a single match per head.
    """
    df = df.copy()
    max_s = df['score'].max()
    df["score_bucket_id"] = ((max_s - df["score"]) / score_bucket_size).astype(int)
    
    rel_counts = df["relation_type"].value_counts()
    df = df.assign(
        rel_count = df["relation_type"].map(rel_counts),
        rel_bucket= lambda d: d["rel_count"] // relation_bucket_size,
    )
    df_sorted = df.sort_values(["score_bucket_id", "rel_bucket", "score"], ascending=[True, True, False])
    result = df_sorted.drop_duplicates(subset="matched_head_id", keep="first").reset_index(drop=True)
    result = result.drop(columns=["rel_count", "rel_bucket", "score_bucket_id"])
    result = result.sort_values("score", ascending=False).reset_index(drop=True)
    return result

In [30]:
df_result = maximize_score_then_diversity(df_top_triples, relation_bucket_size=100)
df_result = exclude_relations_with_few_injections(df_result, 100)
df_result

num relations excluded: 14
relations excluded:
['method_of', 'has_direct_procedure_site', 'is_modification_of', 'direct_procedure_site_of', 'has_direct_substance', 'has_entire_anatomy_structure', 'has_pathological_process', 'has_occurrence', 'uses_device', 'direct_morphology_of', 'disposition_of', 'indirect_procedure_site_of', 'specimen_of', 'pathological_process_of']


Unnamed: 0,index,id,sequence,matched_head_id,head,relation_type,tail,score
0,9233004,777110,". diabetes mellitus ( dm ), a chronic metabolic disease characterised by elevated levels of blood glucose, is among the most common chronic diseases. the incidence and prevalence of dm have been increasing over the years. the complications of dm represent a serious health problem. the long ‐ term complications include macroangiopathy, microangiopathy and neuropathy as well as sexual dysfunction ( sd ) in both men and women. erectile dysfunction ( ed ) has been considered the most important sd in men with dm. the prevalence of ed is approximately 3. 5 ‐ fold higher in men with dm than in those without dm. common risk factors for the development of dm",874611,diabetes mellitus,cause_of,erectile dysfunction due to diabetes mellitus,0.848379
1,9233044,777110,". diabetes mellitus ( dm ), a chronic metabolic disease characterised by elevated levels of blood glucose, is among the most common chronic diseases. the incidence and prevalence of dm have been increasing over the years. the complications of dm represent a serious health problem. the long ‐ term complications include macroangiopathy, microangiopathy and neuropathy as well as sexual dysfunction ( sd ) in both men and women. erectile dysfunction ( ed ) has been considered the most important sd in men with dm. the prevalence of ed is approximately 3. 5 ‐ fold higher in men with dm than in those without dm. common risk factors for the development of dm",874612,dm,cause_of,erectile dysfunction due to diabetes mellitus,0.848379
2,11186983,938275,"it is caused by sporothrix, a fungus that usually results in zoonotic fungal diseases and sapronosis that are endemic in temperate regions. sporotrichosis is clinically presented in two main forms, which are lymphocutanous sporotrichosis ( lc ) and fixed sporotrichosis ( f ), especially in immunocompetent patients. however, severe clinical versions, such as cutaneous disseminated, mucosal, and extracutaneous forms may present in immunocompromised hosts. this study presents a case of cutaneous disseminated sporotrichosis in a patient with diabetes mellitus and information based on the review of relevant literature. clinicians should be on the",1060463,cutaneous disseminated sporotrichosis,has_causative_agent,sporothrix schenkii,0.826598
3,8030593,675604,"10. 1186 / s12974 - 022 - 02508 - 9. interleukin - 18 ( il - 18 ) is a classical member of the il - 1 superfamily of cytokines. as il - 1β, il - 18 precursor is processed by inflammasome / caspase - 1 into a mature and biologically active form. il - 18 binds to its specific receptor composed of two chains ( il - 18rα and il - 18rβ ) to trigger a similar intracellular signaling pathway as il - 1, ultimately leading to activation of nf - κb and inflammatory processes. independently of this il - 1 - like signaling, il - 18 also specifically induces ifn",759583,il - 18rα,part_of,interleukin - 18 - mediated signaling pathway,0.822278
4,8030596,675604,"10. 1186 / s12974 - 022 - 02508 - 9. interleukin - 18 ( il - 18 ) is a classical member of the il - 1 superfamily of cytokines. as il - 1β, il - 18 precursor is processed by inflammasome / caspase - 1 into a mature and biologically active form. il - 18 binds to its specific receptor composed of two chains ( il - 18rα and il - 18rβ ) to trigger a similar intracellular signaling pathway as il - 1, ultimately leading to activation of nf - κb and inflammatory processes. independently of this il - 1 - like signaling, il - 18 also specifically induces ifn",759584,il - 18rβ,part_of,interleukin - 18 - mediated signaling pathway,0.822278
...,...,...,...,...,...,...,...,...
29334,6389541,537301,"summary statistical data were obtained from genome - wide association study ( gwas ) to investigate the causal relationship between white cell ( wbc ) count, 5 specific wbc count, and lymphocyte subtypes cell count and t1d risk. after false discovery rate ( fdr ) correction, the results indicated that lower lymphocyte cell count ( odds ratio [ or ] per 1 standard deviation [ sd ] decrease = 0. 746, 95 % confidence interval ( ci ) : 0. 673 – 0. 828, p fdr = 0. 036 ), and basophil cell count ( or per 1 sd decrease = 0. 808, 95 % ci : 0.",604501,lymphocyte cell count,has_component,lymphocyte,0.550008
29335,10979284,922144,"results there were changes in lipid ratios representing desaturase enzyme activity in preterm infants in the first weeks of life with higher delta 6 desaturases ( d6d ) triglyceride ( tg ) indices but significantly lower delta 9 desaturase ( d9d ) and d6d ( pc ) indices. in comparison to term infants, preterm have lower delta 5 desaturase ( d5d ) but higher d6d indices at all postnatal ages. although point levels of desaturase indices were different, trajectories of changes in these indices over time were similar in preterm and term infants. conclusions this study findings suggest the patterns of desaturase indices in preterm infants differ from that",1040914,delta 6 desaturases ( d6d ),has_disposition,oxidoreductase,0.550006
29336,4018639,339038,"38 were significantly increased, those of the anti - inflammatory cytokine, il - 35, have been reduced in all cvd groups compared with healthy controls. regarding hematological indices, hematocrit, red blood cell distribution width, mean platelet ( plt ) volume, plateletcrit, plt distribution width, leukocyte count, and plt - to - lymphocyte and neutrophil - to - lymphocyte ratios were markedly increased in the diabetic and nondiabetic cvd groups compared with those of the healthy controls. oxidative stress and cytokine biomarkers may play a significant role in the complications of diabetic cardiomyopathy. moreover, hematological indices are particularly sensitive to systemic inflammatory changes and are novel markers for the early detection",380925,leukocyte count,is_interpreted_by,pancytopenia caused by antidiabetic drug,0.550005
29337,7424478,625017,"cimt. regarding endothelial dysfunction, blood glucose and left lequesne index were the independents predictive factors of decreased fmd. conclusion our study supported the accelerated subclinical atherosclerosis in patients with spa. this subclinical atherosclerosis was mainly mediated by traditional cv risk factors. nutritional therapy has been conventionally recommended for people with prediabetes as a method to delay or halt progression to type 2 diabetes. however, recommended nutritional strategies evolve over time. hence, we performed a scoping review on recently reported nutritional interventions for individuals with prediabetes. ovid medline, pubmed, embase, scopus, cinahl and psycinfo databases were searched to identify",701816,nutritional therapy,inverse_isa,nutrition therapy for congestive heart failure,0.550005


In [31]:
df_result.relation_type.value_counts()

relation_type
isa                          8651
inverse_isa                  5514
cause_of                     1440
interprets                   1268
associated_finding_of        1145
has_disposition              1084
focus_of                     1038
is_interpreted_by             963
has_associated_morphology     863
causative_agent_of            809
finding_site_of               741
associated_morphology_of      598
has_method                    529
has_finding_site              477
possibly_equivalent_to        447
has_component                 434
due_to                        365
has_part                      350
has_modification              310
associated_with               254
part_of                       211
plays_role                    194
occurs_before                 187
has_clinical_course           144
occurs_in                     138
same_as                       134
has_causative_agent           127
has_focus                     118
Name: count, dtype: int64

In [32]:
df_result.score.describe()

count    28533.000000
mean         0.613306
std          0.046269
min          0.550001
25%          0.576145
50%          0.604677
75%          0.641571
max          0.848379
Name: score, dtype: float64

### Save outputs to csv

In [33]:
def get_seed_kg(df_result):
    """leave unque triples only"""
    seed_kg = (
        df_result[['head', 'relation_type', 'tail']].groupby(by=["head", "relation_type", "tail"],
                group_keys=False,
                sort=False,                   
            )
            .head(1)
            .reset_index(drop=True)
        )
    return seed_kg

In [34]:
output_path = f"{output_path_prefix}/train_score_{MIN_SCORE_THRESHOLD}.csv"
df_result = df_result.drop(columns='index')
df_result.drop(columns=['matched_head_id', 'score']).to_csv(output_path, index=False)
print(f'saved to {output_path}')

saved to /scratch/gpfs/JHA/mb5157/large_data/diabetes_2025_6years/dataset/for_training_tmp//train_score_0.55.csv


In [35]:
# same for train and eval

unique_relations = sorted(df_result['relation_type'].unique())
relation_id_map = {relation: idx + 1 for idx, relation in enumerate(unique_relations)}
relation_id_map

{'associated_finding_of': 1,
 'associated_morphology_of': 2,
 'associated_with': 3,
 'causative_agent_of': 4,
 'cause_of': 5,
 'due_to': 6,
 'finding_site_of': 7,
 'focus_of': 8,
 'has_associated_morphology': 9,
 'has_causative_agent': 10,
 'has_clinical_course': 11,
 'has_component': 12,
 'has_disposition': 13,
 'has_finding_site': 14,
 'has_focus': 15,
 'has_method': 16,
 'has_modification': 17,
 'has_part': 18,
 'interprets': 19,
 'inverse_isa': 20,
 'is_interpreted_by': 21,
 'isa': 22,
 'occurs_before': 23,
 'occurs_in': 24,
 'part_of': 25,
 'plays_role': 26,
 'possibly_equivalent_to': 27,
 'same_as': 28}

In [36]:
rel_output_path = f"{output_path_prefix}/relation_map_score_{MIN_SCORE_THRESHOLD}.json"
with open(rel_output_path, "w") as f:
    json.dump(relation_id_map, f, indent=2)

print(f"Saved {len(relation_id_map)} relation types to {rel_output_path}")

Saved 28 relation types to /scratch/gpfs/JHA/mb5157/large_data/diabetes_2025_6years/dataset/for_training_tmp//relation_map_score_0.55.json


In [37]:
seed_kg_path = f'{output_path_prefix}/seed_kg_{MIN_SCORE_THRESHOLD}.csv'

seed_kg = get_seed_kg(df_result)
seed_kg.to_csv(seed_kg_path)
seed_kg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28533 entries, 0 to 28532
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   head           28533 non-null  object
 1   relation_type  28533 non-null  object
 2   tail           28533 non-null  object
dtypes: object(3)
memory usage: 668.9+ KB


# Prepare eval dataset (need to run this part only once)
We should exclude the same set of relations

In [39]:
injections_path = f'{output_path_prefix}/eval_{model_name}_heads_all_result'

dataset_eval = load_from_disk(injections_path)
print(f'loaded from {injections_path}')
dataset_eval

loaded from /scratch/gpfs/JHA/mb5157/large_data/diabetes_2025_6years/dataset/for_training_tmp//eval_qwen3-32b_heads_all_result


Dataset({
    features: ['input_ids', 'leaf_node_ids', 'leaf_relationships', 'attention_mask', 'input_nodes', 'start_indices', 'special_tokens_mask', 'head_positions', 'id', 'top_k_relations_with_scores_cui'],
    num_rows: 110297
})

In [42]:
df_eval = convert_to_pandas(dataset_eval)

decoding sequences


100%|██████████| 110297/110297 [00:17<00:00, 6141.90it/s]


loading scores from json


100%|██████████| 110297/110297 [00:01<00:00, 63216.59it/s]


In [43]:
# don't parallelize it unless making globabl_head_id available to each process
global_head_id = count(0) # should start over with each cell execution
df_eval["pairs"] = df_eval["relations_dict"].apply(make_pairs)

df_eval = df_eval.drop(columns='relations_dict')
df_eval = explode_pairs(df_eval)
df_eval = parse_triples(df_eval)
df_eval = df_eval.sort_values(by='score', ascending=False).reset_index()

Parsing triples


100%|██████████| 1323069/1323069 [02:30<00:00, 8769.82it/s] 


In [44]:
df_eval = df_eval[df_eval['tail'].notnull()]

rng = np.random.default_rng(42)
if RANDOM_SEED_KG_DROP is not None:
    mask = rng.random(len(df_eval)) >= RANDOM_SEED_KG_DROP
    df_eval_partial = df_eval[mask]
else:
    df_eval_partial = df_eval

df_eval_before_thr = deepcopy(df_eval_partial)

## Setting threshold

In [45]:
df_eval_cutted = threshold_cut_off(df_eval_before_thr, MIN_SCORE_THRESHOLD)

In [46]:
df_eval_cutted['relation_type'] = df_eval_cutted['relation_type'].map(lambda s: '_'.join(s.split(' ')))
df_eval_cutted = df_eval_cutted[df_eval_cutted['relation_type'].isin(unique_relations)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eval_cutted['relation_type'] = df_eval_cutted['relation_type'].map(lambda s: '_'.join(s.split(' ')))


In [60]:
df_eval_top_triples = select_triples_with_top_score(df_eval_cutted, 1)
df_eval_result = maximize_score_then_diversity(df_eval_top_triples)
df_eval_result.count()

index              7138
id                 7138
sequence           7138
matched_head_id    7138
head               7138
relation_type      7138
tail               7138
score              7138
dtype: int64

In [62]:
eval_result_path = f"{output_path_prefix}/eval_score_{MIN_SCORE_THRESHOLD}.csv"
df_eval_result.drop(columns='index').drop(columns=['matched_head_id', 'score']).to_csv(eval_result_path, index=False)
print(f"saved to {eval_result_path}")

saved to /scratch/gpfs/JHA/mb5157/large_data/diabetes_2025_6years/dataset/for_training_tmp//eval_score_0.55.csv
