## A Large Language Model-based tool to facilitate data harmonization: Random Forest model used to align variables across cohort studies

In [None]:
#****************************************
# MIT License
# Copyright (c) 2025 Zexu Li, Jinying Chen
#  
# author(s): Zexu Li, Jinying Chen, Boston University Chobanian & Avedisian School of Medicine
# date: 2025-7-7
# ver: 1.0
# 
# This code was written to support data analysis for the Data Harmonization Using Natural Language 
# Processing (NLP harmonization) project and the 2025 paper published in PLOS One.
# The code is for research use only, and is provided as is.
# 

In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load input data for ML model
# Directory that holds the input CSV; replace the placeholder with a real path.
datadir = "[path to input data]"
# os.path.join supplies the path separator when datadir has no trailing one;
# plain string concatenation would silently build a wrong file name.
Final_df_ML = pd.read_csv(os.path.join(datadir, 'ML_dataset_021825_v3.csv'))

In [None]:
# List the column labels of the loaded dataset (shown as the cell output).
Final_df_ML.columns.to_list()

In [None]:
# Display the loaded dataset as the cell output.
Final_df_ML

# Example code for running a single trial in the ML experiments
(The grid search over 50 trials is presented in Grid_search_and_evaluation_50_trials.py)

## Split the final dataset into test/train/validation sets based on the source variable

In [None]:
# Columns handed to the model: 21 feature columns followed by the target
# column 'Mapping_result'.
model_columns = [
    'miniLM_on_label', 'e5_on_label', 'mpnet_on_label', 'fuzzy_on_label', 'biolord_on_label',
    'miniLM_on_label_key', 'e5_on_label_key', 'mpnet_on_label_key', 'fuzzy_on_label_key', 'biolord_on_label_key',
    'miniLM_on_sheet', 'e5_on_sheet', 'mpnet_on_sheet', 'fuzzy_on_sheet', 'biolord_on_sheet',
    'deriv_info_null_EU', 'deriv_info_len_EU', 'Label_len_EU',
    'deriv_info_null_JP', 'deriv_info_len_JP', 'Label_len_JP',
    'Mapping_result',
]

# Split on the unique values of 'Source' rather than on rows, so that all rows
# sharing a source end up in exactly one partition.
unique_sources = Final_df_ML['Source'].unique()

# 80/20 split of the sources into train/test, then 80/20 of the training
# sources into train/validation (fixed seed for reproducibility).
train_sources, test_sources = train_test_split(unique_sources, test_size=0.2, random_state=42)
train_sources_small, validation_sources = train_test_split(train_sources, test_size=0.2, random_state=42)

# Full-column frames for each partition (each filter is evaluated once).
Ori_train = Final_df_ML[Final_df_ML['Source'].isin(train_sources_small)]
Ori_validation = Final_df_ML[Final_df_ML['Source'].isin(validation_sources)]
Ori_test = Final_df_ML[Final_df_ML['Source'].isin(test_sources)]

# Module-level aliases of the full-column frames. These are ordinary
# assignments: a `global` statement has no effect at module scope.
global_validation = Ori_validation
global_test = Ori_test
global_train = Ori_train

# Model-ready frames restricted to the feature columns and the target.
train = Ori_train[model_columns]
validation = Ori_validation[model_columns]
test = Ori_test[model_columns]

# Validation features and target (the target is kept as a one-column DataFrame).
X_validation = validation.drop('Mapping_result', axis=1)
y_validation = validation[['Mapping_result']]

## Downsampling on the negative pairs (1:200 ratio)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split, GridSearchCV

#200 ratio setup
def select_true_and_false(df_source, ratio=200, random_state=42):
    """Downsample the negative rows of a frame to a fixed negative:positive ratio.

    Every row whose 'Mapping_result' equals 1 is kept. From the rows whose
    'Mapping_result' equals 0, up to ``ratio`` rows per positive row are drawn
    without replacement.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Rows to downsample; must contain a 'Mapping_result' column.
    ratio : int, default 200
        Number of negative rows requested for each positive row.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        The positive rows followed by the sampled negative rows, with the
        original index labels.
    """
    true_values = df_source[df_source['Mapping_result'] == 1]
    false_values = df_source[df_source['Mapping_result'] == 0]
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling is without replacement), so cap the request at the number of
    # available negatives; in that case all negatives are kept.
    n_false = min(ratio * len(true_values), len(false_values))
    sampled_false = false_values.sample(n=n_false, random_state=random_state)
    return pd.concat([true_values, sampled_false])

#testing RF model with default parameters
#repeating model training/testing for 10 times
'''
for i in range(10):
    print(i)

    final_train_set = Ori_train.groupby('Source').apply(select_true_and_false).reset_index(drop=True)
    final_train_set = final_train_set[['miniLM_on_label', 'e5_on_label', 'mpnet_on_label', 'fuzzy_on_label','biolord_on_label',
                        'miniLM_on_label_key', 'e5_on_label_key', 'mpnet_on_label_key', 'fuzzy_on_label_key','biolord_on_label_key',
                        'miniLM_on_sheet', 'e5_on_sheet', 'mpnet_on_sheet', 'fuzzy_on_sheet','biolord_on_sheet',
                       'deriv_info_null_EU','deriv_info_len_EU','Label_len_EU',
                       'deriv_info_null_JP','deriv_info_len_JP','Label_len_JP','Mapping_result']]

    
    X_train = final_train_set.drop('Mapping_result',axis = 1)
    y_train = final_train_set[['Mapping_result']].values.ravel()


    RF_model =  RandomForestClassifier()

    RF_model.fit(X_train,y_train)
    y_pred = RF_model.predict(X_validation)
    accuracy = accuracy_score(y_validation, y_pred)

    report = classification_report(y_validation, y_pred)
    #print(f'{n} random sample are selected')
    print(report)
    probability_estimates = RF_model.predict_proba(X_validation)
    
    temp_df = Ori_validation.copy()
    temp_df['probability'] = RF_model.predict_proba(X_validation)[:,1]
    temp_df['rank'] = temp_df.groupby('Source')['probability'].rank(ascending=False)
    positive = temp_df[temp_df['Mapping_result'] == 1]
    top5 = len(positive[positive['rank'] <=5])/ len(positive)
    top10 = len(positive[positive['rank'] <=10])/ len(positive)
    top20 = len(positive[positive['rank'] <=20])/ len(positive)
    top30 = len(positive[positive['rank'] <=30])/ len(positive)
    
    print(f'Hit rate: {top5},{top10},{top20},{top30} ')
    

    # Print the probability estimates for the first few samples
    print("Probability Estimates:")
    print(probability_estimates[:10])
'''

## Grid search for hyperparameter tuning

In [None]:
# Downsample the negative pairs within each source using select_true_and_false.
final_train_set = Ori_train.groupby('Source').apply(select_true_and_false).reset_index(drop=True)

# Keep the full-column downsampled frame under the module-level alias.
# No `global` statement here: it is a no-op at module scope and, because
# global_train is already assigned earlier in the file, it would be a
# SyntaxError if the cells were run together as one script.
global_train = final_train_set

# Columns handed to the model: 21 feature columns followed by the target.
model_columns = [
    'miniLM_on_label', 'e5_on_label', 'mpnet_on_label', 'fuzzy_on_label', 'biolord_on_label',
    'miniLM_on_label_key', 'e5_on_label_key', 'mpnet_on_label_key', 'fuzzy_on_label_key', 'biolord_on_label_key',
    'miniLM_on_sheet', 'e5_on_sheet', 'mpnet_on_sheet', 'fuzzy_on_sheet', 'biolord_on_sheet',
    'deriv_info_null_EU', 'deriv_info_len_EU', 'Label_len_EU',
    'deriv_info_null_JP', 'deriv_info_len_JP', 'Label_len_JP',
    'Mapping_result',
]
final_train_set = final_train_set[model_columns]

# Feature matrix and 1-D target array for model fitting.
X_train = final_train_set.drop('Mapping_result', axis=1)
y_train = final_train_set[['Mapping_result']].values.ravel()

In [None]:
# Rebuild the full-column train and test frames. `train` is selected with
# train_sources, i.e. it also contains the sources that were split off into
# validation_sources earlier.
source_of_row = Final_df_ML['Source']
train = Final_df_ML[source_of_row.isin(train_sources)]
test = Final_df_ML[source_of_row.isin(test_sources)]

In [None]:
def split_into_sublists(lst, k):
    """
    Partition a sequence into k consecutive, non-overlapping slices.

    The slices are as even as possible: when len(lst) is not a multiple of k,
    the first len(lst) % k slices each receive one extra element.

    Parameters:
    - lst: Sliceable sequence to partition.
    - k: Number of slices to produce.

    Returns:
    A list holding the k slices, in their original order.
    """
    base_size, extra = divmod(len(lst), k)

    chunks = []
    lower = 0
    for position in range(k):
        # The leading `extra` chunks absorb the leftover elements.
        upper = lower + base_size + (1 if position < extra else 0)
        chunks.append(lst[lower:upper])
        lower = upper

    return chunks


# Partition the training sources into 5 groups and print the size of each.
split_train_source = split_into_sublists(train_sources, 5)
#print(split_train_source)
for fold_sources in split_train_source:
    print(len(fold_sources))

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score,f1_score
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
# Hyperparameter grid for the random forest. Expanded with ParameterGrid it
# yields 4 * 4 * 4 * 3 * 2 = 384 combinations.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# split criterion for RandomForestClassifier, so a third of the grid may be
# redundant; 'log_loss' also needs a recent scikit-learn release — confirm.
param_grid = {
        'n_estimators': [50, 200,500,900],
        'max_depth': [2, 10, 15, 20], 
        'min_samples_split': [2, 5, 10, 15],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_features':['sqrt', None]
        
    }


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Instantiate a classifier with default settings; the instance is not stored
# (presumably only to show the estimator in the cell output).
RandomForestClassifier()

In [None]:
# Grid search with cross-validation over the source groups in
# split_train_source. For every hyperparameter combination a forest is trained
# on the sources outside one group (negatives downsampled with
# select_true_and_false) and scored on all pairs of the held-out group, using
# the top-30 hit rate (HR) and the mean reciprocal rank (MRR).

# Feature columns handed to the model; the target is 'Mapping_result'.
feature_columns = [
    'miniLM_on_label', 'e5_on_label', 'mpnet_on_label', 'fuzzy_on_label', 'biolord_on_label',
    'miniLM_on_label_key', 'e5_on_label_key', 'mpnet_on_label_key', 'fuzzy_on_label_key', 'biolord_on_label_key',
    'miniLM_on_sheet', 'e5_on_sheet', 'mpnet_on_sheet', 'fuzzy_on_sheet', 'biolord_on_sheet',
    'deriv_info_null_EU', 'deriv_info_len_EU', 'Label_len_EU',
    'deriv_info_null_JP', 'deriv_info_len_JP', 'Label_len_JP',
]

# The fold data do not depend on the hyperparameters (the downsampling uses a
# fixed seed), so build them once instead of once per grid point.
fold_data = []
for split in split_train_source:
    in_split = train['Source'].isin(split)
    validation_data = train[in_split]
    train_data = train[~in_split]
    final_train_set = train_data.groupby('Source').apply(select_true_and_false).reset_index(drop=True)
    fold_data.append((
        final_train_set[feature_columns],
        final_train_set[['Mapping_result']].values.ravel(),
        validation_data,
    ))

best_HR = -1
best_MRR = -1
n = 0
mean_HR_list = []
mean_MRR_list = []
para_list = []
for g in ParameterGrid(param_grid):
    HR_list = []
    MRR_list = []
    for X_train, y_train, validation_data in fold_data:
        # Fresh, identically seeded model for every fold.
        rf = RandomForestClassifier(random_state=42)
        rf.set_params(**g)
        rf.fit(X_train, y_train)

        # Score a copy so the prepared fold frame stays free of result columns.
        final_validation_set = validation_data.copy()
        X_validation = final_validation_set[feature_columns]
        # Column 1 of predict_proba is the probability of the second class in
        # rf.classes_ (label 1 when the target holds 0/1).
        final_validation_set['probability'] = rf.predict_proba(X_validation)[:, 1]
        # Rank the candidate pairs of each source, highest probability first.
        final_validation_set['rank'] = final_validation_set.groupby('Source')['probability'].rank(ascending=False)
        positive = final_validation_set[final_validation_set['Mapping_result'] == 1]

        # Share of positive pairs ranked within the top 30 of their source.
        # Assumes the fold contains at least one positive pair.
        top30_HR = len(positive[positive['rank'] <= 30]) / len(positive)
        HR_list.append(top30_HR)

        # Reciprocal of the best (smallest) positive rank of every source.
        best_rank_rows = positive.groupby('Source')['rank'].idxmin()
        reciprocal_ranks = 1 / positive.loc[best_rank_rows, 'rank']
        MRR = sum(reciprocal_ranks) / len(reciprocal_ranks)
        MRR_list.append(MRR)

    mean_HR = sum(HR_list) / len(HR_list)
    mean_MRR = sum(MRR_list) / len(MRR_list)
    # Strict comparison: on ties the earliest grid point is kept.
    if mean_HR > best_HR:
        best_HR = mean_HR
        best_grid_HR = g
    if mean_MRR > best_MRR:
        best_MRR = mean_MRR
        best_grid_MRR = g
    n += 1
    mean_HR_list.append(mean_HR)
    mean_MRR_list.append(mean_MRR)
    para_list.append(g)
    print(f'Step {n} complete: model:{rf}, HR: {mean_HR}, MRR: {mean_MRR}')

In [None]:
# Refit on all training sources with the hyperparameters stored in
# best_grid_HR, then report hit rates and MRR on the held-out test sources.

# Columns handed to the model: 21 feature columns followed by the target.
model_columns = [
    'miniLM_on_label', 'e5_on_label', 'mpnet_on_label', 'fuzzy_on_label', 'biolord_on_label',
    'miniLM_on_label_key', 'e5_on_label_key', 'mpnet_on_label_key', 'fuzzy_on_label_key', 'biolord_on_label_key',
    'miniLM_on_sheet', 'e5_on_sheet', 'mpnet_on_sheet', 'fuzzy_on_sheet', 'biolord_on_sheet',
    'deriv_info_null_EU', 'deriv_info_len_EU', 'Label_len_EU',
    'deriv_info_null_JP', 'deriv_info_len_JP', 'Label_len_JP',
    'Mapping_result',
]

train = Final_df_ML[Final_df_ML['Source'].isin(train_sources)]
test = Final_df_ML[Final_df_ML['Source'].isin(test_sources)]

rf = RandomForestClassifier(random_state=42)
rf.set_params(**best_grid_HR)

# Test side: keep a full-column copy for ranking, plus features and target.
test_copy = test.copy()
test_data = test[model_columns]
X_test = test_data.drop('Mapping_result', axis=1)
y_test = test_data[['Mapping_result']].values.ravel()

# Training side: downsample negatives per source, then split features/target.
final_train_set = train.groupby('Source').apply(select_true_and_false).reset_index(drop=True)
train_dataonly = final_train_set[model_columns]
X_train = train_dataonly.drop('Mapping_result', axis=1)
y_train = train_dataonly[['Mapping_result']].values.ravel()

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1]

# Rank the candidate pairs of each source, highest probability first.
test_copy['probability'] = y_pred_proba
test_copy['rank'] = test_copy.groupby('Source')['probability'].rank(ascending=False)
positive = test_copy[test_copy['Mapping_result'] == 1]

# Share of positive pairs ranked within the top k of their source.
n_positive = len(positive)
hit_rates = {k: len(positive[positive['rank'] <= k]) / n_positive for k in (30, 20, 10, 5)}
top30_HR, top20_HR, top10_HR, top5_HR = (hit_rates[k] for k in (30, 20, 10, 5))

# Mean reciprocal rank over the best-ranked positive pair of each source.
max_ranks = positive.groupby('Source')['rank'].idxmin()
result_df = positive.loc[max_ranks]
result_df['MRR'] = 1 / result_df['rank']
MRR = sum(result_df['MRR']) / len(result_df)
print(top30_HR, top20_HR, top10_HR, top5_HR, MRR)