## A Large Language Model-based tool to facilitate data harmonization: individual NLP models used to align variables across cohort studies

In [1]:
#****************************************
# MIT License
# Copyright (c) 2025 Zexu Li, Suraj P. Prabhu, Jinying Chen
#  
# author(s): Zexu Li, Suraj P. Prabhu, Jinying Chen, Boston University Chobanian & Avedisian School of Medicine
# date: 2025-7-7
# ver: 1.0
# 
# This code was written to support data analysis for the Data Harmonization Using Natural Language 
# Processing (NLP harmonization) project and the 2025 paper published in PLOS One.
# The code is for research use only, and is provided as it is.
# 

## Section 1. packages importing and installation

In [None]:
import itertools
import os
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModel 
#pip install -U sentence-transformers
#pip install transformers==4.18.0
#conda update --all
#conda install xlrd
#pip install fuzzywuzzy
#nltk.download('punkt')
#nltk.download('stopwords')
#pip install python-Levenshtein

In [None]:
#import library for Fuzzy match
#!pip install fuzzywuzzy
#!pip install python-Levenshtein
#!pip install --upgrade pip

import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
#!pip install keybert
from keybert import KeyBERT
# Shared KeyBERT instance; extract_words() calls it to pull key phrases out of long derivation rules.
kw_model = KeyBERT()

#!pip install adjustText
from adjustText import adjust_text

## Section 2. Data Dictionary and Truth Map Reading (Input files, example can be found in example_inputs folder)
The aim of this example is to find, for each EU variable, its alignment among all JP variables. Source: EU_all, Target: JP_all, Truth_table: ground truth for evaluation.
1. 'JP_all_ML1111.csv' contains all the variables from JP.
2. 'EU_all_ML1111.csv' contains all the variables from EU.
3. 'Truth_table_1115.csv' contains the true alignments. These are manual alignments used only to evaluate the methods; such a file will not exist for other datasets.

In [None]:
# Folder that holds the three input CSV files; replace the placeholder with a real path before running.
datadir = "[path to input data]"

In [None]:
# Paths are built with os.path.join so that datadir works with or without a trailing separator.

# Target variables. Columns: Sheet_Def, Domain, Variable, Label, Type, Codes, Rule for derivation
JP_all = pd.read_csv(os.path.join(datadir, 'JP_all_ML1111.csv'))
# Source variables. Columns: Sheet_Def, ADS Name, Variable, Label, Definition (derived Vars), Type, CRF Question, Label_CRF
EU_all = pd.read_csv(os.path.join(datadir, 'EU_all_ML1111.csv'))
# Truth map. Columns: Variable_EU, Variable_JP
Truth_table = pd.read_csv(os.path.join(datadir, 'Truth_table_1115.csv'))

# Row count of the raw target table (taken before any de-duplication).
top_n = len(JP_all)

## Section 3. Data dictionary Data preprocessing
1. Extract key words from the rule for derivation.
2. Add derived measurements based on the data dictionary.

In [None]:
def get_len(x):
    '''Count the words in a cell.

    Every punctuation character is first turned into a space, then the text is
    split on whitespace. Null cells count as 0 words.
    '''
    if pd.isnull(x):
        return 0
    # Map each punctuation mark to a blank so that "a,b" is counted as two words
    punctuation_to_space = {ord(mark): ' ' for mark in string.punctuation}
    return len(str(x).translate(punctuation_to_space).split())
    
def validate_null(x):
    '''Return True when the cell is null (as judged by pd.isnull), otherwise False.'''
    return True if pd.isnull(x) else False
    
def combine_JP_deriv_info(x):
    '''Merge the 'Codes' and 'Rule for derivation' cells of a row into one derivation text.

    Returns NaN when both are null, the single available value (as str) when only
    one is present, that value once when both are equal, and otherwise
    "<Codes>; <Rule for derivation>".
    '''
    codes = x['Codes']
    rule = x['Rule for derivation']
    codes_missing = pd.isnull(codes)
    rule_missing = pd.isnull(rule)

    if codes_missing and rule_missing:
        return np.nan
    if codes_missing:
        return str(rule)
    if rule_missing:
        return str(codes)
    # Both present: avoid repeating identical text
    if codes == rule:
        return str(rule)
    return str(codes) + '; ' + str(rule)

def extract_words(x):
    '''Shorten a long derivation rule to its key phrases.

    Rows whose 'deriv_info_len' is not above 20 words keep 'deriv_info' unchanged.
    Longer ones are passed to the module-level KeyBERT model (kw_model) and the
    three returned 5-gram phrases are concatenated, each followed by '. '.
    '''
    is_long = x['deriv_info_len'] > 20 #word limit above which key words are extracted
    if not is_long:
        return x['deriv_info']

    keyword_list = kw_model.extract_keywords(
        x['deriv_info'],
        keyphrase_ngram_range=(5, 5),
        stop_words='english',
        use_maxsum=True,
        nr_candidates=20,
        top_n=3,
    )
    # Each entry's first element is the phrase text
    return ''.join(entry[0] + '. ' for entry in keyword_list)

def label_keywords_combine(x):
    '''Build the text used for matching from 'Label' and 'deriv_info_key_words'.

    The label alone is returned when the key words are null or identical to the
    label; otherwise the result is "<Label>. <deriv_info_key_words>".
    '''
    label = x['Label']
    key_words = x['deriv_info_key_words']
    if pd.isnull(key_words) or label == key_words:
        return label
    return str(label) + '. ' + str(key_words)


In [None]:
#JP_all preprocessing 
# .copy() detaches the de-duplicated frame from the raw one, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
JP_all = JP_all.drop_duplicates(subset=['Variable']).copy() #Drop rows that repeat a Variable name
JP_all['deriv_info'] = JP_all.apply(combine_JP_deriv_info,axis = 1)  #Merge Codes and Rule for derivation into one text
JP_all['deriv_info_null'] = JP_all['deriv_info'].apply(validate_null) #Flag rows without a derivation rule
JP_all['deriv_info_len'] = JP_all['deriv_info'].apply(get_len) #Word count of the derivation rule
JP_all['Label_len'] = JP_all['Label'].apply(get_len) #Word count of the label
JP_all['deriv_info_key_words'] = JP_all.apply(extract_words,axis = 1) #Key words replace the derivation rule when it is too long
JP_all['label_keywords'] = JP_all.apply(label_keywords_combine, axis = 1) #Label combined with the derivation key words

In [None]:
#EU_all preprocessing 
# .copy() detaches the de-duplicated frame from the raw one, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
EU_all = EU_all.drop_duplicates(subset=['Variable']).copy() #Drop rows that repeat a Variable name
EU_all['deriv_info'] = EU_all['Definition (derived Vars)']  #The EU derivation rule is stored in a single column
EU_all['deriv_info_null'] = EU_all['deriv_info'].apply(validate_null)  #Flag rows without a derivation rule
EU_all['deriv_info_len'] = EU_all['deriv_info'].apply(get_len) #Word count of the derivation rule
EU_all['Label_len'] = EU_all['Label'].apply(get_len) #Word count of the label
EU_all['deriv_info_key_words'] = EU_all.apply(extract_words,axis = 1) #Key words replace the derivation rule when it is too long
EU_all['label_keywords'] = EU_all.apply(label_keywords_combine, axis = 1) #Label combined with the derivation key words

## Section 4. Fuzzy Match Class
Fuzzy match algorithms built on the fuzzywuzzy package: they compute the similarity between two strings using fuzzywuzzy scorers.
The algorithm takes 5 parameters as input: source_str_list, target_str_list, top_n, source_var_names, target_var_names
1. source_str_list: In this example, the EU label/sheet_def/deriv_info strings, as a list
2. source_var_names: In this example, the EU variable names corresponding to the EU label/sheet_def/deriv_info strings, as a list
3. target_str_list: In this example, the JP label/sheet_def/deriv_info strings, as a list
4. target_var_names: In this example, the JP variable names corresponding to the JP label/sheet_def/deriv_info strings, as a list
5. top_n: Number of top similar variables to keep in the result dataset. Must be smaller than or equal to len(target_str_list)

Return format: Target_Variable, similar_var_#, similar_value_#
1. Target_Variable: Variable names from the source data dictionary. Duplicate variables are dropped; variables with no label or definition are dropped.
2. similar_var_#: Variable from the target data dictionary, ranked by the similarity score between Target_Variable and similar_var_#. '#' stands for the rank number, starting at 0.
3. similar_value_#: Similarity score for similar_var_#. This score is calculated from the similarity between the source variable definition and the target variable definition.


In [None]:
class Fuzzy_Match_Similarity:
    '''Rank target strings against every source string with a fuzzywuzzy scorer.

    Parameters
    ----------
    source_str_list : list of str
        Texts describing the source variables.
    target_str_list : list of str
        Texts describing the target variables.
    top_n : int
        Number of best-scoring targets kept per source; must not exceed
        len(target_str_list).
    source_var_names : list
        Variable names aligned position-by-position with source_str_list.
    target_var_names : list
        Variable names aligned position-by-position with target_str_list.
    '''

    # Names of the fuzzywuzzy.fuzz scorers that results() accepts.
    SUPPORTED_METHODS = (
        'token_sort_ratio',
        'ratio',
        'partial_ratio',
        'token_set_ratio',
        'partial_token_set_ratio',
        'partial_token_sort_ratio',
    )

    def __init__(self,source_str_list,target_str_list,top_n,source_var_names,target_var_names):
        self.source_str_list = source_str_list
        self.target_str_list = target_str_list
        self.n = top_n
        # Historical attribute names: "target_name" holds the SOURCE variable names
        # (the variables we look up), "findin_name" the names we search in.
        self.target_name = source_var_names
        self.findin_name = target_var_names

    def _nlp_resources(self):
        '''Return (stop word set, stemmer), creating them on first use only.'''
        if not hasattr(self, '_stop_words'):
            self._stop_words = set(stopwords.words('english'))
            self._stemmer = PorterStemmer()
        return self._stop_words, self._stemmer

    def text_preprocessing(self,text):
        '''Normalise one text: lowercase, tokenize, keep alphabetic non-stop-word tokens, stem, re-join with spaces.'''
        stop_words, stemmer = self._nlp_resources()
        tokens = word_tokenize(text.lower())
        # isalpha() drops punctuation and any token containing digits
        kept = [token for token in tokens if token.isalpha() and token not in stop_words]
        return ' '.join(stemmer.stem(token) for token in kept)

    def result_df(self):
        '''Build the empty result frame: Target_Variable plus similar_var_i / similar_value_i for i in 0..top_n-1.'''
        col_name = ['Target_Variable']
        for i in range(self.n):
            col_name.append('similar_var_'+str(i))
            col_name.append('similar_value_'+str(i))
        return pd.DataFrame(columns=col_name)

    def results(self,method):
        '''Score every source text against every target text and keep the top_n targets.

        Parameters
        ----------
        method : str
            One of SUPPORTED_METHODS (the name of a fuzzywuzzy.fuzz scorer).

        Returns
        -------
        pandas.DataFrame
            One row per source variable, in the column layout of result_df(),
            with targets ordered by descending score.

        Raises
        ------
        ValueError
            If method is not supported, or top_n exceeds the number of targets.
        '''
        # Fail before any work is done instead of printing and crashing mid-loop
        if method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Method Not Included: {method!r}. Choose one of {', '.join(self.SUPPORTED_METHODS)}."
            )
        if self.n > len(self.target_str_list):
            raise ValueError(
                f"top_n ({self.n}) must be smaller than or equal to the number of target strings ({len(self.target_str_list)})."
            )
        scorer = getattr(fuzz, method)

        target_list = [self.text_preprocessing(text) for text in self.source_str_list]
        findin_list = [self.text_preprocessing(text) for text in self.target_str_list]

        rows = []
        for num1, text1 in enumerate(target_list):
            scores = [scorer(text1, text2) for text2 in findin_list]
            # sorted() is stable, so equally scored targets keep their input order
            ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:self.n]
            row = [self.target_name[num1]]
            for position, score in ranked:
                row.append(self.findin_name[position])
                row.append(score)
            rows.append(row)

        # Build the frame once rather than appending row by row
        return pd.DataFrame(rows, columns=self.result_df().columns)
            
        



## Section 5. Semantic Textual Similarity Class
Get the semantic textual similarity between two strings using the corresponding transformer model.
Textual similarity is computed as cosine similarity.

The algorithm takes 6 parameters as input: str_list1, str_list2, model, top_n, var_name_list1, var_name_list2
1. str_list1: In this example, the EU label/sheet_def/deriv_info strings, as a list
2. var_name_list1: In this example, the EU variable names corresponding to the EU label/sheet_def/deriv_info strings, as a list
3. str_list2: In this example, the JP label/sheet_def/deriv_info strings, as a list
4. var_name_list2: In this example, the JP variable names corresponding to the JP label/sheet_def/deriv_info strings, as a list
5. model: The name of the transformer model you want to use. Options are: "sentence-transformers/all-mpnet-base-v2", "intfloat/e5-large-v2", "sentence-transformers/all-MiniLM-L12-v2"
6. top_n: Number of top similar variables to keep in the result dataset. Must be smaller than or equal to len(str_list2)

Return format: Target_Variable, similar_var_#, similar_value_#
1. Target_Variable: Variable names from the source data dictionary. Duplicate variables are dropped; variables with no label or definition are dropped.
2. similar_var_#: Variable from the target data dictionary, ranked by the similarity score between Target_Variable and similar_var_#. '#' stands for the rank number, starting at 0.
3. similar_value_#: Similarity score for similar_var_#. This score is calculated from the similarity between the source variable definition and the target variable definition.

In [None]:
class Semantic_Textual_Similarity:
    '''Rank target strings against every source string with a transformer model.

    Parameters
    ----------
    str_list1 : list of str
        Texts describing the source variables.
    str_list2 : list of str
        Texts describing the target variables.
    model : str
        Hugging Face model name. 'sentence-transformers/stsb-roberta-large' selects
        the cross-encoder path; any other name is embedded and compared by cosine
        similarity.
    top_n : int
        Number of best-scoring targets kept per source; must not exceed len(str_list2).
    var_name_list1, var_name_list2 : list
        Variable names aligned position-by-position with str_list1 / str_list2.
    '''

    def __init__(self,str_list1,str_list2,model,top_n,var_name_list1,var_name_list2):
        self.str_list1 =str_list1
        self.str_list2 =str_list2
        self.name1 = var_name_list1
        self.name2 = var_name_list2
        self.model = model#"sentence-transformers/multi-qa-mpnet-base-dot-v1"
        self.n = top_n

    def _load_encoder(self):
        '''Return (tokenizer, model) for self.model, loading them only once per model name.'''
        cached = getattr(self, '_encoder', None)
        if cached is None or cached[0] != self.model:
            tokenizer = AutoTokenizer.from_pretrained(self.model)
            model = AutoModel.from_pretrained(self.model)
            cached = (self.model, tokenizer, model)
            self._encoder = cached
        return cached[1], cached[2]

    def get_embeddings(self, strs_list):
        '''Embed a list of strings: tokenize (truncated to 512 tokens), run the model, mean-pool the token embeddings.'''
        #Mean Pooling - Take attention mask into account for correct averaging
        def mean_pooling(model_output, attention_mask):
            token_embeddings = model_output[0] #First element of model_output contains all token embeddings
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
            # clamp avoids a division by zero for an all-padding row
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            return sum_embeddings / sum_mask
        #Reuse the tokenizer/model across calls instead of reloading them each time
        tokenizer, model = self._load_encoder()
        #Tokenize sentences
        encoded_input = tokenizer(strs_list, padding=True, truncation=True, max_length=512, return_tensors='pt')
        print('Encode finished')#max_length = 128
        #Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)
            print('Output finished')
        #Perform pooling. In this case, mean pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        print('Pooling finished')
        return sentence_embeddings

    def get_similarity(self,vector1,vector2):
        '''Cosine similarity between two embedding vectors'''
        return 1 - cosine(vector1, vector2)

    def result_df(self):
        '''Build the empty result frame: Target_Variable plus similar_var_i / similar_value_i for i in 0..top_n-1.'''
        col_name = ['Target_Variable']
        for i in range(self.n):
            col_name.append('similar_var_'+str(i))
            col_name.append('similar_value_'+str(i))
        return pd.DataFrame(columns=col_name)

    def result(self):
        '''Score every source text against every target text and keep the top_n targets.

        Returns
        -------
        pandas.DataFrame
            One row per source variable, in the column layout of result_df(),
            with targets ordered by descending similarity.

        Raises
        ------
        ValueError
            If top_n exceeds the number of target strings.
        '''
        # Checked first so a bad top_n fails before any model is downloaded or run
        if self.n > len(self.str_list2):
            raise ValueError(
                f"top_n ({self.n}) must be smaller than or equal to the number of target strings ({len(self.str_list2)})."
            )

        rows = []
        if self.model == 'sentence-transformers/stsb-roberta-large':
            # This model name is served by the cross-encoder checkpoint, which scores text pairs directly
            model = CrossEncoder('cross-encoder/stsb-roberta-large')
            for n in range(len(self.str_list1)):
                combinations = list(itertools.product([self.str_list1[n]], self.str_list2))
                scores = model.predict(combinations)
                # argsort is ascending; reverse it and keep the first top_n positions
                best_first = np.argsort(scores)[::-1][:self.n]
                row = [self.name1[n]]
                for position in best_first:
                    row.append(self.name2[position])
                    row.append(scores[position])
                rows.append(row)
        else:
            if self.model in ['intfloat/multilingual-e5-large','intfloat/e5-large-v2']:
                # The e5 models are given the same 'query: ' prefix on both sides
                prefix = 'query: '
                list1_embed = self.get_embeddings([prefix + item for item in self.str_list1])
                list2_embed = self.get_embeddings([prefix + item for item in self.str_list2])
            else:
                list1_embed = self.get_embeddings(self.str_list1)
                list2_embed = self.get_embeddings(self.str_list2)

            for num1, vec1 in enumerate(list1_embed):
                sim_list = [self.get_similarity(vec1, vec2) for vec2 in list2_embed]
                # sorted() is stable, so equally similar targets keep their input order
                ranked = sorted(enumerate(sim_list), key=lambda pair: pair[1], reverse=True)[:self.n]
                row = [self.name1[num1]]
                for position, similarity in ranked:
                    row.append(self.name2[position])
                    row.append(similarity)
                rows.append(row)

        # Build the frame once rather than appending row by row
        return pd.DataFrame(rows, columns=self.result_df().columns)
        

## Section 6. Specify Input columns and Run Algorithms
#### Part 1. Specify which column to use for variables alignment algorithm (input columns).
In this example, three parts can be used for similarity comparison.
1. Sheet definition. Since both data dictionaries have a separate data sheet for each category of data, variables can be aligned based on the sheet definition.
2. Keywords from derivation rules. The derivation rule can be NULL for some variables, so it is combined with the label.
3. Label. This is the definition of each variable and is the main input for variable alignment.

#### Part 2. Run different algorithms with different NLP methods.
Three sets of input columns can be used for similarity: Sheet_Def, label_keywords, Label_CRF

In [None]:
#Part 1
#top_n: number of top similar variables to keep in the result datasets. Must be smaller than or equal to the number of target strings.
#Here every target variable is kept, so each source variable gets a full ranking.
top_n = len(JP_all['Label'])

#To compare on the sheet definition, use this pair as input instead:
#Source_Str1 = list(EU_all['Sheet_Def'].astype(str))
#Target_Str2 = list(JP_all['Sheet_Def'].astype(str))

#To compare on label + derivation-rule key words, use this pair as input instead:
#Source_Str1 = list(EU_all['label_keywords'].astype(str))
#Target_Str2 = list(JP_all['label_keywords'].astype(str))

#Active choice: compare on labels (EU 'Label_CRF' against JP 'Label').
#astype(str) turns missing cells into the text 'nan' rather than dropping them.
Source_Str1 = list(EU_all['Label_CRF'].astype(str))
Target_Str2 = list(JP_all['Label'].astype(str))

#Variable names, position-aligned with Source_Str1 / Target_Str2
source_name1 = list(EU_all['Variable'])
target_name2 = list(JP_all['Variable'])

#Truth map: only the two columns holding the manually aligned variable pairs
Truth_Map = Truth_table[['Variable_EU','Variable_JP']]

In [None]:
%%time
#Part 2
#Fuzzy match scored with fuzzywuzzy's partial_token_sort_ratio

fuzzy1  = Fuzzy_Match_Similarity(Source_Str1,Target_Str2,top_n,source_name1,target_name2)
partial_token_sort_ratio_df = fuzzy1.results('partial_token_sort_ratio')

In [None]:
%%time
#Fuzzy match scored with fuzzywuzzy's token_sort_ratio
fuzzy1 = Fuzzy_Match_Similarity(Source_Str1,Target_Str2,top_n,source_name1,target_name2)
token_sort_ratio_df = fuzzy1.results('token_sort_ratio')

In [None]:
%%time
#Fuzzy match scored with fuzzywuzzy's plain ratio
fuzzy1 = Fuzzy_Match_Similarity(Source_Str1,Target_Str2,top_n,source_name1,target_name2)
ratio_df = fuzzy1.results('ratio')

In [None]:
%%time
#Fuzzy match scored with fuzzywuzzy's token_set_ratio (the non-partial variant)

fuzzy1 = Fuzzy_Match_Similarity(Source_Str1,Target_Str2,top_n,source_name1,target_name2)
token_set_ratio_df = fuzzy1.results('token_set_ratio')

In [None]:
%%time
#Semantic similarity (embeddings + cosine) with the "sentence-transformers/all-mpnet-base-v2" model

STS7 = Semantic_Textual_Similarity(Source_Str1,Target_Str2,"sentence-transformers/all-mpnet-base-v2",top_n,source_name1,target_name2)
all_mpnet_base_v2_df = STS7.result()

In [None]:
%%time
#Semantic similarity (embeddings + cosine) with the "intfloat/e5-large-v2" model; inputs get a 'query: ' prefix inside the class
STS8 = Semantic_Textual_Similarity(Source_Str1,Target_Str2,"intfloat/e5-large-v2",top_n,source_name1,target_name2)
e5_large_v2_df = STS8.result()

In [None]:
%%time
#Semantic similarity (embeddings + cosine) with the "sentence-transformers/all-MiniLM-L12-v2" model
STS10 = Semantic_Textual_Similarity(Source_Str1,Target_Str2,"sentence-transformers/all-MiniLM-L12-v2",top_n,source_name1,target_name2)
All_MiniLM_L12_v2_df = STS10.result()

In [None]:
%%time
#Semantic similarity (embeddings + cosine) with the "FremyCompany/BioLORD-2023" model
STS11 = Semantic_Textual_Similarity(Source_Str1,Target_Str2,"FremyCompany/BioLORD-2023",top_n,source_name1,target_name2)
Biolord2023_df = STS11.result()

## Section 7. Algorithm Evaluation with Truth Map
Three rank-based methods are used for evaluation. The manual-alignment truth map is used only in this evaluation step.
Detailed information can be found here:
https://towardsdatascience.com/ranking-evaluation-metrics-for-recommender-systems-263d0a66ef54
1. Hit ratio
2. Mean reciprocal rank
3. Average precision

In [None]:
# Number of ranked candidates per source variable: one per row of JP_all.
top_n = len(JP_all['Label'])
# Meant to record how hard each variable is to align, based on mean_reciprocal_rank.
# NOTE(review): nothing in the visible cells fills this dictionary - confirm whether it is still needed.
EU_reciprocal_rank = {}

class Rank_base_evaluation:
    '''Rank-based evaluation of an alignment result frame against a manual truth map.

    Parameters
    ----------
    truth_map : pandas.DataFrame
        All pairs of manually aligned variables.
    result_df : pandas.DataFrame
        Ranking frame with a 'Target_Variable' column followed by
        similar_var_i / similar_value_i column pairs (i = 0, 1, ...).
    Target_var_col_name_Truth : str
        Column of truth_map holding the source (looked-up) variable names.
    Findin_var_col_name_Truth : str
        Column of truth_map holding the aligned target variable names.

    The number of ranked candidates is read from result_df itself, so the class
    does not depend on any module-level variable.
    '''

    def __init__(self,truth_map,result_df,Target_var_col_name_Truth,Findin_var_col_name_Truth):
        '''Truth map is all the pairs of manual aligned variables '''
        self.truth = truth_map
        self.Target_var_col_name_Truth =Target_var_col_name_Truth
        self.Findin_var_col_name_Truth = Findin_var_col_name_Truth
        self.result_df = result_df

    def _n_ranked(self):
        '''Count the consecutive similar_var_i / similar_value_i column pairs present in result_df.'''
        available = self.result_df.columns
        count = 0
        while 'similar_var_'+str(count) in available and 'similar_value_'+str(count) in available:
            count += 1
        return count

    def _dicts(self):
        '''Return the Make_dict() output, rebuilt only when truth or result_df is replaced by another object.

        In-place edits of either frame are not detected; call Make_dict() directly in that case.
        '''
        cache = getattr(self, '_dict_cache', None)
        if cache is None or cache[0] is not self.truth or cache[1] is not self.result_df:
            cache = (self.truth, self.result_df, self.Make_dict())
            self._dict_cache = cache
        return cache[2]

    def Make_dict(self):
        '''Convert the truth map and the result frame into dictionaries.

        Returns
        -------
        tuple of four dicts
            Truth_dict        {'Var1': ['Var2', 'Var3'], ...} - true targets per source variable
            Result_dict       {'Var1': [candidate names in column order]}
            Result_dict_value {'Var1': [rank of each candidate]} - tied scores share the best rank
            Result_dict_score {'Var1': [score of each candidate]}
        '''
        n_ranked = self._n_ranked()
        grouped_truth = self.truth.groupby(self.Target_var_col_name_Truth)[self.Findin_var_col_name_Truth].apply(list).reset_index()
        Truth_dict = dict(zip(grouped_truth[self.Target_var_col_name_Truth], grouped_truth[self.Findin_var_col_name_Truth]))
        clo_names = ['similar_var_'+str(k) for k in range(n_ranked)]
        clo_value = ['similar_value_'+str(k) for k in range(n_ranked)]

        # Select the two column groups once instead of once per row
        names_frame = self.result_df[clo_names]
        values_frame = self.result_df[clo_value]

        Result_dict = {}
        Result_dict_value = {}
        Result_dict_score = {}

        for j in range(len(self.result_df)):
            key = self.result_df['Target_Variable'].iloc[j]
            items = list(names_frame.iloc[j].values)
            value = list(values_frame.iloc[j].values)
            # Sort once; the first position of a score is its rank, so ties share a rank
            first_position = {}
            for position, score in enumerate(sorted(value, reverse=True), start=1):
                first_position.setdefault(score, position)
            rank = [first_position[score] for score in value]
            Result_dict[key] = items
            Result_dict_value[key] = rank
            Result_dict_score[key] = value

        return Truth_dict,Result_dict,Result_dict_value,Result_dict_score

    def hit_ratio(self, k):
        """
        Calculate the Hit Ratio (HR) at a given rank k.

        A source variable is a hit when any of its true targets is among its first
        k candidates. The denominator is the number of source variables in the
        truth map. When k exceeds the number of recorded candidates a message is
        printed and None is returned.
        """
        n_ranked = self._n_ranked()
        if k > n_ranked:
            print(f"Only top {n_ranked} similar variable is recorded. Narrow your evaluation rank k.")
            return None

        Truth_dict,Result_dict,Result_dict_value,Result_dict_score = self._dicts()
        total_var = len(Truth_dict)

        correct_predictions = 0
        for Target_var, ranked_items in Result_dict.items():
            truth_items = Truth_dict.get(Target_var, set())
            if any(item in truth_items for item in ranked_items[:k]):
                correct_predictions += 1

        return correct_predictions / total_var

    def mean_reciprocal_rank(self):
        """
        Calculate the Mean Reciprocal Rank (MRR).

        Returns a tuple (reciprocal_ranks, reciprocal_ranks_var, mrr): the reciprocal
        rank of the first true target found for each source variable, the matching
        variable names, and the sum of reciprocal ranks divided by the number of
        source variables in the truth map.
        """
        Truth_dict,Result_dict,Result_dict_value, Result_dict_score = self._dicts()

        total_var = len(Truth_dict)
        reciprocal_ranks = []
        reciprocal_ranks_var = []

        for Target_var, ranked_items in Result_dict.items():
            truth_items = Truth_dict.get(Target_var, set())
            rank_list = Result_dict_value[Target_var]
            for rank, item in enumerate(ranked_items):
                if item in truth_items:
                    # Only the first true target counts for the reciprocal rank
                    reciprocal_ranks.append(1 / rank_list[rank])
                    reciprocal_ranks_var.append(Target_var)
                    break

        return reciprocal_ranks,reciprocal_ranks_var,sum(reciprocal_ranks) / total_var

    def average_precision(self):
        """
        Calculate the Average Precision for each query and return the mean of all queries' Average Precision.

        The mean is taken over the number of source variables in the truth map.
        """
        Truth_dict,Result_dict,Result_dict_value,Result_dict_score = self._dicts()
        total_var = len(Truth_dict)
        average_precisions = []

        for Target_var, ranked_items in Result_dict.items():
            truth_items = Truth_dict.get(Target_var, set())
            rank_list = Result_dict_value[Target_var]
            num_correct_predictions = 0
            precision_sum = 0.0

            for rank, item in enumerate(ranked_items):
                if item in truth_items:
                    num_correct_predictions += 1
                    precision_sum += num_correct_predictions / rank_list[rank]

            # max(..., 1) keeps variables without any true target at 0 instead of dividing by zero
            average_precisions.append(precision_sum / max(len(truth_items), 1))

        return sum(average_precisions) / total_var

In [None]:
#Get evaluation results
# One evaluator per method; all of them share the same truth map and truth column names.
Eva_STS9 = Rank_base_evaluation(Truth_Map,token_set_ratio_df,'Variable_EU','Variable_JP')       # fuzzy token_set_ratio
Eva_STS12 = Rank_base_evaluation(Truth_Map,all_mpnet_base_v2_df,'Variable_EU','Variable_JP')    # all-mpnet-base-v2
Eva_STS13 = Rank_base_evaluation(Truth_Map,e5_large_v2_df,'Variable_EU','Variable_JP')          # e5-large-v2
Eva_STS16 = Rank_base_evaluation(Truth_Map,All_MiniLM_L12_v2_df,'Variable_EU','Variable_JP')    # all-MiniLM-L12-v2
Eva_STS19 = Rank_base_evaluation(Truth_Map,Biolord2023_df,'Variable_EU','Variable_JP')          # BioLORD-2023

In [None]:
#Visulizing Hit Ratio Graph
plt.figure(figsize =(8, 8),dpi = 300)

# Evaluate the hit ratio for k = 1..30, capped at the number of recorded candidates:
# hit_ratio() returns None for a k beyond what was recorded, and None cannot be plotted.
max_k = min(30, top_n)
x_values = list(range(1, max_k + 1))

Eva_STS9_topn_hitrario = [Eva_STS9.hit_ratio(k) for k in x_values]
Eva_STS12_topn_hitrario = [Eva_STS12.hit_ratio(k) for k in x_values]
Eva_STS13_topn_hitrario = [Eva_STS13.hit_ratio(k) for k in x_values]
Eva_STS16_topn_hitrario = [Eva_STS16.hit_ratio(k) for k in x_values]
Eva_STS19_topn_hitrario = [Eva_STS19.hit_ratio(k) for k in x_values]

In [None]:
#Plot points for hit ratio in top 1 to top 31
# Size the figure in the cell that draws it: a figure opened in an earlier cell may
# already have been rendered and closed by the notebook backend. gcf() reuses the
# current figure when one is still open and creates one otherwise.
fig = plt.gcf()
fig.set_size_inches(8, 8)
fig.set_dpi(300)

#Plot points for the hit ratio at every evaluated k (top 1 to top 30)
plt.scatter(x_values, Eva_STS9_topn_hitrario, label='token_set_ratio', color='#8A2BE2', marker='s')
plt.scatter(x_values, Eva_STS12_topn_hitrario, label='all_mpnet_base_v2', color='#7FFF00', marker='X')
plt.scatter(x_values, Eva_STS13_topn_hitrario, label='e5_large_v2', color='#458B00', marker='D')
plt.scatter(x_values, Eva_STS16_topn_hitrario, label='All_MiniLM_L12_v2_df', color='#8B8B00', marker='2')
plt.scatter(x_values, Eva_STS19_topn_hitrario, label='Biolord2023_df', color='#88c999', marker='2')


#Connect the points: red lines for the transformer models, blue for the fuzzy baseline
plt.plot(x_values, Eva_STS12_topn_hitrario, color='red')
plt.plot(x_values, Eva_STS13_topn_hitrario, color='red')
plt.plot(x_values, Eva_STS16_topn_hitrario, color='red')
plt.plot(x_values, Eva_STS19_topn_hitrario, color='red')
plt.plot(x_values, Eva_STS9_topn_hitrario, color='blue')


# Add labels and title
plt.xlabel('Truth Align in TOP 30')
plt.ylabel('Hit Ratio')
plt.title('Hit Ratio for Truth Aligned showed up in TOP 30 Similar')

# Add legend
plt.legend(loc='lower right', title='Models')

# Show the plot
plt.show()

In [None]:
#Visulizing mean_reciprocal_rank and average_precision Graph

plt.figure(figsize=(4, 4), dpi=300)

# Evaluators and their display names, in matching order
evaluators = [Eva_STS9, Eva_STS12, Eva_STS13, Eva_STS16, Eva_STS19]
columns = ['token_set_ratio','all_mpnet_base_v2','e5_large_v2','All_MiniLM_L12_v2','Biolord2023']

# mean_reciprocal_rank() returns a 3-tuple; its last element is the MRR itself
mean_reciprocal_rank = [evaluator.mean_reciprocal_rank()[2] for evaluator in evaluators]
average_precision = [evaluator.average_precision() for evaluator in evaluators]
data = {'mean_reciprocal_rank':mean_reciprocal_rank,'mean_average_precision':average_precision}
Eva_df = pd.DataFrame(data = data,index = columns)

mrr_values = Eva_df['mean_reciprocal_rank']
map_values = Eva_df['mean_average_precision']
plt.scatter(mrr_values, map_values)

# Label every point with its model name
texts = [
    plt.text(x, y, f"{model_name}")
    for model_name, x, y in zip(Eva_df.index, mrr_values, map_values)
]

# Move overlapping labels apart (vertically only) and connect them to their points
adjust_text(texts, only_move={'points':'y', 'texts':'y'}, arrowprops=dict(arrowstyle="->", color='r', lw=0.5))

# Axis labels and title
plt.xlabel('mean_reciprocal_rank')
plt.ylabel('mean_average_precision')
plt.title('Models Performance')

# Show the plot
plt.show()