In [1]:
import pdb
import pathlib
import pandas as pd
import numpy as np
from scipy import sparse
from qa_metrics.prompt_llm import CloseLLM
import os
from dotenv import load_dotenv
import time
from gensim import corpora
from IPython.display import display
import json

### OPEN AI / Prompts to use

In [2]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")
gpt_model = CloseLLM()
gpt_model.set_openai_api_key(api_key)

### Paths

In [3]:
model_path = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/POLI/rosie_1_20")
path_corpus_en = model_path / "train_data" / "corpus_EN.txt"
path_corpus_es = model_path / "train_data" / "corpus_ES.txt"
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/translated/df_1.parquet")
path_orig_en = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_en_tr.parquet")
path_orig_es = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_es_tr.parquet")

### Auxiliary functions

In [4]:
def get_doc_top_tpcs(doc_distr, topn=2):
    sorted_tpc_indices = np.argsort(doc_distr)[::-1]
    top = sorted_tpc_indices[:topn].tolist()
    top_weight = [(k, doc_distr[k]) for k in top]
    return top_weight

In [5]:
def get_most_representative_per_tpc(mat, topn=10):
    # Find the most representative document for each topic based on a matrix mat
    top_docs_per_topic = []
    
    mat_ = mat.copy()
    
    mat_[mat_ > 0.8] = 0

    for doc_distr in mat_.T:
        sorted_docs_indices = np.argsort(doc_distr)[::-1]
        top = sorted_docs_indices[:topn].tolist()
        top_docs_per_topic.append(top)
    return top_docs_per_topic

In [6]:
def get_most_representative_per_tpc(mat, topn=10):
    # Find the most representative document for each topic based on a matrix mat
    top_docs_per_topic = []

    for doc_distr in mat.T:
        sorted_docs_indices = np.argsort(doc_distr)[::-1]
        top = sorted_docs_indices[:topn].tolist()
        top_docs_per_topic.append(top)
    return top_docs_per_topic

In [7]:
def load_prompt_template(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()
    return file_contents

In [8]:
prompt_template = load_prompt_template("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/src/mapping/promt.txt")

### Read data source

In [9]:
raw = pd.read_parquet(path_source)
raw

Unnamed: 0,id_preproc,lemmas,lemmas_tr,doc_id,text,text_tr,lang
0,0,order avoid future allergic_reaction range vom...,evitar futuro reacción_alérgico vómito urticar...,EN_607595_99219-9,In order to avoid future allergic reactions – ...,"Para evitar futuras reacciones alérgicas, que ...",EN
1,1,author_affiliations shinyi_wu phd university s...,author_afiliations shinyi_wu phd university_of...,EN_183633_46512-40,"Author Affiliations: Shinyi Wu, PhD, Universit...","Author Afiliations: Shinyi Wu, PhD, University...",EN
2,2,review_feb good doctor,reviewed_on feb médico,EN_854328_122182-27,"Reviewed on Feb 24, 2023: He is a very good do...","Reviewed on Feb 24, 2023: Es un médico muy bueno.",EN
3,3,remove morning remove water balloon place syri...,quitar mañana quitar sacar agua globo coloquir...,EN_1361170_297168-6,"Removing the catheter:\n- In the morning, remo...","Quitar el catéter:\n- Por la mañana, quitar el...",EN
4,4,risk factor child diagnose mis_c age year old ...,factor_riesgo niño diagnosticado soler año eda...,EN_1091308_196292-20,Risk factors: Children diagnosed with MIS-C ar...,Factores de riesgo: Los niños diagnosticados c...,EN
...,...,...,...,...,...,...,...
1062898,1062898,compromiso hepático hepatoesplenomegalia alter...,liver involvement_hepatosplenomegaly liver pro...,ES_34067_11078-56,"El compromiso hepático está dado, principalmen...",Liver involvement is mainly due to hepatosplen...,ES
1062899,1062899,glioma vía óptico tipo glioma_grado crecimient...,optical_glioma low grade_glioma slow_growth oc...,ES_491927_64077-5,El glioma de la vía óptica es un tipo de gliom...,Optical glioma is a type of low-grade glioma (...,ES
1062900,1062900,gustar_donación deducible_impuesto vanguardia_...,tax_deductible donation_cut edge_research medi...,ES_408122_50072-0,¿Te gustaría hacer una donación deducible de i...,Would you like to make a tax-deductible donati...,ES
1062901,1062901,licencia_alquiler previo archivado encontrar v...,archive prior rental_license document verify d...,ES_497507_4521-2,Si no hay una licencia de alquiler previa arch...,If there is no archived prior rental license: ...,ES


In [10]:
df_en = pd.read_parquet(path_orig_en)
df_en

Unnamed: 0,passage_id,passage,title,url,lang,tr_text
0,0-0,COVID-19 and Telehealth Info for New Moms: The...,,https://newmomhealth.com/,eng_Latn,COVID-19 y Telehealth Info for New Moms: La en...
1,0-1,"Health care teams are offering prenatal, postp...",,https://newmomhealth.com/,eng_Latn,Los equipos de atención médica están ofreciend...
2,0-2,What does support and care look like for you: ...,,https://newmomhealth.com/,eng_Latn,¿Cómo es el apoyo y el cuidado para usted: Ust...
3,0-3,Mama stories: Listening to the stories and exp...,,https://newmomhealth.com/,eng_Latn,Historias de mamá: Escuchar las historias y ex...
4,0-4,Meeting new parents and finding your “village”...,,https://newmomhealth.com/,eng_Latn,Conocer a nuevos padres y encontrar tu “aldea”...
...,...,...,...,...,...,...
1426618,308563-4,Whole slide images generated by the BIT can be...,Services,https://www.nationwidechildrens.org/research/a...,eng_Latn,Las imágenes de diapositivas completas generad...
1426619,308563-5,"As part of the Biopathology Center, our team c...",Services,https://www.nationwidechildrens.org/research/a...,eng_Latn,"Como parte del Centro de Biopatología, ahora n..."
1426620,308567-0,Mission Statement: The Advanced Competency in ...,Learn More About the Advanced Competency in Pe...,https://www.nationwidechildrens.org/for-medica...,eng_Latn,Declaración de la misión: La Competencia Avanz...
1426621,308568-0,The Advanced Competency in Medical Education f...,Learn More About Our Advanced Competency in Me...,https://www.nationwidechildrens.org/for-medica...,eng_Latn,La Competencia Avanzada en Educación Médica pa...


In [11]:
df_es = pd.read_parquet(path_orig_es)
df_es

Unnamed: 0,passage_id,passage,title,url,lang,tr_text
0,1-6,Shop Log In Create an Account For Ophthalmolog...,Relatos de Pacientes,https://www.aao.org/salud-ocular/relatos-de-pa...,spa_Latn,Shop Log In Create an Account For Ophthalmolog...
1,2-6,Shop Log In Create an Account For Ophthalmolog...,Noticias,https://www.aao.org/salud-ocular/noticias-lista,spa_Latn,Shop Log In Create an Account For Ophthalmolog...
2,2-7,"Apenas SEP 28, 2022 por Brandon Johnson, MD Ae...",Noticias,https://www.aao.org/salud-ocular/noticias-lista,spa_Latn,"Just SEP 28, 2022 by Brandon Johnson, MD Aetna..."
3,2-8,La primera alternativa a la inyección mensual ...,Noticias,https://www.aao.org/salud-ocular/noticias-lista,spa_Latn,The first alternative to monthly injection for...
4,2-9,La mayoría recurren a los anteojos o a lentes ...,Noticias,https://www.aao.org/salud-ocular/noticias-lista,spa_Latn,Most resort to eyeglasses or special contact l...
...,...,...,...,...,...,...
1064784,10273-129,Cuando se ha iniciado la fisuración en el horm...,El Eurocódigo 2 y la evaluación de estructuras...,https://www.elsevier.es/es-revista-hormigon-ac...,spa_Latn,When cracking has been initiated in concrete b...
1064785,10273-130,La figura 8 representa un esquema de la capaci...,El Eurocódigo 2 y la evaluación de estructuras...,https://www.elsevier.es/es-revista-hormigon-ac...,spa_Latn,Figure 8 represents an outline of the carrying...
1064786,10273-131,Figura 8. Esquema de la respuesta a momento fl...,El Eurocódigo 2 y la evaluación de estructuras...,https://www.elsevier.es/es-revista-hormigon-ac...,spa_Latn,Figure 8. Outline of the moment-flector respon...
1064787,10273-132,Si se está procediendo a la evaluación de la e...,El Eurocódigo 2 y la evaluación de estructuras...,https://www.elsevier.es/es-revista-hormigon-ac...,spa_Latn,"If the structure is being evaluated, an inspec..."


### Read corpus

In [12]:
path_corpus = model_path / "train_data" / "corpus_EN.txt"
with path_corpus.open("r", encoding="utf-8") as f:
    lines = [line for line in f.readlines()]
corpus = [line.rsplit(" EN ")[1].strip().split() for line in lines]

ids = [line.split(" EN ")[0] for line in lines]
df = pd.DataFrame({"lemmas": [" ".join(doc) for doc in corpus]})
df["doc_id"] = ids
df["len"] = df['lemmas'].apply(lambda x: len(x.split()))

In [13]:
thetas = sparse.load_npz(model_path / "train_data" / "thetas_EN.npz")

In [14]:
thetas.shape

(2456016, 20)

In [15]:
# Dataframe with the English documents used for training and their top two topics
df["thetas"] = list(thetas.toarray())
df_en = df[df['doc_id'].str.startswith("EN")].copy()
df_en.loc[:, "top_k"] = df_en["thetas"].apply(get_doc_top_tpcs)
df_en

Unnamed: 0,lemmas,doc_id,len,thetas,top_k
0,order avoid future allergic_reaction range vom...,EN_607595_99219-9,24,"[0.0, 0.9795918464660645, 0.0, 0.0, 0.0, 0.0, ...","[(1, 0.9795918464660645), (8, 0.02040816284716..."
1,author_affiliations shinyi_wu phd university s...,EN_183633_46512-40,18,"[0.0, 0.1428571492433548, 0.0, 0.7142857313156...","[(3, 0.7142857313156128), (14, 0.1428571492433..."
2,review_feb good doctor,EN_854328_122182-27,3,"[0.22093023359775543, 0.0, 0.0, 0.0, 0.0116279...","[(15, 0.569767415523529), (0, 0.22093023359775..."
3,remove morning remove water balloon place syri...,EN_1361170_297168-6,41,"[0.0, 0.03448275849223137, 0.0, 0.0, 0.0, 0.10...","[(11, 0.3448275923728943), (17, 0.172413796186..."
4,risk factor child diagnose mis_c age year old ...,EN_1091308_196292-20,17,"[0.0, 0.0, 0.0, 0.23728813230991364, 0.0169491...","[(16, 0.6779661178588867), (3, 0.2372881323099..."
...,...,...,...,...,...
1393108,overview photograph presence tumor vertebrae t...,EN_1292789_287135-0,11,"[0.032258063554763794, 0.0, 0.0, 0.0, 0.0, 0.0...","[(19, 0.8709677457809448), (18, 0.032258063554..."
1393109,pain bad day esi information technology begin ...,EN_771214_115470-28,16,"[0.10891088843345642, 0.009900989942252636, 0....","[(16, 0.3316831588745117), (13, 0.207920789718..."
1393110,ask weaver parent promote healthy body image c...,EN_595076_97282-3,108,"[0.03846153989434242, 0.5769230723381042, 0.11...","[(1, 0.5769230723381042), (7, 0.11538461595773..."
1393111,choi_peter s meyerson_matthew targeted genomic...,EN_629755_104139-9,9,"[0.0, 0.4938271641731262, 0.12345679104328156,...","[(1, 0.4938271641731262), (10, 0.2716049253940..."


In [16]:
# Save each topic's documents into a different file

In [17]:
# Get topic keys
with open(model_path / "mallet_output" / "keys_EN.txt", 'r') as file:
    lines = file.readlines()
topic_en_keys = [line.strip() for line in lines]

In [18]:
df_en_raw = df_en.merge(raw, how="inner", on="doc_id")[
    ["doc_id", "id_preproc", "lemmas_x", "text", "len"]]
df_en_raw

Unnamed: 0,doc_id,id_preproc,lemmas_x,text,len
0,EN_607595_99219-9,0,order avoid future allergic_reaction range vom...,In order to avoid future allergic reactions – ...,24
1,EN_183633_46512-40,1,author_affiliations shinyi_wu phd university s...,"Author Affiliations: Shinyi Wu, PhD, Universit...",18
2,EN_854328_122182-27,2,review_feb good doctor,"Reviewed on Feb 24, 2023: He is a very good do...",3
3,EN_1361170_297168-6,3,remove morning remove water balloon place syri...,"Removing the catheter:\n- In the morning, remo...",41
4,EN_1091308_196292-20,4,risk factor child diagnose mis_c age year old ...,Risk factors: Children diagnosed with MIS-C ar...,17
...,...,...,...,...,...
1393108,EN_1292789_287135-0,1393108,overview photograph presence tumor vertebrae t...,Overview: This photograph shows the presence o...,11
1393109,EN_771214_115470-28,1393109,pain bad day esi information technology begin ...,Your pain may become worse for two to three da...,16
1393110,EN_595076_97282-3,1393110,ask weaver parent promote healthy body image c...,We asked Dr. Weaver how parents can promote he...,108
1393111,EN_629755_104139-9,1393111,choi_peter s meyerson_matthew targeted genomic...,"2014: Choi Peter S, Meyerson Matthew: Targeted...",9


In [19]:
responses = []
most_repr = get_most_representative_per_tpc(thetas.toarray()[:len(df_en_raw),:], topn=6)
print(most_repr)
for topic in range(len(topic_keys)):
    #print(f"Topic {topic}: {topic_keys[topic]}")
    #print(most_repr)
    most_repr_docs = [df_en_raw[df_en_raw.id_preproc == id].text.values.tolist()[0] for id in most_repr[topic][3:]]
    #print(most_repr_docs)
    time.sleep(1)
    this_tpc_promt = prompt_template.format(
        topic_keys[topic],
        *most_repr_docs
    )
    llm_response = gpt_model.prompt_gpt(
        prompt=this_tpc_promt, model_engine='gpt-3.5-turbo', temperature=0.1, max_tokens=500
    )  
    label, add, rationale = llm_response.split(" - ")
    
    responses.append(
        [topic_keys[topic], "\n".join(most_repr_docs), label, add, rationale]
    )

[[954902, 385335, 174244, 1210649, 1024977, 1874], [954902, 385335, 1179174, 1031471, 332534, 222868], [954902, 385335, 1037520, 665703, 635570, 654648], [385335, 954902, 1054278, 837441, 1264877, 1194370], [385335, 954902, 965034, 498321, 801186, 668619], [954902, 385335, 504774, 372058, 1366435, 220141], [954902, 385335, 143261, 205466, 1347130, 148693], [954902, 385335, 1297858, 512923, 131507, 1269145], [954902, 385335, 1050884, 548810, 863725, 1155226], [954902, 385335, 1291201, 606039, 925532, 842832], [954902, 385335, 583030, 971822, 1096558, 469067], [954902, 385335, 301908, 372347, 192803, 1392030], [954902, 385335, 522964, 660420, 76098, 940982], [385335, 954902, 739634, 1287287, 780941, 694414], [385335, 954902, 1172571, 180458, 112507, 74994], [385335, 954902, 152392, 955528, 152884, 428533], [954902, 385335, 1083400, 1085711, 765334, 1244572], [385335, 954902, 504067, 991310, 1193656, 1196572], [954902, 385335, 860176, 630064, 1268635, 72956], [385335, 954902, 924140, 8268

NameError: name 'topic_keys' is not defined

In [None]:
responses_df_en = pd.DataFrame(responses, columns=["topic", "most_repr_docs", "label", "add", "rationale"])

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)  # Prevents truncation of column content
#display(responses_df_en[responses_df_en["add"] == "False"])
display(responses_df_en)