In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pathlib
import os
from dotenv import load_dotenv

In [2]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
path_env

PosixPath('/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/.env')

In [3]:
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = api_key

In [4]:
#!#pip freeze > /export/usuarios01/clpindado/TFM/requirements

#langchain==0.1.17 langchain-community==0.0.36 langchain-core==0.2.1 langchain-huggingface==0.0.1 langchain-text-splitters==0.0.1

In [5]:
!python --version

Python 3.10.11


## GPTCache

In [6]:
#!pip install gptcache

In [7]:
# get the content(only question) form the prompt to cache
def get_msg_func(data, **_):
    return data.get("messages")[-1].content

In [8]:
from langchain_huggingface import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="meta-llama/Meta-Llama-3-8B",
    device_map="auto",
    task="text-generation",
    pipeline_kwargs={
        "max_new_tokens": 1000,
        "top_k": 50,
        "temperature": 0.1,
        "do_sample": True
    },
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
from gptcache import cache
from gptcache.embedding import Onnx
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation


onnx = Onnx()
cache_base = CacheBase('sqlite')
vector_base = VectorBase('faiss', dimension=onnx.dimension)
data_manager = get_data_manager(cache_base, vector_base)
cache.init(
    pre_embedding_func=get_msg_func,
    embedding_func=onnx.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
    )
cache.set_openai_key()



In [10]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader

text_splitter =  RecursiveCharacterTextSplitter(chunk_overlap=500, chunk_size=2000)

In [11]:
# Original Rosie corpora
path_orig_corpus_es = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_strict_v2.0_es_compiled_documents_lang.parquet"
path_orig_corpus_en = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_strict_v3.0_en_compiled_documents_lang.parquet"

# Proccessed Rosie corpora
path_df_processed = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/df_0.1.parquet"

# Path save csv for LangChain index
path_to_index = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/to_index"

# Path models
model_path = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/LDA/rosie_0.1_100"

In [12]:
# Create index
test_en = pathlib.Path(path_to_index) / f"EN_{4}.csv"
test_es = pathlib.Path(path_to_index) / f"EN_{36}.csv"

loader_en = CSVLoader(
    file_path=test_en,
    source_column="text",
    #csv_args={
    #    "fieldnames": ["doc_id", "text", "url"],
    #}
)

data_en = loader_en.load()

In [13]:
df = pd.read_csv(test_en)
df.columns

Index(['text'], dtype='object')

In [14]:
doc = loader_en.load()[0]
doc

Document(page_content="text: Acute Kidney Injury Condition Basics What is acute kidney injury? Acute kidney injury (which used to be called acute renal failure) means that your kidneys have suddenly stopped working normally. Your kidneys remove waste products and help balance water and salt and other minerals (electrolytes) in your blood. When your kidneys stop working, waste products, fluids, and electrolytes build up in your body. This can cause problems that can be deadly. What causes it? Acute kidney injury has three main causes: - A sudden, serious drop in blood flow to the kidneys. Heavy blood loss, an injury, or a bad infection called sepsis can reduce blood flow to the kidneys. Not enough fluid in the body (dehydration) also can harm the kidneys. - Damage from some poisons, infections, and medicines such as ibuprofen. People who have serious, long-term health problems, such as chronic kidney disease, are more likely to have a kidney problem from medicines. Injury can also be ca

In [15]:
from langchain.prompts import PromptTemplate

custom_prompt = PromptTemplate(
    input_variables=["text"],
    template="""
    You are a highly intelligent assistant. Given the following text, generate insightful and relevant question and answer pairs.

    Text:
    {text}

    For each question, provide a detailed and concise answer based on the text.
    """
)

In [18]:
custom_prompt = chain.llm_chain.prompt
print(custom_prompt)
custom_prompt_mod = PromptTemplate(
    input_variables=["text"],
    template= custom_prompt.template + " Do not provide additional commentary and do not wrap your response in Markdown formatting. Return RAW, VALID JSON."
)
custom_prompt_mod

input_variables=['text'] template='You are a smart assistant designed to help high school teachers come up with reading comprehension questions.\nGiven a piece of text, you must come up with a question and answer pair that can be used to test a student\'s reading comprehension abilities.\nWhen coming up with this question/answer pair, you must respond in the following format:\n```\n{{\n    "question": "$YOUR_QUESTION_HERE",\n    "answer": "$THE_ANSWER_HERE"\n}}\n```\n\nEverything between the ``` must be valid json.\n\nPlease come up with a question/answer pair, in the specified JSON format, for the following text:\n----------------\n{text}'


PromptTemplate(input_variables=['text'], template='You are a smart assistant designed to help high school teachers come up with reading comprehension questions.\nGiven a piece of text, you must come up with a question and answer pair that can be used to test a student\'s reading comprehension abilities.\nWhen coming up with this question/answer pair, you must respond in the following format:\n```\n{{\n    "question": "$YOUR_QUESTION_HERE",\n    "answer": "$THE_ANSWER_HERE"\n}}\n```\n\nEverything between the ``` must be valid json.\n\nPlease come up with a question/answer pair, in the specified JSON format, for the following text:\n----------------\n{text} Do not provide additional commentary and do not wrap your response in Markdown formatting. Return RAW, VALID JSON.')

In [None]:
print(chain.llm_chain.prompt)

In [19]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from gptcache.adapter.langchain_models import LangChainChat

#chat = ChatOpenAI(temperature=0) # using the following code to cache with gptcache
chat = LangChainChat(chat=ChatOpenAI(temperature=0))

#chain = QAGenerationChain.from_llm(chat, text_splitter=text_splitter)
chain = QAGenerationChain.from_llm(llm, text_splitter=text_splitter, prompt =custom_prompt_mod)
#print(chain.llm_chain.prompt)
qa = chain.invoke(doc.page_content)
qa

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


> [0;32m/home/lbartolome/.local/lib/python3.10/site-packages/langchain/chains/qa_generation/base.py[0m(77)[0;36m_call[0;34m()[0m
[0;32m     74 [0;31m            [0;34m[[0m[0;34m{[0m[0;34m"text"[0m[0;34m:[0m [0md[0m[0;34m.[0m[0mpage_content[0m[0;34m}[0m [0;32mfor[0m [0md[0m [0;32min[0m [0mdocs[0m[0;34m][0m[0;34m,[0m [0mrun_manager[0m[0;34m=[0m[0mrun_manager[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     75 [0;31m        )
[0m[0;32m     76 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 77 [0;31m        [0mqa[0m [0;34m=[0m [0;34m[[0m[0mjson[0m[0;34m.[0m[0mloads[0m[0;34m([0m[0mres[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m.[0m[0mtext[0m[0;34m)[0m [0;32mfor[0m [0mres[0m [0;32min[0m [0mresults[0m[0;34m.[0m[0mgenerations[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     78 [0;31m        [0;3

ipdb>  results


LLMResult(generations=[[Generation(text='You are a smart assistant designed to help high school teachers come up with reading comprehension questions.\nGiven a piece of text, you must come up with a question and answer pair that can be used to test a student\'s reading comprehension abilities.\nWhen coming up with this question/answer pair, you must respond in the following format:\n```\n{\n    "question": "$YOUR_QUESTION_HERE",\n    "answer": "$THE_ANSWER_HERE"\n}\n```\n\nEverything between the ``` must be valid json.\n\nPlease come up with a question/answer pair, in the specified JSON format, for the following text:\n----------------\ntext: Acute Kidney Injury Condition Basics What is acute kidney injury? Acute kidney injury (which used to be called acute renal failure) means that your kidneys have suddenly stopped working normally. Your kidneys remove waste products and help balance water and salt and other minerals (electrolytes) in your blood. When your kidneys stop working, waste

ipdb>  results[0]


*** TypeError: 'LLMResult' object is not subscriptable


ipdb>  results.generations


[[Generation(text='You are a smart assistant designed to help high school teachers come up with reading comprehension questions.\nGiven a piece of text, you must come up with a question and answer pair that can be used to test a student\'s reading comprehension abilities.\nWhen coming up with this question/answer pair, you must respond in the following format:\n```\n{\n    "question": "$YOUR_QUESTION_HERE",\n    "answer": "$THE_ANSWER_HERE"\n}\n```\n\nEverything between the ``` must be valid json.\n\nPlease come up with a question/answer pair, in the specified JSON format, for the following text:\n----------------\ntext: Acute Kidney Injury Condition Basics What is acute kidney injury? Acute kidney injury (which used to be called acute renal failure) means that your kidneys have suddenly stopped working normally. Your kidneys remove waste products and help balance water and salt and other minerals (electrolytes) in your blood. When your kidneys stop working, waste products, fluids, and

ipdb>  results.generations[0]


[Generation(text='You are a smart assistant designed to help high school teachers come up with reading comprehension questions.\nGiven a piece of text, you must come up with a question and answer pair that can be used to test a student\'s reading comprehension abilities.\nWhen coming up with this question/answer pair, you must respond in the following format:\n```\n{\n    "question": "$YOUR_QUESTION_HERE",\n    "answer": "$THE_ANSWER_HERE"\n}\n```\n\nEverything between the ``` must be valid json.\n\nPlease come up with a question/answer pair, in the specified JSON format, for the following text:\n----------------\ntext: Acute Kidney Injury Condition Basics What is acute kidney injury? Acute kidney injury (which used to be called acute renal failure) means that your kidneys have suddenly stopped working normally. Your kidneys remove waste products and help balance water and salt and other minerals (electrolytes) in your blood. When your kidneys stop working, waste products, fluids, and 

ipdb>  type(results.generations[0])


<class 'list'>


ipdb>  type(results.generations[0][0])


<class 'langchain_core.outputs.generation.Generation'>


ipdb>  results.generations[0][0].keys


*** AttributeError: 'Generation' object has no attribute 'keys'


ipdb>  results.generations[0][0].keys()


*** AttributeError: 'Generation' object has no attribute 'keys'


ipdb>  results.generations[0][0]


Generation(text='You are a smart assistant designed to help high school teachers come up with reading comprehension questions.\nGiven a piece of text, you must come up with a question and answer pair that can be used to test a student\'s reading comprehension abilities.\nWhen coming up with this question/answer pair, you must respond in the following format:\n```\n{\n    "question": "$YOUR_QUESTION_HERE",\n    "answer": "$THE_ANSWER_HERE"\n}\n```\n\nEverything between the ``` must be valid json.\n\nPlease come up with a question/answer pair, in the specified JSON format, for the following text:\n----------------\ntext: Acute Kidney Injury Condition Basics What is acute kidney injury? Acute kidney injury (which used to be called acute renal failure) means that your kidneys have suddenly stopped working normally. Your kidneys remove waste products and help balance water and salt and other minerals (electrolytes) in your blood. When your kidneys stop working, waste products, fluids, and e

ipdb>  results.generations[0][0].text


'You are a smart assistant designed to help high school teachers come up with reading comprehension questions.\nGiven a piece of text, you must come up with a question and answer pair that can be used to test a student\'s reading comprehension abilities.\nWhen coming up with this question/answer pair, you must respond in the following format:\n```\n{\n    "question": "$YOUR_QUESTION_HERE",\n    "answer": "$THE_ANSWER_HERE"\n}\n```\n\nEverything between the ``` must be valid json.\n\nPlease come up with a question/answer pair, in the specified JSON format, for the following text:\n----------------\ntext: Acute Kidney Injury Condition Basics What is acute kidney injury? Acute kidney injury (which used to be called acute renal failure) means that your kidneys have suddenly stopped working normally. Your kidneys remove waste products and help balance water and salt and other minerals (electrolytes) in your blood. When your kidneys stop working, waste products, fluids, and electrolytes buil

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [35]:
qa

[{'question': 'What are the three main causes of acute kidney injury?',
  'answer': 'The three main causes of acute kidney injury are a sudden drop in blood flow to the kidneys, damage from poisons, infections, and medicines, and a sudden blockage that stops urine from flowing out of the kidneys.'},
 {'question': 'What are the three main causes of acute kidney injury?',
  'answer': 'The three main causes of acute kidney injury are a sudden drop in blood flow to the kidneys, damage from poisons, infections, and medicines, and a sudden blockage that stops urine from flowing out of the kidneys.'},
 {'question': 'What are some of the treatments that a doctor may use to help restore blood flow to the kidneys?',
  'answer': 'A doctor may need to restore blood flow to the kidneys, stop any medicines that may be causing the problem, or remove or bypass a blockage in the urinary tract.'}]

In [29]:
doc

Document(page_content='text: May also be called: Acute Lymphocytic Leukemia; ALL Acute lymphoblastic leukemia is a type of cancer that causes the body to make too many abnormal immature white blood cells (called lymphoblasts). More to Know Normally, white blood cells (WBCs) help fight infection and protect the body against disease. With leukemia, WBCs turn cancerous and multiply when they shouldn\'t, resulting in too many abnormal WBCs, which then interfere with organ function. In acute lymphoblastic leukemia (ALL), too many lymphoblasts are made. These cells are abnormal and can\'t mature into normal white blood cells. Abnormal cells build up, and fewer healthy cells are made, leading to serious complications. Doctors don\'t know what causes acute lymphoblastic leukemia, but it affects about 75% of kids with leukemia. ALL is called "acute" because it tends to get worse quickly if left untreated. Symptoms include fatigue, fever, bone pain, night sweats, recurrent infections, and easy b

In [None]:
#######

In [11]:
from langchain_huggingface import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="meta-llama/Meta-Llama-3-8B",
    device_map="auto",
    task="text-generation",
    pipeline_kwargs={
        "max_new_tokens": 100,
        "top_k": 50,
        "temperature": 0.1,
        "do_sample": True
    },
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [30]:
from langchain_community.document_loaders import TextLoader
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=1000)
loader = TextLoader("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/src/contradictions/ejemplo.txt", encoding="utf-8")
doc = loader.load()

In [31]:
texts = text_splitter.split_documents(doc)
texts[0]

Document(page_content='BOE.es - BOA-d-1991-90001 Decreto Legislativo 1/1991, de 19 de febrero de la Diputación General de Aragón, por el que se aprueba el texto refundido de la Ley de Ordenación de la Función Pública de la Comunidad Autónoma de Aragón.\\nBOA-d-1991-90001\\nDocumento BOA-d-1991-90001\\nDecreto Legislativo 1/1991, de 19 de febrero de la Diputación General de Aragón, por el que se aprueba el texto refundido de la Ley de Ordenación de la Función Pública de la Comunidad Autónoma de Aragón.Ver texto consolidado\\nPublicado en:\\n«BOA» núm. 25, de 1 de marzo de 1991, páginas 667 a 677 (11 págs.)\\nDepartamento:\\nComunidad Autónoma de Aragón\\nReferencia:\\nBOA-d-1991-90001\\nOtros formatos:\\nTexto\\nTEXTO ORIGINAL\\nPor Ley 2/1991, de 4 de enero, de las Cortes de Aragón, se modifican numerosos preceptos de la Ley 1/1986, de 20 de febrero, de Medidas para la Ordenación de la Función Pública de la Comunidad Autónoma de Aragón, aprobando la nueva redacción de aquéllos por los 

In [32]:
chain2 = QAGenerationChain.from_llm(llm=llm, text_splitter=text_splitter)
qa = chain.invoke(texts[0].page_content)
print(qa)

{'text': 'BOE.es - BOA-d-1991-90001 Decreto Legislativo 1/1991, de 19 de febrero de la Diputación General de Aragón, por el que se aprueba el texto refundido de la Ley de Ordenación de la Función Pública de la Comunidad Autónoma de Aragón.\\nBOA-d-1991-90001\\nDocumento BOA-d-1991-90001\\nDecreto Legislativo 1/1991, de 19 de febrero de la Diputación General de Aragón, por el que se aprueba el texto refundido de la Ley de Ordenación de la Función Pública de la Comunidad Autónoma de Aragón.Ver texto consolidado\\nPublicado en:\\n«BOA» núm. 25, de 1 de marzo de 1991, páginas 667 a 677 (11 págs.)\\nDepartamento:\\nComunidad Autónoma de Aragón\\nReferencia:\\nBOA-d-1991-90001\\nOtros formatos:\\nTexto\\nTEXTO ORIGINAL\\nPor Ley 2/1991, de 4 de enero, de las Cortes de Aragón, se modifican numerosos preceptos de la Ley 1/1986, de 20 de febrero, de Medidas para la Ordenación de la Función Pública de la Comunidad Autónoma de Aragón, aprobando la nueva redacción de aquéllos por los motivos que s