In [None]:
import logging
import os
import pathlib
from typing import Optional

import dspy
import pandas as pd
from dotenv import load_dotenv
from dspy.datasets import Dataset
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, BootstrapFewShot
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import pandas as pd

In [1]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

NameError: name 'pd' is not defined

In [110]:
llm = dspy.HFClientTGI(model="meta-llama/Meta-Llama-3-8B ", port=8090, url="http://127.0.0.1")
dspy.settings.configure(lm=llm)

In [104]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = api_key

llm = dspy.OpenAI(
    model="gpt-4o", #"gpt-3.5-turbo" , #"gpt-4-0125-preview",  #gpt-4o-2024-05-13, #"gpt-4-1106-preview", # TODO: try turbo-instruct,
    max_tokens=1000)

# Assumes the Weaviate collection has a text key `content`
dspy.settings.configure(lm=llm, trace=[])

In [150]:
import pathlib
import pandas as pd
import numpy as np
from scipy import sparse
from sentence_transformers import SentenceTransformer
import faiss


def create_faiss_index(df, text_column, id_column, model_name="all-mpnet-base-v2", index_file="faiss_index.index"):
    """
    Create a FAISS index from a DataFrame containing text data.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    text_column (str): The name of the column containing text data.
    id_column (str): The name of the column containing unique identifiers for the texts.
    model_name (str): The name of the SentenceTransformer model to use for embeddings.
    index_file (str): The file path to save the FAISS index.

    Returns:
    index: The FAISS index object.
    model: The SentenceTransformer model used for embeddings.
    ids: List of document IDs.
    texts: List of document texts.
    """
    texts = df[text_column].tolist()
    ids = df[id_column].tolist()

    model = SentenceTransformer(model_name, device="cuda")

    # Calculate embeddings for the texts
    embeddings = model.encode(texts, show_progress_bar=False)

    # Create a FAISS index
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)  

    # Normalize embeddings to unit length and add to index
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Save the index to a file
    faiss.write_index(index, index_file)

    return index, model, ids, texts

In [5]:
class ContradictionsDataset(Dataset):

    def __init__(
        self,
        data_fpath: str,
        dev_size: Optional[float] = 0.2,
        test_size: Optional[float] = 0.2,
        input_keys: str = ["answer1", "answer2", "question"],
        seed: Optional[int] = 11235,
        *args,
        **kwargs
    ) -> None:
        """
        fact -> question
        """

        super().__init__(*args, **kwargs)

        self._train = []
        self._dev = []
        self._test = []

        # Read the training data
        train_data = pd.read_excel(pathlib.Path(data_fpath))[
            ["answer1", "answer2", "question", "faith_strict", "faithfulness"]]

        for col in train_data.columns:
            train_data[col] = train_data[col].apply(
                lambda x: str(x).strip("\n\t"))

        train_data["faith_strict"] = train_data["faith_strict"].apply(
            lambda x: int(x))
        train_data["faith_strict"] = train_data["faith_strict"].astype(str)

        train_data["faithfulness"] = train_data["faithfulness"].apply(
            lambda x: int(x))
        train_data["faithfulness"] = train_data["faithfulness"].astype(str)

        train_data = train_data[
            (train_data.faithfulness == "0") |
            (train_data.faithfulness == "1")
        
        ]

        train_data, temp_data = train_test_split(
            train_data, test_size=dev_size + test_size, random_state=seed)
        dev_data, test_data = train_test_split(
            temp_data, test_size=test_size / (dev_size + test_size), random_state=seed)

        self._train = [
            dspy.Example({**row}).with_inputs(*input_keys) for row in self._convert_to_json(train_data)
        ]
        self._dev = [
            dspy.Example({**row}).with_inputs(*input_keys) for row in self._convert_to_json(dev_data)
        ]
        self._test = [
            dspy.Example({**row}).with_inputs(*input_keys) for row in self._convert_to_json(test_data)
        ]

    def _convert_to_json(self, data: pd.DataFrame):
        if data is not None:
            return data.to_dict(orient='records')

In [6]:
class CheckAnswersFaithfulness(dspy.Signature):
    """Verify whether ANSWER1 and ANSWER2 are FAITHFUL (1) to each other or not (0) given QUESTION. If its faithfulness can't be determined, return 2."""

    QUESTION = dspy.InputField()
    ANSWER1 = dspy.InputField()
    ANSWER2 = dspy.InputField()
    faithfulness = dspy.OutputField(
        desc="predicted label (1,0, or 2 only)", prefix="Faithfulness:")
    rationale = dspy.OutputField(desc="explains the relation between ANSWER1 and ANSWER2", prefix="Rationale:")

class ClassifyContradiction(dspy.Signature):
    ("""Classify the contradiction between ANSWER1 and ANSWER2 given QUESTION into: """
    """(0) Discrepancy: The answers might be correct within their respective contexts, byt they offer conflicting guidance or explanations that could lead to confusion. """
    """(1) Strict: The two answers provide directly opposing information."""
    )
    
    QUESTION = dspy.InputField()
    ANSWER1 = dspy.InputField()
    ANSWER2 = dspy.InputField()
    contradiction_type = dspy.OutputField(
        desc="predicted label (0 or 1 only)", prefix="Contradiction_type:")
    rationale = dspy.OutputField(desc="explains the type of contradiction", prefix="Rationale:")

class QACheckerModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.checker = dspy.ChainOfThought(CheckAnswersFaithfulness)

    def process_faithfulness(self, faithfulness):

        try:
            if "0" in faithfulness:
                return 0
            elif "1" in faithfulness:
                return 1
            elif "2" in faithfulness:
                return 2
        except Exception as e:
            print(f"Error: {e}")
            print(f"Faithfulness: {faithfulness}")
            return faithfulness

    def forward(self, answer1, answer2, question):
        response = self.checker(ANSWER1=answer1, ANSWER2=answer2, QUESTION=question)
        print(f"-- -- faithfulness: {response.faithfulness}")
        print(f"-- -- rationale: {response.rationale}")

        return dspy.Prediction(faithfulness=self.process_faithfulness(response.faithfulness), rationale=response.rationale)

class ClassifyContradictionModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.contrad_classifier = dspy.ChainOfThought(ClassifyContradiction)

    def process_contrad_type(self, contrad_type):
        try:
            if "0" in contrad_type:
                return 0
            elif "1" in contrad_type:
                return 1
        except Exception as e:
            print(f"Error: {e}")
            print(f"Contrd_type: {contrad_type}")
            return contrad_type

    def forward(self, answer1, answer2, question):
        response = self.contrad_classifier(ANSWER1=answer1, ANSWER2=answer2, QUESTION=question)
        print(f"-- -- faithfulness: {response.contradiction_type}")
        print(f"-- -- rationale: {response.rationale}")

        return dspy.Prediction(contrad_type=self.process_contrad_type(response.contradiction_type), rationale=response.rationale)

In [7]:
mbd=4
mld=16
ncp=2
mr=5
dev_size=0.25
data_path = "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/src/qa_system/tr_data/contradictions_dtset.xlsx"

def combined_score(example, pred, trace=None):

    pred_faith = pred["faithfulness"]
    ground_faith = int(example["faith_strict"]) if isinstance(example["faith_strict"], str) or isinstance(
        example["faith_strict"], float) else example["faith_strict"]

    return 1 if pred_faith == ground_faith else 0

def combined_score_class(example, pred, trace=None):

    pred_faith = pred["contrad_type"]
    ground_faith = int(example["faithfulness"]) if isinstance(example["faithfulness"], str) or isinstance(
        example["faithfulness"], float) else example["faithfulness"]

    return 1 if pred_faith == ground_faith else 0

In [8]:
dataset = ContradictionsDataset(
    data_fpath=data_path, dev_size=dev_size)


trainset = dataset._train
devset = dataset._dev
testset = dataset._test


config = dict(max_bootstrapped_demos=mbd, max_labeled_demos=mld,
              num_candidate_programs=ncp, max_rounds=mr)
teleprompter = BootstrapFewShotWithRandomSearch(
    metric=combined_score_class, **config)

compiled_pred = teleprompter.compile(
    ClassifyContradictionModule(), trainset=trainset, valset=devset)

Going to sample between 1 and 4 traces per predictor.
Will attempt to train 2 candidate sets.


Average Metric: 3 / 3  (100.0):  18%|███████████████████████████████████████                                                                                                                                                                                | 2/11 [00:01<00:12,  1.36s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: Both answers suggest introducing solid food around the same age (4 to 6 months), but they offer different types of food recommendations. ANSWER 1 suggests mixing breast milk with solid food, while ANSWER 2 suggests giving rice and cow's milk before introducing solid food. The discrepancy lies in the type of food recommended, which could lead to confusion for someone seeking guidance.
-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 suggests that it is safe to use a sauna or hot tub during pregnancy for a short period, while ANSWER 2 states that using a sauna or hot tub during pregnancy should be avoided due to the risk of birth defects. These statements cannot both be true simultaneously, making this a strict contradiction.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 pr

Average Metric: 5 / 6  (83.3):  55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 6/11 [00:01<00:01,  4.58it/s]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 provides a specific weight limit (10 pounds) that pregnant women should avoid lifting, while ANSWER 2 suggests that lifting moderate weights can be safe depending on the woman's fitness level. These answers offer conflicting guidance that could lead to confusion, but they are not directly opposing; rather, they reflect different perspectives or guidelines on the same issue.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers provide conflicting guidance on the amount of food intake during pregnancy. ANSWER 1 suggests that one should "eat for two," implying a significant increase in food intake, while ANSWER 2 emphasizes the importance of a balanced diet without necessarily increasing the quantity of food. Both answers might be correct within their respective contexts, but they offer different approaches that could lead to confusion.
-- --

Average Metric: 7 / 8  (87.5):  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 7/11 [00:02<00:01,  3.21it/s]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that breastfeeding can reduce the risk of breast cancer for the mother, while ANSWER 2 claims that breastfeeding has no significant impact on the mother's risk of developing breast cancer. These statements cannot both be true simultaneously, making this a strict contradiction.
-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 suggests that pregnant women should eat liver for its high vitamin A content, while ANSWER 2 warns that high vitamin A intake from liver can be harmful during pregnancy. These statements cannot both be true simultaneously, leading to a strict contradiction.


Average Metric: 8 / 11  (72.7): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  3.89it/s]
  df = df.applymap(truncate_cell)


-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 suggests that raw fish is typically eaten during pregnancy, while ANSWER 2 states that pregnant women eat only healthy foods like lentils, ghee, nuts, and milk, which implies that raw fish is not included. This creates a strict contradiction as the guidance on what foods are typically eaten during pregnancy is directly conflicting.
-- -- faithfulness: Contradiction_type: 1
-- -- rationale: ANSWER 1 states that women must not cry out or scream during labour, which is a strict directive. ANSWER 2, on the other hand, suggests that a woman's manner during labour depends on her personality and that some may moan, grunt, or even get hysterical, implying that making noise is acceptable. These two answers provide directly opposing information regarding whether it is permissible for women to scream during labour.
-- -- faithfulness: Contradiction_type: 1
-- -- rationale: Th

Average Metric: 8 / 11  (72.7): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 516.92it/s]


Average Metric: 8 / 11  (72.7%)
Score: 72.73 for set: [16]
Scores so far: [72.73, 72.73]
Best score: 72.73


  4%|██████████▋                                                                                                                                                                                                                                            | 1/23 [00:01<00:26,  1.21s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 suggests that caffeine intake can contribute to miscarriage, which could be true at higher levels of consumption. ANSWER 2 specifies that moderate caffeine intake does not significantly increase the risk, implying that the risk might be associated with higher levels of caffeine. The conflicting guidance could lead to confusion about what constitutes a safe level of caffeine intake during pregnancy.


  9%|█████████████████████▍                                                                                                                                                                                                                                 | 2/23 [00:02<00:22,  1.06s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that high levels of stress can cause preterm labor, while ANSWER 2 asserts that there is no direct link between stress levels and preterm labor. These statements cannot both be true simultaneously, making this a strict contradiction.


 13%|████████████████████████████████▏                                                                                                                                                                                                                      | 3/23 [00:03<00:22,  1.12s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that drinking milk during pregnancy can increase the baby's risk of lactose intolerance, while ANSWER 2 asserts that there is no evidence linking maternal milk consumption to lactose intolerance in the baby. This is a strict contradiction as they cannot both be true simultaneously.


 17%|██████████████████████████████████████████▉                                                                                                                                                                                                            | 4/23 [00:04<00:19,  1.05s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that spicy foods can induce labor in late pregnancy, while ANSWER 2 asserts that spicy foods do not induce labor and dismisses it as a common myth. This is a strict contradiction as they cannot both be true simultaneously.


 22%|█████████████████████████████████████████████████████▋                                                                                                                                                                                                 | 5/23 [00:05<00:18,  1.04s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 provides a general caution against high-intensity workouts during pregnancy, while ANSWER 2 offers a conditional statement that such workouts can be safe for women who were already accustomed to them before pregnancy. This discrepancy could lead to confusion but does not strictly oppose each other.


 26%|████████████████████████████████████████████████████████████████▍                                                                                                                                                                                      | 6/23 [00:06<00:18,  1.08s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: Both answers provide different statistics regarding the percentage of pregnancies that end in miscarriage. ANSWER 1 states that around 15% of pregnancies end in miscarriage, while ANSWER 2 suggests that pregnancy loss occurs in up to 1 in every 4 pregnancies (which is approximately 25%). These answers might be correct within their respective contexts or sources, but they offer conflicting information that could lead to confusion.


 30%|███████████████████████████████████████████████████████████████████████████▏                                                                                                                                                                           | 7/23 [00:08<00:20,  1.28s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that pregnant women should avoid eating seafood entirely, while ANSWER 2 states that certain types of seafood, like salmon and sardines, are safe and beneficial during pregnancy. These statements cannot both be true simultaneously, leading to a strict contradiction.


 35%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                 | 8/23 [00:09<00:17,  1.15s/it]


-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that secondhand smoke increases the risk of low birth weight, while ANSWER 2 claims that secondhand smoke has no proven effect on birth weight. These statements cannot both be true simultaneously, making this a strict contradiction.


  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                               

Bootstrapped 4 full traces after 1 examples in round 4.


Average Metric: 1 / 2  (50.0):   9%|███████████████████▋                                                                                                                                                                                                    | 1/11 [00:01<00:11,  1.11s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 provides a specific recommendation of drinking at least 8 glasses of water daily, while ANSWER 2 suggests that fluid needs can vary and women should drink according to their thirst. This discrepancy could lead to confusion but does not present directly opposing information.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance that could lead to confusion. ANSWER 1 provides a specific weight limit (10 pounds) that pregnant women should avoid lifting, while ANSWER 2 suggests that lifting moderate weights is generally safe, depending on the woman's fitness level. These statements do not directly oppose each other but offer different perspectives on what is considered safe, leading to a discrepancy.


Average Metric: 2 / 3  (66.7):  27%|██████████████████████████████████████████████████████████▉                                                                                                                                                             | 3/11 [00:01<00:03,  2.59it/s]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 states that a midwife or doctor is present at childbirth, which is generally true in a medical or hospital setting. ANSWER 2 describes a scenario where most women give birth at home without professional help, often assisted by a grandmother, which could be true in certain cultural or socio-economic contexts. These answers do not directly oppose each other but could lead to confusion due to their differing contexts.


Average Metric: 4 / 5  (80.0):  36%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                                         | 4/11 [00:02<00:03,  2.11it/s]

-- -- faithfulness: QUESTION: Is it safe to use a sauna or hot tub while pregnant?

ANSWER 1: It’s safe to use a sauna or hot tub during pregnancy as long as it’s for a short period.

ANSWER 2: Using a sauna or hot tub during pregnancy can increase the risk of birth defects and should be avoided.

Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that it is safe to use a sauna or hot tub during pregnancy for a short period, while ANSWER 2 asserts that using a sauna or hot tub during pregnancy can increase the risk of birth defects and should be avoided. These statements cannot both be true simultaneously, making this a strict contradiction.
-- -- faithfulness: QUESTION: Do I need to eat more while I am pregnant to ensure proper nutrition for my baby?

ANSWER 1: During pregnancy, it's important to eat for two to ensure the baby gets enough nutrients.

ANSWER 2: Eating a balanced diet is crucial during pregnancy, but there's no 

Average Metric: 5 / 6  (83.3):  55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 6/11 [00:02<00:01,  3.08it/s]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 suggests that pregnant women should eat liver for its high vitamin A content, implying it is beneficial. In contrast, ANSWER 2 warns that high vitamin A intake from liver can be harmful during pregnancy. These statements cannot both be true simultaneously, making this a strict contradiction.


Average Metric: 6 / 7  (85.7):  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 7/11 [00:02<00:01,  3.15it/s]

-- -- faithfulness: QUESTION: When should solid food be introduced to an infant, and what types of food are recommended?

ANSWER 1: Breast milk is mixed with solid food when the infant is about 4 to 6 months of age.

ANSWER 2: Rice and cows milk are given to six month old babies and then solid food.

Contradiction_type: 0
-- -- rationale: The two answers provide conflicting guidance on the introduction of solid foods and types of food recommended, but they might be correct within their respective contexts. ANSWER 1 suggests introducing solid food mixed with breast milk at 4 to 6 months, while ANSWER 2 mentions giving rice and cow's milk at six months before solid food. This discrepancy could lead to confusion but does not strictly contradict each other as they could be interpreted as different approaches or stages in introducing solid foods.


Average Metric: 7 / 8  (87.5):  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 8/11 [00:02<00:00,  3.43it/s]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that breastfeeding can reduce the risk of breast cancer for the mother, while ANSWER 2 asserts that breastfeeding has no significant impact on the mother's risk of developing breast cancer. These statements cannot both be true simultaneously, making this a strict contradiction.


Average Metric: 7 / 9  (77.8):  82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 9/11 [00:03<00:00,  3.30it/s]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance that could lead to confusion. ANSWER 1 suggests that losing a small amount of weight during the first trimester is normal, while ANSWER 2 implies that any weight loss during the first trimester should be a cause for concern. These statements do not directly oppose each other but provide conflicting advice, making this a discrepancy.


Average Metric: 8 / 10  (80.0):  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 10/11 [00:03<00:00,  3.55it/s]

-- -- faithfulness: QUESTION: What foods can I eat during pregnancy?

ANSWER 1: Typical food eaten during the pregnancy is raw fish.

ANSWER 2: Pregnant women eat healthy food only lentils, ghee, nuts, and milk.

Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 suggests that raw fish is a typical food eaten during pregnancy, while ANSWER 2 lists lentils, ghee, nuts, and milk as the only healthy foods for pregnant women. This discrepancy could lead to confusion about what foods are appropriate during pregnancy, but it does not strictly contradict as they could be referring to different dietary practices or cultural norms.


Average Metric: 9 / 11  (81.8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:03<00:00,  2.98it/s]


-- -- faithfulness: QUESTION: Can I scream during labour?

ANSWER 1: The women must not cry out or scream during labour.

ANSWER 2: Women’s manner/demeanour during labour depends on her personality, Some will moan and grunt during labour and others may get hysterical, Modesty remains a controlling factor when delivery occurs.

Contradiction_type: 0
-- -- rationale: The two answers offer conflicting guidance but are not directly opposing. ANSWER 1 provides a strict directive that women must not cry out or scream during labor, while ANSWER 2 suggests that the behavior during labor depends on the woman's personality and that some may scream or get hysterical. While they offer different perspectives, they do not directly contradict each other in a strict sense, making this a discrepancy.
Average Metric: 9 / 11  (81.8%)
Score: 81.82 for set: [16]
New best score: 81.82 for seed -1
Scores so far: [72.73, 72.73, 81.82]
Best score: 81.82
Average of max per entry across top 1 scores: 0.818181818

  4%|██████████▋                                                                                                                                                                                                                                            | 1/23 [00:01<00:28,  1.31s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing it to a child, while ANSWER 2 claims that women who carry the faulty X chromosome are unlikely to pass it on to their children. These statements cannot both be true simultaneously, making this a strict contradiction.


  9%|█████████████████████▍                                                                                                                                                                                                                                 | 2/23 [00:02<00:26,  1.28s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 advises against hot baths to prevent overheating, while ANSWER 2 suggests that moderate warm baths are safe. The discrepancy arises from the difference between "hot" and "moderate warm" baths, which could lead to confusion but are not directly opposing.


 13%|████████████████████████████████▏                                                                                                                                                                                                                      | 3/23 [00:03<00:26,  1.30s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 emphasizes the risk of toxoplasmosis, suggesting avoidance as a precaution. ANSWER 2 acknowledges the risk but suggests that with proper hygiene, having a cat can be safe. The guidance offered is conflicting but not directly opposing, as both answers recognize the risk of toxoplasmosis but differ in their recommendations on how to manage it.


 17%|██████████████████████████████████████████▉                                                                                                                                                                                                            | 4/23 [00:04<00:22,  1.16s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 specifies that pregnant women should sleep on their left side to avoid compressing major blood vessels, while ANSWER 2 states that any sleeping position is safe for pregnant women. These statements cannot both be true simultaneously, making this a strict contradiction.


 22%|█████████████████████████████████████████████████████▋                                                                                                                                                                                                 | 5/23 [00:05<00:20,  1.14s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers provide conflicting guidance. ANSWER 1 suggests that prenatal vitamins should be taken daily throughout pregnancy, while ANSWER 2 implies that they are optional and should only be taken if recommended by a healthcare provider. Both answers could be correct within their respective contexts, but they offer different advice that could lead to confusion.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 suggests that caffeine intake can contribute to miscarriage, which could be true at higher levels of consumption. ANSWER 2 specifies that moderate caffeine intake does not significantly increase the risk, implying that the risk might be associated with higher levels of caffeine. The conflicting guidance could lead to confusion about what constitutes a safe level of caffeine intake during pregnancy.
-- -- faithfulness: Contradiction_type: 1
-

 43%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                           | 10/23 [00:07<00:09,  1.40it/s]


-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that a high-protein diet is necessary for all pregnant women, while ANSWER 2 suggests that a balanced diet with moderate protein intake is sufficient for most pregnant women. These statements cannot both be true simultaneously, leading to a strict contradiction.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 provides a general caution against high-intensity workouts during pregnancy, while ANSWER 2 offers a conditional statement that such workouts can be safe for women who were already accustomed to them before pregnancy. This discrepancy could lead to confusion but does not strictly oppose each other.
-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that drinking milk during pregnancy can increa

  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                               

Bootstrapped 4 full traces after 1 examples in round 4.


Average Metric: 2 / 2  (100.0):   9%|███████████████████▌                                                                                                                                                                                                   | 1/11 [00:01<00:11,  1.16s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts, but they offer conflicting guidance. ANSWER 1 provides a specific recommendation of drinking at least 8 glasses of water daily, while ANSWER 2 suggests that fluid needs can vary and women should drink according to their thirst. Both answers aim to ensure adequate hydration but differ in their approach, leading to potential confusion.
-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that it is safe to use a sauna or hot tub during pregnancy as long as it is for a short period, while ANSWER 2 claims that using a sauna or hot tub during pregnancy can increase the risk of birth defects and should be avoided. These statements cannot both be true simultaneously, making this a strict contradiction.


Average Metric: 3 / 4  (75.0):  36%|██████████████████████████████████████████████████████████████████████████████▌                                                                                                                                         | 4/11 [00:01<00:02,  3.12it/s]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 states that a midwife or doctor is present at childbirth, which is generally true in a medical or hospital setting. ANSWER 2 describes a scenario where most women give birth at home without professional help, often assisted by a grandmother, which could be true in certain cultural or socio-economic contexts. The guidance offered is conflicting but not directly opposing, as both answers recognize different childbirth practices based on context.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts, but they offer conflicting guidance. ANSWER 1 provides a specific weight limit, suggesting that pregnant women should avoid lifting anything heavier than 10 pounds. ANSWER 2, on the other hand, suggests that lifting moderate weights is generally safe, depending on the woman's fitness level. While t

Average Metric: 6 / 7  (85.7):  55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                  | 6/11 [00:02<00:01,  2.88it/s]

-- -- faithfulness: QUESTION: When should solid food be introduced to an infant, and what types of food are recommended?

ANSWER 1: Breast milk is mixed with solid food when the infant is about 4 to 6 months of age.

ANSWER 2: Rice and cows milk are given to six month old babies and then solid food

Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 suggests introducing solid food mixed with breast milk at 4 to 6 months, while ANSWER 2 suggests giving rice and cow's milk at six months before introducing solid food. The guidance on the types of food and timing is conflicting, but not directly opposing, as both answers agree on introducing solid food around the six-month mark but differ in the specifics of what to introduce and when.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts, but they offer conflicting guidance. ANSWER 1

Average Metric: 8 / 9  (88.9):  82%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                       | 9/11 [00:02<00:00,  4.87it/s]

-- -- faithfulness: QUESTION: Is it safe to eat liver during pregnancy?

ANSWER 1: Pregnant women should eat liver for its high vitamin A content.

ANSWER 2: High vitamin A intake from liver can be harmful during pregnancy.

Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 suggests that pregnant women should eat liver because of its high vitamin A content, while ANSWER 2 warns that high vitamin A intake from liver can be harmful during pregnancy. These statements cannot both be true simultaneously, making this a strict contradiction.


Average Metric: 8 / 10  (80.0):  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 10/11 [00:03<00:00,  2.80it/s]

-- -- faithfulness: QUESTION: Can I scream during labour?

ANSWER 1: The women must not cry out or scream during labour.

ANSWER 2: Women’s manner/demeanour during labour depends on her personality, Some will moan and grunt during labour and others may get hysterical, Modesty remains a controlling factor when delivery occurs.

Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that women must not cry out or scream during labour, implying a strict prohibition. ANSWER 2, on the other hand, suggests that a woman's behavior during labour varies based on her personality, and some may indeed scream or get hysterical. These statements cannot both be true simultaneously, making this a strict contradiction.


Average Metric: 8 / 11  (72.7): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:03<00:00,  2.80it/s]


-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts. ANSWER 1 suggests that losing a small amount of weight during the first trimester can be normal, possibly due to factors like morning sickness. ANSWER 2 implies that weight loss during the first trimester should be a cause for concern, which could be interpreted as a more cautious approach. The guidance offered is conflicting but not directly opposing, as both answers address weight loss but differ in their perspectives on its normalcy and potential concern.
Average Metric: 8 / 11  (72.7%)
Score: 72.73 for set: [16]
Scores so far: [72.73, 72.73, 81.82, 72.73]
Best score: 81.82
Average of max per entry across top 1 scores: 0.8181818181818182
Average of max per entry across top 2 scores: 0.9090909090909091
Average of max per entry across top 3 scores: 0.9090909090909091
Average of max per entry across top 5 scores: 0.9090909090909091
Average of max per entry across to

  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing it to a child, while ANSWER 2 claims that women who carry the faulty X chromosome are unlikely to pass it on to their children. These statements cannot both be true simultaneously, making this a strict contradiction.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: Both answers provide different statistics regarding the percentage of pregnancies that end in miscarriage. ANSWER 1 states that around 15% of pregnancies end in miscarriage, while ANSWER 2 suggests that pregnancy loss occurs in up to 1 in every 4 pregnancies (which is approximately 25%). These answers might be correct within their respective contexts or sources, but they offer conflicting information that could lead to confusion.


 13%|████████████████████████████████▏                                                                                                                                                                                                                      | 3/23 [00:01<00:07,  2.72it/s]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that pregnant women should avoid caffeine completely, while ANSWER 2 states that small amounts of caffeine are generally considered safe during pregnancy. These statements cannot both be true simultaneously, leading to a strict contradiction.


 17%|██████████████████████████████████████████▉                                                                                                                                                                                                            | 4/23 [00:02<00:10,  1.81it/s]


-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that positive CRP results occur during the last half of pregnancy, while ANSWER 2 states that positive CRP results are rare during the last half of pregnancy and more common in the early stages. This is a strict contradiction as they cannot both be true simultaneously.


  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                                                                                               | 0/23 [00:00<?, ?it/s]
  0%|                                                                                                                                               

Bootstrapped 2 full traces after 1 examples in round 4.


Average Metric: 3 / 4  (75.0):  27%|██████████████████████████████████████████████████████████▉                                                                                                                                                             | 3/11 [00:01<00:02,  2.75it/s]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance that could lead to confusion. ANSWER 1 provides a specific weight limit (10 pounds) that pregnant women should avoid lifting, while ANSWER 2 suggests that lifting moderate weights is generally safe, depending on the woman's fitness level. These statements do not directly oppose each other but offer different perspectives on what is considered safe, leading to a discrepancy.
-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that it is safe to use a sauna or hot tub during pregnancy as long as it is for a short period, while ANSWER 2 states that using a sauna or hot tub during pregnancy can increase the risk of birth defects and should be avoided. These statements cannot both be true simultaneously, making this a strict contradiction.
-- -- faithfulness: C

Average Metric: 5 / 6  (83.3):  45%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                     | 5/11 [00:01<00:01,  4.32it/s]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance that could lead to confusion. ANSWER 1 provides a specific recommendation of drinking at least 8 glasses of water daily, while ANSWER 2 suggests that fluid needs can vary and women should drink according to their thirst. These statements do not directly oppose each other but offer different approaches to hydration during pregnancy, leading to a discrepancy.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers provide conflicting guidance on the introduction of solid food to an infant, but they might be correct within their respective contexts. ANSWER 1 suggests introducing solid food mixed with breast milk at 4 to 6 months of age, while ANSWER 2 mentions giving rice and cow's milk to six-month-old babies before introducing solid food. The discrepancy lies in the specific types of food and the timing, which coul

Average Metric: 7 / 8  (87.5):  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 8/11 [00:02<00:00,  3.95it/s]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that breastfeeding can reduce the risk of breast cancer for the mother, while ANSWER 2 claims that breastfeeding has no significant impact on the mother's risk of developing breast cancer. These statements cannot both be true simultaneously, making this a strict contradiction.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance that could lead to confusion. ANSWER 1 suggests that raw fish is a typical food eaten during pregnancy, while ANSWER 2 lists lentils, ghee, nuts, and milk as the foods pregnant women eat, implying a focus on healthy foods. These answers do not directly oppose each other but provide different dietary recommendations, leading to a discrepancy.


Average Metric: 8 / 11  (72.7): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.12it/s]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The answers might be correct within their respective contexts but offer conflicting guidance that could lead to confusion. ANSWER 1 suggests that pregnant women should eat liver for its high vitamin A content, implying it is beneficial. ANSWER 2, on the other hand, warns that high vitamin A intake from liver can be harmful during pregnancy. These statements do not directly oppose each other but provide conflicting advice, making this a discrepancy.
-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 states that women must not cry out or scream during labour, which suggests a strict rule or guideline. ANSWER 2, on the other hand, acknowledges that women's behavior during labor varies based on personality, indicating that some may indeed scream or make noise. This could lead to confusion as one answer implies a strict




In [9]:
from dspy.evaluate import Evaluate

# Set up the evaluator, which can be re-used in your code.
evaluator = Evaluate(devset=testset, num_threads=1, display_progress=True, display_table=50)

In [10]:
evaluator(compiled_pred, metric=combined_score_class)

Average Metric: 1 / 1  (100.0):  11%|████████████████████████                                                                                                                                                                                                | 1/9 [00:01<00:09,  1.15s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that pregnant women should avoid all herbal teas, while ANSWER 2 asserts that some herbal teas are safe and even beneficial during pregnancy. These statements cannot both be true simultaneously, making this a strict contradiction.


Average Metric: 2 / 2  (100.0):  22%|████████████████████████████████████████████████                                                                                                                                                                        | 2/9 [00:02<00:08,  1.22s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that it is important to avoid nuts during pregnancy due to the risk of allergies, while ANSWER 2 asserts that there is no evidence that eating nuts during pregnancy increases the risk of allergies in the baby. These statements cannot both be true simultaneously, making this a strict contradiction.


Average Metric: 2 / 3  (66.7):  33%|████████████████████████████████████████████████████████████████████████▎                                                                                                                                                | 3/9 [00:04<00:08,  1.39s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance that could lead to confusion. ANSWER 1 states that shortness of breath is normal during pregnancy, which can be true for many women due to physiological changes. ANSWER 2 suggests that shortness of breath can indicate a serious problem and should be checked, which is also valid as it could be a symptom of a more serious condition in some cases. These statements do not directly oppose each other but provide different perspectives that could confuse someone seeking advice.


Average Metric: 3 / 4  (75.0):  44%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                        | 4/9 [00:05<00:06,  1.26s/it]

-- -- faithfulness: Contradiction_type: 1
-- -- rationale: The two answers provide directly opposing information. ANSWER 1 states that it is common to experience mild swelling in the ankles and feet during pregnancy, while ANSWER 2 asserts that swelling in the ankles and feet is abnormal and should be checked by a healthcare provider. These statements cannot both be true simultaneously, making this a strict contradiction.


Average Metric: 3 / 5  (60.0):  56%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 5/9 [00:07<00:06,  1.54s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 states that sleeping on your back during pregnancy is unsafe due to restricted blood flow, while ANSWER 2 suggests that it is generally safe but side sleeping is preferred. This discrepancy could lead to confusion, but it does not present directly opposing information. Instead, it highlights a preference and a potential risk without outright denying the other statement.


Average Metric: 3 / 6  (50.0):  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 6/9 [00:08<00:04,  1.64s/it]

-- -- faithfulness: QUESTION: Does the shot decrease bone density for people with disabilities?

ANSWER 1: The shot, which temporarily decreases bone density while in use, is probably not the best option for people with disabilities that affect bone density.

ANSWER 2: People with disabilities that affect bone density may find the shot beneficial.

Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 focuses on the potential negative impact of the shot on bone density, suggesting it may not be the best option for people with disabilities affecting bone density. ANSWER 2, on the other hand, suggests that the shot may still be beneficial for these individuals, possibly considering other factors or benefits. This discrepancy could lead to confusion but does not strictly contradict each other.


Average Metric: 4 / 7  (57.1):  78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 7/9 [00:10<00:03,  1.55s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 suggests that all pregnant women need to take iron supplements, which could be a general recommendation. ANSWER 2, however, specifies that iron supplements are only necessary if the woman is diagnosed with iron deficiency anemia, which is a more specific medical guideline. This discrepancy could lead to confusion but does not strictly contradict each other as they could be interpreted as addressing different situations or levels of medical advice.


Average Metric: 5 / 8  (62.5):  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 8/9 [00:11<00:01,  1.52s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 suggests that consuming fish oil can benefit the baby's brain development, while ANSWER 2 warns against fish oil supplements due to potential risks. These statements do not directly oppose each other but provide conflicting advice that could lead to confusion. Therefore, this is a discrepancy.


Average Metric: 5 / 9  (55.6): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:12<00:00,  1.44s/it]

-- -- faithfulness: Contradiction_type: 0
-- -- rationale: The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 states that cosmetic dental procedures are safe during pregnancy, while ANSWER 2 advises postponing non-essential dental procedures until after pregnancy. This could lead to confusion, but it is not a strict contradiction as both statements could be true depending on the specific circumstances or differing medical opinions.
Average Metric: 5 / 9  (55.6%)





Unnamed: 0,answer1,answer2,question,faith_strict,faithfulness,contrad_type,rationale,combined_score_class
0,Pregnant women should avoid all herbal teas.,Some herbal teas are safe and even beneficial during pregnancy.,Is it safe to drink herbal teas while pregnant?,0,1,1,"The two answers provide directly opposing information. ANSWER 1 states that pregnant women should avoid all herbal teas, while ANSWER 2 asserts that some herbal...",1
1,It's important to avoid nuts during pregnancy due to the risk of allergies.,There is no evidence that eating nuts during pregnancy increases the risk of allergies in the baby.,Should I avoid nuts during pregnancy?,0,1,1,"The two answers provide directly opposing information. ANSWER 1 states that it is important to avoid nuts during pregnancy due to the risk of allergies,...",1
2,It’s normal for pregnant women to experience shortness of breath.,Shortness of breath during pregnancy can indicate a serious problem and should be checked.,Is shortness of breath normal during pregnancy?,0,1,0,The two answers might be correct within their respective contexts but offer conflicting guidance that could lead to confusion. ANSWER 1 states that shortness of...,0
3,"It's common to experience mild swelling during pregnancy, especially in the ankles and feet.",Swelling in the ankles and feet is abnormal during pregnancy and should be checked by a healthcare provider.,Is swelling in the ankles and feet during pregnancy normal?,0,1,1,"The two answers provide directly opposing information. ANSWER 1 states that it is common to experience mild swelling in the ankles and feet during pregnancy,...",1
4,Sleeping on your back during pregnancy is unsafe as it can restrict blood flow.,"Sleeping on your back is generally safe, but side sleeping is preferred.",Is it safe to sleep on my back while pregnant?,0,1,0,The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 states that sleeping on your back during pregnancy is...,0
5,"The shot, which temporarily decreases bone density while in use, is probably not the best option for people with disabilities that affect bone density.",People with disabilities that affect bone density may find the shot beneficial.,Does the shot decrease bone density for people with disabilities?,0,1,0,The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 focuses on the potential negative impact of the shot...,0
6,All pregnant women need to take iron supplements.,Iron supplements are only necessary if the woman is diagnosed with iron deficiency anemia.,Should I take iron supplements during pregnancy?,0,0,0,The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 suggests that all pregnant women need to take iron...,1
7,Consuming a diet high in fish oil during pregnancy can help boost the baby's brain development.,Fish oil supplements are unnecessary during pregnancy and could increase the risk of bleeding.,Is it beneficial for pregnant women to consume fish oil?,0,0,0,The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 suggests that consuming fish oil can benefit the baby's...,1
8,Cosmetic dental procedures are safe during pregnancy.,Non-essential dental procedures should be postponed until after pregnancy.,Can I undergo cosmetic dental procedures while pregnant?,0,1,0,"The two answers might be correct within their respective contexts but offer conflicting guidance. ANSWER 1 states that cosmetic dental procedures are safe during pregnancy,...",0


55.56

In [13]:
import spacy

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def segment_facts(sentence):
    # Process the sentence with spaCy
    doc = nlp(sentence)
    
    # Identify the subject
    subject = None
    for token in doc:
        if token.dep_ == "nsubj":
            subject = token.text
            break

    if not subject:
        # Fallback: if no clear subject, try the first proper noun or noun
        for token in doc:
            if token.pos_ in ("PROPN", "NOUN"):
                subject = token.text
                break

    if not subject:
        # If no subject found, return the original sentence as the only fact
        return [sentence]

    # Initialize a list to store the segmented facts
    segmented_facts = []

    # Find the root verb/action in the sentence
    for token in doc:
        if token.dep_ == "ROOT":
            action = token.text
            break

    # Construct sentences from conjuncts, objects, and other dependencies
    for chunk in doc.noun_chunks:
        if chunk.root.dep_ in ("dobj", "pobj", "attr"):
            segmented_facts.append(f"{subject} {action} {chunk.text}.")

    # Handle compound actions and conjunctions
    for token in doc:
        if token.dep_ == "conj":
            segmented_facts.append(f"{subject} {action} {token.text}.")
        elif token.dep_ == "prep":
            obj = " ".join([child.text for child in token.children])
            segmented_facts.append(f"{subject} {action} {token.text} {obj}.")

    return segmented_facts

# Example sentences to segment
sentences = [
    "Chorioamnionitis: A condition during pregnancy that can cause unexplained fever with uterine tenderness, a high white blood cell count, rapid heart rate in the fetus, rapid heart rate in the woman, and/or foul-smelling vaginal discharge.",
    "Diabetes mellitus is a metabolic disease that causes high blood sugar.",
    "Hypertension is a condition that can lead to heart disease, stroke, and kidney failure."
]

# Process each sentence
for sentence in sentences:
    facts = segment_facts(sentence)
    for fact in facts:
        print(fact)
    print("-" * 40)


that Chorioamnionitis pregnancy.
that Chorioamnionitis unexplained fever.
that Chorioamnionitis uterine tenderness.
that Chorioamnionitis the fetus.
that Chorioamnionitis the woman.
that Chorioamnionitis during pregnancy.
that Chorioamnionitis with tenderness.
that Chorioamnionitis rate.
that Chorioamnionitis in fetus.
that Chorioamnionitis rate.
that Chorioamnionitis in woman.
that Chorioamnionitis discharge.
----------------------------------------
mellitus is a metabolic disease.
mellitus is high blood sugar.
----------------------------------------
Hypertension is a condition.
Hypertension is heart disease.
Hypertension is to disease.
Hypertension is stroke.
----------------------------------------


In [15]:
import spacy
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

nlp = spacy.load("en_core_web_sm")
model_name = "biodatlab/score-claim-identification"
tokenizer_name = "allenai/scibert_scivocab_uncased"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def inference(abstract: str):
    """
    Split an abstract into sentences and perform claim identification.
    """
    if abstract.strip() == "":
        return "Please provide an abstract as an input."
    claims = []
    sents = [sent.text for sent in nlp(abstract).sents]  # a list of sentences
    inputs = tokenizer(
        sents,
        return_tensors="pt",
        truncation=True,
        padding="longest"
    )
    logits = model(**inputs).logits
    preds = logits.argmax(dim=1)  # convert logits to predictions
    claims = [sent for sent, pred in zip(sents, preds) if pred == 1]
    if len(claims) > 0:
        return ".\n".join(claims)
    else:
        return "No claims found from a given abstract."

claims = inference(sentences[0])  # string of claim joining with \n




config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
!export CUDA_VISIBLE_DEVICES=2

In [22]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast
hft = T5TokenizerFast.from_pretrained('varadhbhatnagar/fc-claim-det-T5-base')
hfm = T5ForConditionalGeneration.from_pretrained('varadhbhatnagar/fc-claim-det-T5-base').to("cpu")
row = 'Chorioamnionitis: A condition during pregnancy that can cause unexplained fever with uterine tenderness, a high white blood cell count, rapid heart rate in the fetus, rapid heart rate in the woman, and/or foul-smelling vaginal discharge.'
row2 = "Which children are at risk for X-linked agammaglobulinemia: Women who are carriers have a 1 in 2 chance of passing the faulty X chromosome to a child. This is true for every pregnancy. If a daughter gets the gene, she will likely be a healthy carrier like her mother. If a son gets the gene, he will have X-linked agammaglobulinemia."
tokenized_text = hft.encode(row2, return_tensors="pt")
summary_ids = hfm.generate(tokenized_text,
                                  num_beams=10,
                                  no_repeat_ngram_size=2,
                                  min_length=5,
                                  max_length=100,
                                  early_stopping=True)

output = hft.decode(summary_ids[0], skip_special_tokens=True)


In [23]:
output

'agammaglobulinemia is caused by faulty X chromosomes.'

In [10]:
from transformers import pipeline

classifier = pipeline(
        "text-classification", 
        model="Nithiwat/bert-base_claimbuster"
)

classifier("there is no earthquake in turkey")        

config.json:   0%|          | 0.00/881 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

[{'label': 'LABEL_1', 'score': 0.9970910549163818}]

In [11]:
def initalize_model():
    model = pipeline('summarization', model='ainize/bart-base-cnn') #  google/pegasus-xsum
    return model

def summarise(text, model):
    summary = model(text, min_length=30, do_sample=False)

    return summary[0]

# initialize ML model
model = initalize_model()

config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [16]:
summary = summarise("Which children are at risk for X-linked agammaglobulinemia: Women who are carriers have a 1 in 2 chance of passing the faulty X chromosome to a child. This is true for every pregnancy. If a daughter gets the gene, she will likely be a healthy carrier like her mother. If a son gets the gene, he will have X-linked agammaglobulinemia.", model)
summary

Your max_length is set to 128, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


{'summary_text': 'Women who are carriers have a 1 in 2 chance of passing the faulty X chromosome to a child .\nThis is true for every pregnancy .\nIf a daughter gets the gene, she will likely be a healthy carrier like her mother .'}

In [15]:
pred = classifier(output)[0]
print(pred)



{'label': 'LABEL_0', 'score': 0.8251485824584961}


In [31]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("Babelscape/t5-base-summarization-claim-extractor")
model = T5ForConditionalGeneration.from_pretrained("Babelscape/t5-base-summarization-claim-extractor")
summary = 'Simone Biles made a triumphant return to the Olympic stage at the Paris 2024 Games, competing in the women’s gymnastics qualifications. Overcoming a previous struggle with the “twisties” that led to her withdrawal from events at the Tokyo 2020 Olympics, Biles dazzled with strong performances on all apparatus, helping the U.S. team secure a commanding lead in the qualifications. Her routines showcased her resilience and skill, drawing enthusiastic support from a star-studded audience'

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

In [34]:
tok_input = tokenizer.batch_encode_plus([row2], return_tensors="pt", padding=True)
claims = model.generate(**tok_input)
claims = tokenizer.batch_decode(claims, skip_special_tokens=True)
claims

['Women who are carriers have a 1 in 2 chance of passing the faulty X chromosome to a child. This is true for every pregnancy. If a daughter gets the gene, she will likely be a healthy carrier like her mother. If a son gets the gene, he will have X-linked agammaglobulinemia.']

In [37]:
pred = classifier(claims)[0]
print(pred)

{'label': 'LABEL_1', 'score': 0.9928246736526489}


In [41]:
import pathlib
import pandas as pd
import numpy as np
from scipy import sparse
print("Loading data...")
path_orig_en = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_en_tr.parquet")
path_orig_es = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_es_tr.parquet")
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/multi_blade_filtered/df_1.parquet")

path_model = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/MULTI_BLADE_FILTERED/ldatm_rosie_1_20")
path_corpus_en = path_model / "train_data" / "corpus_EN.txt"
path_corpus_es = path_model / "train_data" / "corpus_ES.txt"

persist_directory = (path_model / 'db_contr_mono').as_posix()

raw = pd.read_parquet(path_source)
with path_corpus_en.open("r", encoding="utf-8") as f:
    lines = [line for line in f.readlines()]
corpus_en = [line.rsplit(" 0 ")[1].strip().split() for line in lines]

ids = [line.split(" 0 ")[0] for line in lines]
df_en = pd.DataFrame({"lemmas": [" ".join(doc) for doc in corpus_en]})
df_en["doc_id"] = ids
df_en["len"] = df_en['lemmas'].apply(lambda x: len(x.split()))
df_en["id_top"] = range(len(df_en))
df_en_raw = df_en.merge(raw, how="inner", on="doc_id")[["doc_id", "id_top", "id_preproc", "lemmas_x", "text", "len"]]

# Read thetas 
thetas = sparse.load_npz(path_model.joinpath(f"mallet_output/{'EN'}/thetas.npz")).toarray()
betas = np.load((path_model.joinpath(f"mallet_output/{'EN'}/betas.npy")))
def get_thetas_str(row,thetas):
    return " ".join([f"{id_}|{round(el, 4)}" for id_,el in enumerate(thetas[row]) if el!=0.0])

def get_most_repr_tpc(row,thetas):
    return np.argmax(thetas[row])

# Save thetas in dataframe and "assigned topic"
df_en_raw["thetas"] = df_en_raw.apply(lambda row: get_thetas_str(row['id_top'], thetas), axis=1)
df_en_raw["id_tpc"] = df_en_raw.apply(lambda row: get_most_repr_tpc(row['id_top'], thetas), axis=1)
tpc = 0
df_tpc = df_en_raw[df_en_raw.id_tpc == tpc]

Loading data...


In [3]:
import pathlib
import pandas as pd
import numpy as np
from scipy import sparse
print("Loading data...")
path_orig_en = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_en_tr.parquet")
path_orig_es = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_es_tr.parquet")
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/multi_blade_filtered/df_1.parquet")

path_model = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/MULTI_BLADE_FILTERED/ldatm_rosie_1_20")
path_corpus_en = path_model / "train_data" / "corpus_EN.txt"
path_corpus_es = path_model / "train_data" / "corpus_ES.txt"

persist_directory = (path_model / 'db_contr_mono').as_posix()

raw = pd.read_parquet(path_source)
with path_corpus_es.open("r", encoding="utf-8") as f:
    lines = [line for line in f.readlines()]
corpus_es = [line.rsplit(" 0 ")[1].strip().split() for line in lines]

ids = [line.split(" 0 ")[0] for line in lines]
df_en = pd.DataFrame({"lemmas": [" ".join(doc) for doc in corpus_es]})
df_en["doc_id"] = ids
df_en["len"] = df_en['lemmas'].apply(lambda x: len(x.split()))
df_en["id_top"] = range(len(df_en))
df_en_raw = df_en.merge(raw, how="inner", on="doc_id")[["doc_id", "id_top", "id_preproc", "lemmas_x", "text", "len"]]

# Read thetas 
thetas = sparse.load_npz(path_model.joinpath(f"mallet_output/{'ES'}/thetas.npz")).toarray()
betas = np.load((path_model.joinpath(f"mallet_output/{'ES'}/betas.npy")))
def get_thetas_str(row,thetas):
    return " ".join([f"{id_}|{round(el, 4)}" for id_,el in enumerate(thetas[row]) if el!=0.0])

def get_most_repr_tpc(row,thetas):
    return np.argmax(thetas[row])

# Save thetas in dataframe and "assigned topic"
df_en_raw["thetas"] = df_en_raw.apply(lambda row: get_thetas_str(row['id_top'], thetas), axis=1)
df_en_raw["id_tpc"] = df_en_raw.apply(lambda row: get_most_repr_tpc(row['id_top'], thetas), axis=1)
tpc = 0
df_tpc_es = df_en_raw[df_en_raw.id_tpc == tpc]

Loading data...


In [45]:
df_tpc.head()

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len,thetas,id_tpc
20,EN_513461_85839-22,20,14328,help baby fall_asleep baby know sleep informat...,How can you help your baby fall asleep: Not al...,38,0|0.9968000054359436 12|0.0031999999191612005,0
46,EN_1393570_301605-29,46,34634,sure cover baby head increase baby risk baby p...,Make sure nothing is covering your baby's head...,103,0|1.0,0
82,EN_1386177_300946-35,82,63331,baby sleep firm_flat mattress_firm surface_sla...,"Your baby should sleep on a firm, flat mattres...",85,0|0.8909000158309937 1|0.012600000016391277 3|...,0
86,EN_977522_167004-14,86,67739,sleep pattern sleep habit influence baby tempe...,Sleep patterns: Sleep habits are influenced by...,17,0|0.5479000210762024 10|0.003700000001117587 1...,0
118,EN_172959_7099-5,118,97719,information technology work parent_grandparent...,Why it works: We found that many parents and g...,24,0|0.4487000107765198 1|0.18140000104904175 10|...,0


In [133]:
df_tpc_es

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len,thetas,id_tpc
9,ES_584418_68197-9,9,224,hijo talasemia_hbss dispuesto proporcionar muestra sangre orina crisis dolor analizar cambio sangre orina monitorear producir lesión renal_agudo crisis dolor síndrome torácico agudo participante año ingresar crisis dolor síndrome torácico agudo cabo_birmingham alabama,¿Usted o su hijo tienen talasemia HbSS o Sβ0 y están dispuestos a proporcionar muestras de sangre y orina durante una crisis de dolor? Este estudio analizará si los cambios en la sangre y la orina se pueden utilizar para monitorear si se ha producido lesión renal aguda como resultado de una crisis de dolor o síndrome torácico agudo. Los participantes de este estudio deben tener entre 1 y 25 años y deben ser ingresados con una crisis de dolor o síndrome torácico agudo. Este estudio se lleva a cabo en Birmingham (Alabama).,32,0|0.75 19|0.25,0
44,ES_603718_75256-10,44,841,hijo recuento_plaquetario investigar_recuento plaqueta_inmadura marcador riesgo sangrado nacido_trombocitopenia comparación_recuento plaquetario_plaqueta inmadura_plaqueta producido_marcador producción_plaqueta participar nacido semana edad_gestacional peso_nacer gramo_recuento plaquetario cabo múltiple unidos_países bajos_suecia hagar_clic nombre_ensayo sitio_web ensayo_clínico nih_inglés,"¿Tiene su hijo un recuento plaquetario bajo? Este estudio investiga los recuentos de plaquetas inmaduras como un marcador del riesgo de sangrado en recién nacidos con trombocitopenia, en comparación con los recuentos plaquetarios solos. Las plaquetas inmaduras son las plaquetas producidas más recientemente y posiblemente sean un mejor marcador de la producción de plaquetas. Para participar en este estudio, debe tener un recién nacido de menos de 32 semanas de edad gestacional, debe tener un peso al nacer de más de 500 gramos, y tener un recuento plaquetario de menos de 100 × 109/l. Este estudio se lleva a cabo en múltiples lugares de los Estados Unidos, los Países Bajos y Suecia. Haga clic en el nombre del ensayo para obtener más información en el sitio web de ensayos clínicos de los NIH (en inglés).",29,0|1.0,0
70,ES_592711_74128-1,70,1254,analizar medicamento llamado_metformina ayudar tratar aneurisma_aórtico abdomen diagnóstico amágenes_metformina ayudar prevenir aumento tamaño aneurisma_aórtico participar año diagnosticar aneurisma_aorta abdominal cabo stanford_california hagar_clic nombre_ensayo sitio_web ensayo_clínico nih_inglés,"Este estudio analiza si un medicamento llamado metformina puede ayudar a tratar los aneurismas aórticos en el abdomen. El estudio utilizará estudios de diagnóstico por imágenes para examinar si la metformina puede ayudar a prevenir el aumento de tamaño de estos aneurismas aórticos. Para participar en este estudio, debe tener entre 55 y 90 años y habérsele diagnosticado un aneurisma de aorta abdominal. Este estudio se lleva a cabo en Stanford (California). Haga clic en el nombre del ensayo para obtener más información en el sitio web de ensayos clínicos de los NIH (en inglés).",26,0|1.0,0
156,ES_486251_63877-48,156,2817,paciente participar ensayo_clínico ayudar mejorar forma tratar cáncer futuro ensayo_clínico tratamiento eficaz responder_pregunta importante ayudar avanzar,"Los pacientes que participan en los ensayos clínicos también ayudan a mejorar la forma en que se tratará el cáncer en el futuro. Aunque los ensayos clínicos no siempre llevan a tratamientos eficaces, a menudo responden a preguntas importantes y ayudan a avanzar en la investigación.",16,0|0.8104000091552734 9|0.06369999796152115 17|0.125900000333786,0
252,ES_593115_74164-9,252,4712,hijo anemia célula_falciform interés contribuir_recopilación dato plazo monitorear paciente recoger_muestra biológico paciente enfermedad célula_falciform comprender evolución plazo participante edad enfermedad célula_falciform cabo centro illinois_luisiana carolina_norte tennessee,"¿Usted o su hijo tienen la anemia de células falciformes y también, interés en contribuir a la recopilación de datos a largo plazo? Este estudio monitorea pacientes y recoge muestras biológicas de pacientes con enfermedad de células falciformes para comprender mejor la evolución a largo plazo Los participantes pueden tener cualquier edad y deben tener enfermedad de células falciformes. Este estudio se lleva a cabo en seis centros en Illinois, Luisiana, Carolina del Norte y Tennessee.",26,0|1.0,0
...,...,...,...,...,...,...,...,...
60662,ES_579773_67632-8,60662,1061456,diagnosticar síndrome_takotsubo investigar corto_plazo síndrome_takotsubo denominado síndrome corazón roto síndrome_takotsubo afección cardíaco temporal causado estrés emocional físico participar diagnóstico reciente síndrome_takotsubo año cabo_providence rhode_island hagar_clic nombre_ensayo sitio_web ensayo_clínico nih_inglés,"¿Se le ha diagnosticado recientemente síndrome de Takotsubo? El objetivo de este estudio es investigar los efectos a corto y largo plazo del síndrome de Takotsubo, también denominado a veces ""síndrome del corazón roto"". Habitualmente, el síndrome de Takotsubo es una afección cardíaca temporal causada por estrés emocional o físico. Para participar en este estudio, debe tener diagnóstico reciente de síndrome de Takotsubo y debe tener al menos 18 años. Este estudio se lleva a cabo en Providence (Rhode Island). Haga clic en el nombre del ensayo para obtener más información en el sitio web de ensayos clínicos de los NIH (en inglés).",29,0|1.0,0
60696,ES_592738_74130-6,60696,1061991,analizar medicamento llamado_anastrozol utilizado tratar cáncer_mama mejorar capacidad ejercicio paciente hipertensión_pulmonar participar año hipertensión_arterial pulmonar prueba caminata_minuto cabo múltiple centro unidos california_colorado maryland_misuri pensilvania_rhode island_tennessee,"Este estudio analiza si un medicamento llamado anastrozol, utilizado actualmente para tratar el cáncer de mama, mejora la capacidad para el ejercicio en pacientes con hipertensión pulmonar. Para participar en este estudio, debe tener al menos 18 años, hipertensión arterial pulmonar y debe poder realizar una prueba de caminata de 6 minutos. Este estudio se lleva a cabo en múltiples centros de los Estados Unidos, que incluyen California, Colorado, Maryland, Misuri, Pensilvania, Rhode Island y Tennessee.",25,0|1.0,0
60698,ES_592759_74132-5,60698,1062015,crisis dolor complicación pulmonar anemia célula_falciform interés comprender crisis dolor complicación pulmonar producir paciente enfermedad célula_falciform paciente trastorno afectar glóbulo_rojo participar año enfermedad célula_falciform presunto_conocido rasgo_falciforme you trastorno afectar glóbulo_rojo participante examen físico recibir atención médico estándar enfermedad célula_falciform incluido prueba seguimiento rutina cabo washington bethesda_maryland,"Crisis de dolor y complicaciones pulmonares en la anemia de células falciformes El interés de este estudio es comprender las crisis de dolor y las complicaciones pulmonares que se producen en pacientes con enfermedad de células falciformes y en los pacientes con otros trastornos que afectan los glóbulos rojos. Para participar en este estudio, debe tener al menos 2 años, con enfermedad de células falciformes presunta o conocida, rasgo falciforme u otros trastornos que afectan los glóbulos rojos. A los participantes se les realizará un examen físico y recibirán atención médica estándar para la enfermedad de células falciformes, incluidas pruebas y procedimientos de seguimiento de rutina. Este estudio se lleva a cabo en Washington (D.C.) y Bethesda (Maryland).",46,0|1.0,0
60740,ES_597996_74674-2,60740,1062566,sufrir enfermedad cardíaco requerir_dci prevenir_paro cardíaco_súbito aprender signo_diferenciar paciente riesgo arritmia_paro cardíaco_súbito investigador planear_seguimiento paciente reemplazo_cdi reciente año participar año cabo washington_baltimore maryland_richmond virginia,"¿Sufre una enfermedad cardíaca que requiere un DCI para prevenir un paro cardíaco súbito? El objetivo de este estudio es aprender qué signos diferencian a los pacientes que tienen un mayor riesgo de arritmia que lleva a un paro cardíaco súbito. Los investigadores planean realizar el seguimiento de los pacientes con un reemplazo de CDI reciente durante 10 años. Para participar en este estudio, debe tener entre 18 y 85 años. Este estudio se lleva a cabo en Washington (D.C.), Baltimore (Maryland) y Richmond (Virginia).",24,0|1.0,0


In [152]:
index_en, model_en, ids_en, texts_en = create_faiss_index(df_tpc, text_column='text', id_column='doc_id', index_file='faiss_index_en.index')
index_es, model_es, ids_es, texts_es = create_faiss_index(df_tpc_es, text_column='text', id_column='doc_id', index_file='faiss_index_es.index')

In [71]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline
import itertools

tokenizer = T5Tokenizer.from_pretrained("Babelscape/t5-base-summarization-claim-extractor")
claim_extractor = T5ForConditionalGeneration.from_pretrained("Babelscape/t5-base-summarization-claim-extractor")
classifier = pipeline(
        "text-classification", 
        model="Nithiwat/bert-base_claimbuster"
)

#lens = [len(c) for c in claims]
#texts = [[text[i]]*lens[i] for i in range(len(lens))]
#scores = [classifier(c)[0]['label'] for c in claims] 

def generate_claims(text):
    tok_input = tokenizer.batch_encode_plus([text], return_tensors="pt", padding=True)
    claims = claim_extractor.generate(**tok_input)
    claims = tokenizer.batch_decode(claims, skip_special_tokens=True)[0].split(".")

    scores = [classifier(c)[0]['label'] for c in claims] 

    return claims, scores

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [52]:
df_tpc.text.values.tolist()[0]

'How can you help your baby fall asleep: Not all babies know how to put themselves to sleep. When it is time for bed, many parents want to rock or breastfeed a baby to sleep. Establishing a routine at bedtime is a good idea. However, be sure that the baby does not fall asleep while eating or in your arms. This may become a pattern and the baby may begin to expect to be in your arms in order to fall asleep. When the baby briefly awakens during a sleep cycle, he or she may not be able to go back to sleep on his or her own.'

In [81]:
all_ids = []
all_texts = []
all_claims = []
all_scores = []

for id_, el in df_tpc.iterrows():
    claims, scores = generate_claims(el.text)
    texts = [el.text]*len(claims)
    ids = [el.doc_id]*len(claims)

    all_ids.append(ids)
    all_texts.append(texts)
    all_claims.append(claims)
    all_scores.append(scores)


results = pd.DataFrame(
    {
        "ids": list(itertools.chain(*all_ids)),
        "text" : list(itertools.chain(*all_texts)),
        "claims" : list(itertools.chain(*all_claims)),
        "score" : list(itertools.chain(*all_scores))
    }
)


In [85]:
filtered = results[results.score == "LABEL_1"].drop_duplicates(subset=['claims'], keep='first', inplace=False)

filtered

Unnamed: 0,ids,text,claims,score
0,EN_513461_85839-22,"How can you help your baby fall asleep: Not all babies know how to put themselves to sleep. When it is time for bed, many parents want to rock or breastfeed a baby to sleep. Establishing a routine at bedtime is a good idea. However, be sure that the baby does not fall asleep while eating or in your arms. This may become a pattern and the baby may begin to expect to be in your arms in order to fall asleep. When the baby briefly awakens during a sleep cycle, he or she may not be able to go back to sleep on his or her own.",Not all babies know how to put themselves to sleep,LABEL_1
9,EN_1393570_301605-29,"Make sure nothing is covering your baby's head. These increase a baby's risk of suffocating\n- Put your baby in other positions while they are awake. This helps your baby grow stronger. It also helps prevent your baby from having a misshaped head. When your baby is awake, hold your baby. Give your baby time on their tummy while awake and supervised for short periods of time beginning soon after coming home from the hospital. Slowly increase tummy time to at least 15 to 30 minutes each day by 7 weeks old. Try not to let your baby sit in a seat or swing for long periods of time\n- Don't using sitting devices for routine sleep. Infant seats, car seats, strollers, infant carriers, and infant swings are not advised for routine sleep. These may lead to blockage of a baby's airway or suffocation. If your baby is in a sitting device, remove them from the device and put them in the crib or other appropriate surface as soon as is safe and practical\n- Make sure your baby doesn't get overheated when sleeping. Keep the room at a temperature that is comfortable for you and your baby. Dress your baby lightly. Instead of using blankets, keep your baby warm by dressing them in a sleep sack, or a wearable blanket. Don't use a hat on your baby indoors\n- Use caution when swaddling your baby. Swaddling doesn't reduce the risk for SIDS. If you choose to swaddle your baby, make sure they are on their back and the swaddle is not too tight.",These increase a baby's risk of suffocating,LABEL_1
31,EN_1386177_300946-35,"Your baby should sleep on a firm, flat mattress or firm surface with no slant. Cover the mattress with a fitted sheet. Don’t use fluffy blankets or comforters. Don’t let your baby sleep on a waterbed, air mattress, sofa, sheepskin, pillow, or other soft material. Don’t put soft toys, pillows, or bumper pads in the crib\n- Not overheating. Keep your baby warm but not too warm. The temperature in your baby’s room should feel comfortable to you. Don't overbundle, overdress, or cover a baby's face or head. Don't put a hat on your baby when indoors\n- Sharing a room. The American Academy of Pediatrics advises that babies sleep close to the parent's bed, but in a separate crib or bassinet for babies. This is advised ideally for the baby's first year. But you should do this at least for the first 6 months\n- Not sharing a bed. Don't put your baby to sleep in a bed with other children. Don’t put your baby to sleep on a sofa, either alone or with another person. Don't share your bed with your baby, especially if you are using alcohol or other drugs. You can bring your baby to your bed for feedings and comforting. But return your baby to the crib for sleep. Bed sharing is also not advised for twins or other multiples\n- Not allowing smoking around your baby. The risk of SIDS is higher for babies whose mothers smoked during pregnancy.",The risk of SIDS is higher for babies whose mothers whose mothers smoked during pregnancy,LABEL_1
43,EN_535414_88300-16,"Helping your baby fall asleep: Babies may not be able to establish their own sleeping and waking patterns. Surprisingly, not all babies know how to put themselves to sleep, or are able to go back to sleep if they are awakened in the night. When it is time for bed, many parents want to rock or breastfeed a baby to help them fall asleep. Establishing a routine at bedtime is a good idea. However, be sure that your baby does not fall asleep in your arms. This may become a pattern and your baby may begin to expect to be in your arms in order to fall asleep. When your baby briefly awakens during a sleep cycle, he or she may not be able to go back to sleep on their own.",Not all babies know how to put themselves to sleep,LABEL_1
44,EN_535414_88300-16,"Helping your baby fall asleep: Babies may not be able to establish their own sleeping and waking patterns. Surprisingly, not all babies know how to put themselves to sleep, or are able to go back to sleep if they are awakened in the night. When it is time for bed, many parents want to rock or breastfeed a baby to help them fall asleep. Establishing a routine at bedtime is a good idea. However, be sure that your baby does not fall asleep in your arms. This may become a pattern and your baby may begin to expect to be in your arms in order to fall asleep. When your baby briefly awakens during a sleep cycle, he or she may not be able to go back to sleep on their own.",Not all babies are able to go back to sleep if they are awakened in the night,LABEL_1
76,EN_1283218_284566-2,"Although the cause of SIDS is unknown, there are steps you can take to reduce the risk. These include:\n- Placing your baby on his or her back to sleep, even for short naps. ""Tummy time"" is for when babies are awake and someone is watching\n- Having your baby sleep in your room for at least the first six months. Your baby should sleep close to you, but on a separate surface designed for infants, such as a crib or bassinet.\n- Using a firm sleep surface, such as a crib mattress covered with a fitted sheet\n- Keeping soft objects and loose bedding away from your baby's sleep area\n- Breastfeeding your baby\n- Making sure that your baby doesn't get too hot. Keep the room at a comfortable temperature for an adult.\n- Not smoking during pregnancy or allowing anyone to smoke near your baby.",The cause of SIDS is unknown,LABEL_1
86,EN_992647_17713-5,"There is evidence that early introduction of pacifiers negatively impacts breastfeeding outcomes. However, pacifier use during the birth hospitalization can provide comfort during painful procedures (e.g., circumcision) when the infant cannot otherwise be comforted. Pacifiers can also reduce the risk of SIDS, which occurs most commonly between 2-4 months of age. Therefore, the AAP’s policy statements (Policy Statement: Breastfeeding and the Use of Human Milk | Pediatrics | American Academy of Pediatrics (aap.org), and SIDS and Other Sleep-Related Infant Deaths: Updated 2016 Recommendations for a Safe Infant Sleeping Environment) recommend that introduction of pacifiers for breastfed infants be delayed until breastfeeding is firmly established, which is generally within the first few weeks.",There is evidence that early introduction of pacifiers negatively impacts breastfeeding outcomes,LABEL_1
90,EN_992647_17713-5,"There is evidence that early introduction of pacifiers negatively impacts breastfeeding outcomes. However, pacifier use during the birth hospitalization can provide comfort during painful procedures (e.g., circumcision) when the infant cannot otherwise be comforted. Pacifiers can also reduce the risk of SIDS, which occurs most commonly between 2-4 months of age. Therefore, the AAP’s policy statements (Policy Statement: Breastfeeding and the Use of Human Milk | Pediatrics | American Academy of Pediatrics (aap.org), and SIDS and Other Sleep-Related Infant Deaths: Updated 2016 Recommendations for a Safe Infant Sleeping Environment) recommend that introduction of pacifiers for breastfed infants be delayed until breastfeeding is firmly established, which is generally within the first few weeks.","Pacifiers can also reduce the risk of SIDS, which occurs most commonly between 2-4 months of age",LABEL_1
91,EN_992647_17713-5,"There is evidence that early introduction of pacifiers negatively impacts breastfeeding outcomes. However, pacifier use during the birth hospitalization can provide comfort during painful procedures (e.g., circumcision) when the infant cannot otherwise be comforted. Pacifiers can also reduce the risk of SIDS, which occurs most commonly between 2-4 months of age. Therefore, the AAP’s policy statements (Policy Statement: Breastfeeding and the Use of Human Milk | Pediatrics | American Academy of Pediatrics (aap.org), and SIDS and Other Sleep-Related Infant Deaths: Updated 2016 Recommendations for a Safe Infant Sleeping Environment) recommend that introduction of pacifiers for breastfed infants be delayed until breastfeeding is firmly established, which is generally within the first few weeks.","The AAP’s policy statements recommend that introduction of pacifiers for breastfed infants be delayed until breastfeeding is firmly established, which is generally within the first few weeks",LABEL_1
110,EN_1392211_301477-33,"Smoke of any kind increases a baby’s risk of dying while sleeping, especially babies who are sick\n- Don't share a bed with your baby. This is extra important if your baby is very young or small or was born prematurely. This is also extra important if you have been drinking alcohol, used marijuana, or taken any medicines or illegal drugs. Don't put your baby to sleep in a bed with other children or adults. You can bring your baby to your bed for feedings and comforting. But return your baby to the crib or bassinet for sleep. Don't fall asleep with your baby. Bed sharing is also not advised for twins or other multiples\n- Share your room instead of your bed with your baby. The American Academy of Pediatrics recommends that babies sleep in the same room as their parents, close to their parents' bed. But babies should be in a separate bed or crib appropriate for babies. This sleeping arrangement is recommended for at least the first 6 months\n- Use correct bedding. Your baby should sleep on a firm, flat mattress or firm surface with no slant. The mattress should fit tightly and be designed just for the crib. Cover the mattress with a fitted sheet. Don’t use fluffy blankets or comforters. Don’t let your baby sleep on an adult bed, waterbed, air mattress, sofa, sheepskin, pillow, or other soft material. Don’t put soft toys, pillows, or bumper pads in the crib. Don't use weighted blankets, sleepers, swaddles, or other weighted items.","Smoke of any kind increases a baby’s risk of dying while sleeping, especially babies who are sick",LABEL_1


In [125]:
class GenerateQuestion(dspy.Signature):
    fact = dspy.InputField()
    context = dspy.InputField()
    question = dspy.OutputField(desc="it asks the fact", prefix="Question:")


class QAGeneratorModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_question = dspy.Predict("fact,context->question")

    def forward(self, fact,context):
        question = self.generate_question(fact=fact, context=context).question
        return dspy.Prediction(question=question)

qa_gen = QAGeneratorModule()

In [158]:
class TranslatorModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.translate = dspy.Predict("english->spanish")

    def forward(self, english):
        spanish = self.translate(english=english).spanish
        return spanish
tr = TranslatorModule()
tr("What is the best way to help your baby fall asleep?")

'¿Cómo puedo ayudar a mi bebé a dormir?'

In [126]:
questions = [qa_gen(q,con).question for q,con in zip(filtered.claims.values.tolist(),filtered.text.values.tolist())]
filtered["question"] = questions

In [154]:
filtered["len"] = filtered["question"].apply(lambda x: len(x))
filtered_2 = filtered[
    (filtered["len"] <= 300) &
    (filtered["question"].str.endswith("?"))
].drop_duplicates(subset=['question'], keep='first', inplace=False)
filtered_2.to_excel("test_tpc12.xlsx")

In [155]:
filtered_2

Unnamed: 0,ids,text,claims,score,question,len
0,EN_513461_85839-22,"How can you help your baby fall asleep: Not all babies know how to put themselves to sleep. When it is time for bed, many parents want to rock or breastfeed a baby to sleep. Establishing a routine at bedtime is a good idea. However, be sure that the baby does not fall asleep while eating or in your arms. This may become a pattern and the baby may begin to expect to be in your arms in order to fall asleep. When the baby briefly awakens during a sleep cycle, he or she may not be able to go back to sleep on his or her own.",Not all babies know how to put themselves to sleep,LABEL_1,What is the best way to help your baby fall asleep?,51
9,EN_1393570_301605-29,"Make sure nothing is covering your baby's head. These increase a baby's risk of suffocating\n- Put your baby in other positions while they are awake. This helps your baby grow stronger. It also helps prevent your baby from having a misshaped head. When your baby is awake, hold your baby. Give your baby time on their tummy while awake and supervised for short periods of time beginning soon after coming home from the hospital. Slowly increase tummy time to at least 15 to 30 minutes each day by 7 weeks old. Try not to let your baby sit in a seat or swing for long periods of time\n- Don't using sitting devices for routine sleep. Infant seats, car seats, strollers, infant carriers, and infant swings are not advised for routine sleep. These may lead to blockage of a baby's airway or suffocation. If your baby is in a sitting device, remove them from the device and put them in the crib or other appropriate surface as soon as is safe and practical\n- Make sure your baby doesn't get overheated when sleeping. Keep the room at a temperature that is comfortable for you and your baby. Dress your baby lightly. Instead of using blankets, keep your baby warm by dressing them in a sleep sack, or a wearable blanket. Don't use a hat on your baby indoors\n- Use caution when swaddling your baby. Swaddling doesn't reduce the risk for SIDS. If you choose to swaddle your baby, make sure they are on their back and the swaddle is not too tight.",These increase a baby's risk of suffocating,LABEL_1,What are some ways to reduce the risk of SIDS?,46
31,EN_1386177_300946-35,"Your baby should sleep on a firm, flat mattress or firm surface with no slant. Cover the mattress with a fitted sheet. Don’t use fluffy blankets or comforters. Don’t let your baby sleep on a waterbed, air mattress, sofa, sheepskin, pillow, or other soft material. Don’t put soft toys, pillows, or bumper pads in the crib\n- Not overheating. Keep your baby warm but not too warm. The temperature in your baby’s room should feel comfortable to you. Don't overbundle, overdress, or cover a baby's face or head. Don't put a hat on your baby when indoors\n- Sharing a room. The American Academy of Pediatrics advises that babies sleep close to the parent's bed, but in a separate crib or bassinet for babies. This is advised ideally for the baby's first year. But you should do this at least for the first 6 months\n- Not sharing a bed. Don't put your baby to sleep in a bed with other children. Don’t put your baby to sleep on a sofa, either alone or with another person. Don't share your bed with your baby, especially if you are using alcohol or other drugs. You can bring your baby to your bed for feedings and comforting. But return your baby to the crib for sleep. Bed sharing is also not advised for twins or other multiples\n- Not allowing smoking around your baby. The risk of SIDS is higher for babies whose mothers smoked during pregnancy.",The risk of SIDS is higher for babies whose mothers whose mothers smoked during pregnancy,LABEL_1,What are the risk factors for SIDS?,35
86,EN_992647_17713-5,"There is evidence that early introduction of pacifiers negatively impacts breastfeeding outcomes. However, pacifier use during the birth hospitalization can provide comfort during painful procedures (e.g., circumcision) when the infant cannot otherwise be comforted. Pacifiers can also reduce the risk of SIDS, which occurs most commonly between 2-4 months of age. Therefore, the AAP’s policy statements (Policy Statement: Breastfeeding and the Use of Human Milk | Pediatrics | American Academy of Pediatrics (aap.org), and SIDS and Other Sleep-Related Infant Deaths: Updated 2016 Recommendations for a Safe Infant Sleeping Environment) recommend that introduction of pacifiers for breastfed infants be delayed until breastfeeding is firmly established, which is generally within the first few weeks.",There is evidence that early introduction of pacifiers negatively impacts breastfeeding outcomes,LABEL_1,What is the AAP’s policy on pacifier use during the birth hospitalization?,74
90,EN_992647_17713-5,"There is evidence that early introduction of pacifiers negatively impacts breastfeeding outcomes. However, pacifier use during the birth hospitalization can provide comfort during painful procedures (e.g., circumcision) when the infant cannot otherwise be comforted. Pacifiers can also reduce the risk of SIDS, which occurs most commonly between 2-4 months of age. Therefore, the AAP’s policy statements (Policy Statement: Breastfeeding and the Use of Human Milk | Pediatrics | American Academy of Pediatrics (aap.org), and SIDS and Other Sleep-Related Infant Deaths: Updated 2016 Recommendations for a Safe Infant Sleeping Environment) recommend that introduction of pacifiers for breastfed infants be delayed until breastfeeding is firmly established, which is generally within the first few weeks.","Pacifiers can also reduce the risk of SIDS, which occurs most commonly between 2-4 months of age",LABEL_1,What is the AAP’s policy on pacifier use for breastfed infants?,63
110,EN_1392211_301477-33,"Smoke of any kind increases a baby’s risk of dying while sleeping, especially babies who are sick\n- Don't share a bed with your baby. This is extra important if your baby is very young or small or was born prematurely. This is also extra important if you have been drinking alcohol, used marijuana, or taken any medicines or illegal drugs. Don't put your baby to sleep in a bed with other children or adults. You can bring your baby to your bed for feedings and comforting. But return your baby to the crib or bassinet for sleep. Don't fall asleep with your baby. Bed sharing is also not advised for twins or other multiples\n- Share your room instead of your bed with your baby. The American Academy of Pediatrics recommends that babies sleep in the same room as their parents, close to their parents' bed. But babies should be in a separate bed or crib appropriate for babies. This sleeping arrangement is recommended for at least the first 6 months\n- Use correct bedding. Your baby should sleep on a firm, flat mattress or firm surface with no slant. The mattress should fit tightly and be designed just for the crib. Cover the mattress with a fitted sheet. Don’t use fluffy blankets or comforters. Don’t let your baby sleep on an adult bed, waterbed, air mattress, sofa, sheepskin, pillow, or other soft material. Don’t put soft toys, pillows, or bumper pads in the crib. Don't use weighted blankets, sleepers, swaddles, or other weighted items.","Smoke of any kind increases a baby’s risk of dying while sleeping, especially babies who are sick",LABEL_1,What is the best way to reduce the risk of SIDS?,48
141,EN_932540_155228-2,"Experts recommend room-sharing for at least the first 6 months of life, especially if you’re breastfeeding. Here are some ideas:\n- Put a bassinet, play yard, or crib next to your bed. This lets you keep that desired closeness that makes it easier to breastfeed at night. It also lowers your baby’s risk of SIDS.\n- Buy a bassinet or play yard with one side that is lower, which attaches to your bed. This allows you to be next to your baby without the chance of rolling over onto your infant..",Experts recommend room-sharing for at least the first 6 months of life,LABEL_1,What are some ideas for room-sharing with your baby?,52
165,EN_977526_167004-18,"During a baby's first few months, the brain matures. The baby gradually can sleep for longer periods. By age 3 months, most babies sleep for their longest period (up to 7 to 8 hours) during the night and develop set nap times. They are also more alert when awake than they were when they were younger.","By age 3 months, most babies sleep for their longest period (up to 7 to 8 hours) during the night",LABEL_1,What is the longest period of time that a baby can sleep during the night?,74
187,EN_1393572_301605-41,"Don't using sitting devices for routine sleep. Infant seats, car seats, strollers, infant carriers, and infant swings are not advised for routine sleep. These may lead to blockage of a baby's airway or suffocation. If your baby is in a sitting device, remove them from the device and put them in the crib or other appropriate surface as soon as is safe and practical.","Infant seats, car seats, strollers, infant carriers, and infant swings are not advised for routine sleep",LABEL_1,What are the risks of using sitting devices for routine sleep?,62
221,EN_1393569_301605-28,"Smoke of any kind increases a baby’s risk of dying while sleeping, especially babies who are sick\n- Don't share a bed with your baby. This is extra important if your baby is very young or small or was born prematurely. This is also extra important if you have been drinking alcohol, used marijuana, or taken any medicines or illegal drugs. Don't put your baby to sleep in a bed with other children or adults. You can bring your baby to your bed for feedings and comforting. But return your baby to the crib or bassinet for sleep. Don't fall asleep with your baby. Bed sharing is also not advised for twins or other multiples\n- Share your room instead of your bed with your baby. The AAP recommends that infants sleep in the same room as their parents, close to their parents' bed. But babies should be in a separate bed or crib appropriate for babies. This sleeping arrangement is recommended for at least the first 6 months\n- Use correct bedding. Your baby should sleep on a firm, flat mattress or firm surface with no slant. The mattress should fit tightly and be designed just for the crib. Cover the mattress with a fitted sheet. Don’t use fluffy blankets or comforters. Don’t let your baby sleep on an adult bed, waterbed, air mattress, sofa, sheepskin, pillow, or other soft material. Don’t put soft toys, pillows, or bumper pads in the crib. Don't use weighted blankets, sleepers, swaddles, or other weighted items.",Bed sharing is not recommended for twins or other multiples,LABEL_1,What is the recommended sleeping arrangement for twins or other multiples?,74


In [None]:
###

In [4]:
df_tpc_es

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len,thetas,id_tpc
9,ES_584418_68197-9,9,224,hijo talasemia_hbss dispuesto proporcionar mue...,¿Usted o su hijo tienen talasemia HbSS o Sβ0 y...,32,0|0.75 19|0.25,0
44,ES_603718_75256-10,44,841,hijo recuento_plaquetario investigar_recuento ...,¿Tiene su hijo un recuento plaquetario bajo? E...,29,0|1.0,0
70,ES_592711_74128-1,70,1254,analizar medicamento llamado_metformina ayudar...,Este estudio analiza si un medicamento llamado...,26,0|1.0,0
156,ES_486251_63877-48,156,2817,paciente participar ensayo_clínico ayudar mejo...,Los pacientes que participan en los ensayos cl...,16,0|0.8104000091552734 9|0.06369999796152115 17|...,0
252,ES_593115_74164-9,252,4712,hijo anemia célula_falciform interés contribui...,¿Usted o su hijo tienen la anemia de células f...,26,0|1.0,0
...,...,...,...,...,...,...,...,...
60662,ES_579773_67632-8,60662,1061456,diagnosticar síndrome_takotsubo investigar cor...,¿Se le ha diagnosticado recientemente síndrome...,29,0|1.0,0
60696,ES_592738_74130-6,60696,1061991,analizar medicamento llamado_anastrozol utiliz...,Este estudio analiza si un medicamento llamado...,25,0|1.0,0
60698,ES_592759_74132-5,60698,1062015,crisis dolor complicación pulmonar anemia célu...,Crisis de dolor y complicaciones pulmonares en...,46,0|1.0,0
60740,ES_597996_74674-2,60740,1062566,sufrir enfermedad cardíaco requerir_dci preven...,¿Sufre una enfermedad cardíaca que requiere un...,24,0|1.0,0


In [19]:
df_all = pd.read_parquet("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/translated_stops/df_1.parquet")
df_all = df_all[df_all.lang == "ES"]

In [20]:
from sentence_transformers import SentenceTransformer, util
import itertools

query = "¿Cuál es la mejor manera de ayudar a mi bebé a conciliar el sueño?"
docs = df_all.text.values.tolist()#list(itertools.chain(*[text.split(".") for text in df_en_raw.text.values.tolist()]))#df_tpc_es.text.values.tolist()

#Load the model
model = SentenceTransformer('SeyedAli/Multilingual-Text-Semantic-Search-Siamese-BERT-V1')

#Encode query and documents
query_emb = model.encode(query)
doc_emb = model.encode(docs)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)[:3]

#Output passages & scores
for doc, score in doc_score_pairs:
    print(score, doc)




0.8129085302352905 ¿Cuál es la mejor manera de ayudar al equipo médico?
0.8129085302352905 ¿Cuál es la mejor manera de ayudar al equipo médico?
0.8129085302352905 ¿Cuál es la mejor manera de ayudar al equipo médico?


In [21]:
query_emb = model.encode("¿Cómo puedo ayudar a dormir a mi bebé?")

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)[:10]

#Output passages & scores
for doc, score in doc_score_pairs:
    print(score, doc)

0.7978652715682983 ¿Cómo puedo protegerme a mí misma y a mi bebé?
0.7978652715682983 ¿Cómo puedo protegerme a mí misma y a mi bebé?
0.7732670307159424 ¿Cómo puede ayudar al bebé a quedarse dormido: No todos los bebés saben cómo dormirse solos. Cuando es la hora de dormir, muchos padres mecen al bebé. Los recién nacidos y bebés pequeños se duermen mientras están amamantando. Tener una rutina para la hora de dormir es una buena idea. Pero si un bebé más grande se duerme mientras come o en brazos, puede convertirse en un patrón. Puede que el bebé empiece a esperar estar en brazos para dormirse. Cuando el bebé se despierta por poco tiempo durante un ciclo de sueño, es posible que no pueda volver a dormirse solo.
0.7732670307159424 ¿Cómo puede ayudar al bebé a quedarse dormido: No todos los bebés saben cómo dormirse solos. Cuando es la hora de dormir, muchos padres mecen al bebé. Los recién nacidos y bebés pequeños se duermen mientras están amamantando. Tener una rutina para la hora de dorm

In [27]:
from sentence_transformers import SentenceTransformer, util
import time

class Index(object):
    def __init__(self, corpus: list, doc_ids: list, model_name: str = 'SeyedAli/Multilingual-Text-Semantic-Search-Siamese-BERT-V1'):
        self.model = SentenceTransformer(model_name)
        self.corpus = corpus
        self.doc_ids = doc_ids
        
        # Preprocess the corpus: split each document into sentences
        self.sentences = []
        self.mapping = []
        for doc_id, document in enumerate(corpus):
            
            # Split the document into sentences based on "."
            substrings = document.split(".")
            substrings = [s.strip() for s in substrings if s.strip()]  #
            
            self.sentences.extend(substrings)
            self.mapping.extend([doc_id] * len(substrings))
        
        print("-- -- Encoding corpus...")
        start_time = time.time()
        self.index = self.model.encode(self.sentences)
        print("-- -- Corpus encoded in {} minutes".format((time.time()-start_time)/60))
    
    def retrieve(self, query, topk=5):
        # Encode query
        query_emb = self.model.encode(query)
        
        # Compute dot score between query and all sentence embeddings
        scores = util.dot_score(query_emb, self.index)[0].cpu().tolist()
        
        # Combine sentences & scores
        sentence_score_pairs = list(zip(self.sentences, scores, self.mapping))
        
        # Sort by decreasing score
        sentence_score_pairs = sorted(sentence_score_pairs, key=lambda x: x[1], reverse=True)[:topk]
        
        # Retrieve the topk results with original document mapping
        results = []
        for sentence, score, doc_id in sentence_score_pairs:
            results.append({
                "document_id": self.doc_ids[doc_id],
                "sentence": sentence,
                "score": score,
                "original_document": self.corpus[doc_id]
            })
        
        return results

In [33]:
index = Index(corpus=df_all.text.values.tolist(), doc_ids=df_all.doc_id.values.tolist())

-- -- Encoding corpus...
-- -- Corpus encoded in 27.29057859579722 minutes


In [34]:
index.sentences[0:10]

['Información sobre el seguro médico: Wisconsin Medicaid (solo servicios de Badgercare y planificación familiar) aceptado',
 'El tratamiento no solo se ocupa de las drogas',
 'Le ayuda a tomar el control de su vida para que no tenga que depender de las drogas',
 'Aprenderá buenos motivos para dejar las drogas',
 'Mantenerse alejado de las drogas es un proceso de por vida que implica compromiso y esfuerzo',
 'Los factores de riesgo para la enfermedad de Addison de tipo autoinmunitario incluyen otras enfermedades autoinmunitarias:\\n- Hinchazón (inflamación) de la glándula tiroides que con frecuencia lleva a una reducción en la función tiroidea (tiroiditis crónica)\\n- La glándula tiroides produce un exceso de hormona tiroidea (hipertiroidismo, enfermedad de Graves)\\n- Erupción que produce picazón con ronchas y ampollas ( dermatitis herpetiforme)\\n- Las glándulas paratiroides del cuello no producen suficiente hormona paratiroidea (hipoparatiroidismo)\\n- La hipófisis no produce cantida

In [38]:
import pickle; pickle.dump(index, open('index_store_es_all.pkl', 'wb'))

In [37]:
query = "How can I help my baby to sleep?"
results = index.retrieve(query, topk=10)

for result in results:
    print(f"Document ID: {result['document_id']}")
    print(f"Sentence: {result['sentence']}")
    print(f"Score: {result['score']}")
    print(f"Original Document: {result['original_document']}")
    print("\n")

Document ID: ES_166253_1505-3
Sentence: Entre los libros sobre estrategias CIO más populares se encuentran Solve Your Child’s Sleep Problems, de Ferber, Healthy Sleep Habits, Happy Child, de Weissbluth, y The Sleepeasy Solution, de Waldburger y Spivack
Score: 0.6411253809928894
Original Document: Hay dos métodos básicos para entrenar el sueño. El primero consiste en dejar que el niño “llore” (CIO, por sus siglas en inglés). La idea que subyace al CIO es que los padres a veces se interponen en el camino para que los niños aprendan a dormirse. Con el CIO, el bebé aprenderá a dormirse solo después de algunas noches difíciles. Los padres pueden dejar que su bebé llore y vigilarlo en periodos de tiempo que se alargan gradualmente. Entre los libros sobre estrategias CIO más populares se encuentran Solve Your Child’s Sleep Problems, de Ferber, Healthy Sleep Habits, Happy Child, de Weissbluth, y The Sleepeasy Solution, de Waldburger y Spivack.


Document ID: ES_476546_63177-5
Sentence: Sleep-r

In [14]:
import pathlib
import pandas as pd
from scipy import sparse
import numpy as np

path_orig_en = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_en_tr.parquet")
path_orig_es = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_es_tr.parquet")
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/multi_blade_filtered/df_1.parquet")

path_model = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/MULTI_BLADE_FILTERED/poly_rosie_v2_1_20")

def get_doc_top_tpcs(doc_distr, topn=2):
    sorted_tpc_indices = np.argsort(doc_distr)[::-1]
    top = sorted_tpc_indices[:topn].tolist()
    top_weight = [(k, doc_distr[k]) for k in top]
    return top_weight

def get_doc_main_topc(doc_distr):
    sorted_tpc_indices = np.argsort(doc_distr)[::-1]
    top = sorted_tpc_indices[:1][0]
    return top

In [6]:
raw = pd.read_parquet(path_source)
raw

Unnamed: 0,id_preproc,lemmas,lemmas_tr,doc_id,text,text_tr,lang
0,0,implant_port information technology easy child...,puerto_implantado facilitar hijo recibir trata...,EN_657272_107210-27,Implanted ports make it easier for you or your...,Los puertos implantados facilitan que usted o ...,EN
1,1,life_expectancy birth reach record high year t...,esperanza_vida nacer alcanzar_máximo precedent...,EN_403516_60988-2,Life expectancy at birth reached a record high...,La esperanza de vida al nacer alcanzó un máxim...,EN
2,2,shazam_hussain arterial common location caroti...,shazam_hussain arterial común provenir arteria...,EN_736110_112440-10,"Shazam Hussain: Usually, on the arterial side....","Shazam Hussain: Por lo general, en el lado art...",EN
3,3,rifaximin cause effect doctor unusual_problem ...,rifaximina causar secundario_llamir médico pro...,EN_1279033_283895-10,Rifaximin may cause other side effects. Call y...,La rifaximina puede causar otros efectos secun...,EN
4,4,point healing_process need dressing provider l...,curación necesitar apósito proveedor dejar her...,EN_1241536_280479-12,"At some point during the healing process, you ...",En algún momento durante el proceso de curació...,EN
...,...,...,...,...,...,...,...
1062898,1062898,complicación enfermedad grave afectación compl...,complication disease severe ocular complicatio...,ES_833462_8011-30,Las complicaciones de esta enfermedad pueden s...,"Complications of this disease can be severe, w...",ES
1062899,1062899,información prueba servicio embarazo centro sa...,information pregnancy test service health cent...,ES_651664_92975-2,Información sobre pruebas y servicios para el ...,Information about pregnancy tests and services...,ES
1062900,1062900,quedar beneficio secundario importante,information technology clear benefit effect im...,ES_189923_37958-96,2. ¿Le queda claro cuáles beneficios y efectos...,2. Is it clear to you what benefits and side e...,ES
1062901,1062901,limitación serie escaso número paciente anális...,limitation study information technology series...,ES_1043194_9292-51,Existen limitaciones en nuestro estudio: es un...,There are limitations in our study: it is a se...,ES


In [15]:
thetas = sparse.load_npz(path_model / "mallet_output" / "thetas_EN.npz")
raw["thetas"] = list(thetas.toarray())
raw.loc[:, "top_k"] = raw["thetas"].apply(get_doc_top_tpcs)
raw.loc[:, "main_topic"] = raw["thetas"].apply(get_doc_main_topc)

In [18]:
# get topic keys in Engish
with open(path_model / "mallet_output" / "keys_EN.txt", 'r') as file:
    lines = file.readlines()
topic_keys = [line.strip() for line in lines]
topic_keys

['doctor clinic mayo condition search minnesota rochester medical focus disease transplant surname care florida result center treat health location',
 'symptom pain technology medication medicine doctor treat severe feel infection treatment child day people sign skin include fever mild',
 'patient treatment study therapy clinical drug receive organization treat risk month trial follow evidence health rate effective day improve',
 'cancer tumor treatment breast cell radiation technology surgery therapy screening risk tissue thyroid stage treat gland prostate remove hormone',
 'disorder syndrome gene brain cell condition protein genetic affect mutation acid disease technology people function change development seizure develop',
 'baby woman pregnancy birth technology pregnant health infant risk organization week mother bear sex sexual month period delivery newborn',
 'infection hiv person report virus tuberculosis disease laboratory outbreak health testing infect cdc illness result posit

In [20]:
df_en = raw[raw['doc_id'].str.startswith("EN")].copy()
df_es = raw[raw['doc_id'].str.startswith("ES")].copy()

In [22]:
df_es[df_es.main_topic == 5]

Unnamed: 0,id_preproc,lemmas,lemmas_tr,doc_id,text,text_tr,lang,thetas,top_k,main_topic
18,18,contaminación_atmosférico causar cambio_climát...,air_pollution cause climate_change link death ...,ES_865177_116539-11,Esa misma contaminación atmosférica que causa ...,The same air pollution caused by climate chang...,ES,"[0.0, 0.0357142873108387, 0.0, 0.0, 0.0, 0.857...","[(5, 0.8571428656578064), (19, 0.0357142873108...",5
28,28,hijo control anote_fecha hora visita,child control appointment write date time purp...,ES_709235_98634-42,"Si su hijo tiene una cita de control, anote la...","If your child has a control appointment, write...",ES,"[0.0, 0.0, 0.0, 0.19230769574642181, 0.0, 0.65...","[(5, 0.6538461446762085), (3, 0.19230769574642...",5
31,31,suceder prueba,happen test procedure,ES_699215_98273-76,Qué sucedería si no se hace la prueba o el pro...,What would happen if the test or procedure is ...,ES,"[0.0, 0.0, 0.012658228166401386, 0.0, 0.0, 0.4...","[(5, 0.4810126721858978), (18, 0.4050633013248...",5
32,32,analizar_tecnología medir actividad cardíacar_...,study new technology measure fetal_cardiac act...,ES_591388_73930-2,Este estudio analiza una nueva tecnología para...,This study analyzes a new technology to measur...,ES,"[0.0, 0.23529411852359772, 0.0, 0.029411764815...","[(5, 0.3529411852359772), (1, 0.23529411852359...",5
59,59,regular comercialización alimento_bebida salud...,regulate marketing_unhealthy food_beverage pub...,ES_862069_116278-41,Regular la comercialización de alimentos y beb...,Regulate the marketing of unhealthy foods and ...,ES,"[0.07999999821186066, 0.0, 0.0, 0.0, 0.0, 0.80...","[(5, 0.800000011920929), (0, 0.079999998211860...",5
...,...,...,...,...,...,...,...,...,...,...
1062722,1062722,tratamiento convulsión diagnosticar hijo epile...,convulsion treatment child diagnose epilepsy d...,ES_336586_3064-40,Tratamiento para las convulsiones: Si diagnost...,Convulsions Treatment: If your child is diagno...,ES,"[0.0, 0.2857142984867096, 0.0, 0.1785714328289...","[(5, 0.3928571343421936), (1, 0.28571429848670...",5
1062762,1062762,exhale_empuje peso contener respiración_inhale...,exhale_push weight hold_breath time inhale lit...,ES_233066_39166-20,Exhale cuando empuje contra el peso. No conten...,Exhale when pushing against the weight. Do not...,ES,"[0.0, 0.24637681245803833, 0.0, 0.0, 0.0144927...","[(5, 0.49275362491607666), (1, 0.2463768124580...",5
1062766,1062766,persona tímido volver él_sensible sentimiento ...,shy people sensitive people feeling emotion th...,ES_364146_44036-10,Las personas tímidas también pueden volverse m...,Shy people can also become more sensitive to o...,ES,"[0.014925372786819935, 0.014925372786819935, 0...","[(5, 0.43283581733703613), (15, 0.268656730651...",5
1062789,1062789,cuidados vida tema,end life care information topic,ES_283635_40554-44,Cuidados al final de la vida: Para obtener más...,"End-of-life care: For more information, see th...",ES,"[0.0, 0.0, 0.0, 0.0, 0.008620689623057842, 0.5...","[(5, 0.5086206793785095), (17, 0.2327586263418...",5


In [23]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline
import itertools

tokenizer = T5Tokenizer.from_pretrained("Babelscape/t5-base-summarization-claim-extractor")
claim_extractor = T5ForConditionalGeneration.from_pretrained("Babelscape/t5-base-summarization-claim-extractor")
classifier = pipeline(
        "text-classification", 
        model="Nithiwat/bert-base_claimbuster"
)

#lens = [len(c) for c in claims]
#texts = [[text[i]]*lens[i] for i in range(len(lens))]
#scores = [classifier(c)[0]['label'] for c in claims] 

def generate_claims(text):
    tok_input = tokenizer.batch_encode_plus([text], return_tensors="pt", padding=True)
    claims = claim_extractor.generate(**tok_input)
    claims = tokenizer.batch_decode(claims, skip_special_tokens=True)[0].split(".")

    scores = [classifier(c)[0]['label'] for c in claims] 

    return claims, scores

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


In [37]:
df_tpc = df_en[df_en.main_topic==19]
df_tpc = df_tpc[df_tpc['text'].str.contains('baby', case=False, na=False)]
df_tpc

Unnamed: 0,id_preproc,lemmas,lemmas_tr,doc_id,text,text_tr,lang,thetas,top_k,main_topic
411,411,birth_defect birth_defect common costly critical condition affect baby_bear united_states read learn birth_defect woman improve chance baby_bear birth_defect,defecto defecto_congénito condición común costoso crítica afectar bebé nacido unidos lea aprender defecto_congénito mujer mejorar bebé nacido defecto_congénito,EN_992239_17658-0,"What are Birth Defects: Birth defects are common, costly, and critical conditions that affect 1 in every 33 babies born in the United States each year.1 Read more about what we have learned about birth defects and how women can improve their chances of having a baby born without a birth defect.","¿Cuáles son los defectos congénitos?Los defectos congénitos son condiciones comunes, costosas y críticas que afectan a 1 de cada 33 bebés nacidos en los Estados Unidos cada año.1 Lea más sobre lo que hemos aprendido acerca de los defectos congénitos y cómo las mujeres pueden mejorar sus posibilidades de tener un bebé nacido sin un defecto congénito.",EN,"[0.0, 0.239130437374115, 0.0, 0.0, 0.0, 0.021739130839705467, 0.021739130839705467, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.021739130839705467, 0.0, 0.021739130839705467, 0.0, 0.0, 0.0, 0.6739130616188049]","[(19, 0.6739130616188049), (1, 0.239130437374115)]",19
698,698,aliza fussy baby eat conk wake eat bite fall_asleep high chair,alizar bebé_quisquilloso comer contagiar levantar comer_bocado quedar_dormido silla alto,EN_611321_99695-3,"Aliza was a fussy baby, until she ate. Then she’d conk out almost immediately. She would wake up early, and after eating a bite or two, she’d fall asleep in her high chair.","Aliza era un bebé quisquilloso, hasta que comió. Entonces ella se contagiaba casi inmediatamente. Se levantaba temprano, y después de comer un bocado o dos, se quedaba dormida en su silla alta.",EN,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.020408162847161293, 0.0, 0.0, 0.0, 0.0, 0.0, 0.020408162847161293, 0.22448979318141937, 0.0, 0.040816325694322586, 0.020408162847161293, 0.020408162847161293, 0.6530612111091614]","[(19, 0.6530612111091614), (14, 0.22448979318141937)]",19
790,790,miss round report nurse care baby day available bedside round caregiver speaker_phone parent hear update ask question real_time ask nurse share question doctor ask day,olvidar ronda informe enfermera cuidar bebé disponible cama ronda cuidador altavoz padre escuchar actualización pregunta real pedir él enfermera compartir pregunta médico pídale llamar,EN_1420818_306928-4,"If you miss rounds, you can get a report from the nurse caring for your baby later in the day. If you are not available to be at bedside during rounds, the caregiver can use a speaker phone so that parents can hear the update and ask questions in real time. Or, you can ask your nurse to share your questions with your doctor and ask them to call you later that day.","Si se olvida de las rondas, puede obtener un informe de la enfermera que cuida a su bebé más tarde en el día. Si no está disponible para estar al lado de la cama durante las rondas, el cuidador puede usar un altavoz para que los padres puedan escuchar la actualización y hacer preguntas en tiempo real. O puede pedirle a su enfermera que comparta sus preguntas con su médico y pídale que le llame más tarde ese mismo día.",EN,"[0.0, 0.016393441706895828, 0.016393441706895828, 0.016393441706895828, 0.0, 0.0, 0.016393441706895828, 0.0, 0.032786883413791656, 0.08196721225976944, 0.0, 0.04918032884597778, 0.0, 0.0, 0.016393441706895828, 0.0, 0.0, 0.1147540956735611, 0.0, 0.6393442749977112]","[(19, 0.6393442749977112), (17, 0.1147540956735611)]",19
1069,1069,household safety prevent injury crib baby spend lot time crib nap day sleep_night information technology important sure information technology safe environment place baby sleep help prevent sudden infant death syndrome_sids,seguridad hogar prevención lesión cuna bebé pasar cuna dormir_siesta dormir_noche importante asegurar él ambiente seguro coloquir bebé dormir espalda ayudar prevenir síndrome muerte_súbito lactante,EN_988000_171878-0,"Household Safety: Preventing Injuries in the Crib: Your baby will spend a lot of time in the crib, napping during the day and sleeping at night. It's very important to make sure it's always a safe environment. Always place your baby to sleep on his or her back to help prevent sudden infant death syndrome (SIDS).","Seguridad en el hogar: Prevención de lesiones en la cuna: Su bebé pasará mucho tiempo en la cuna, durmiendo la siesta durante el día y durmiendo por la noche. Es muy importante asegurarse de que siempre es un ambiente seguro. Coloque siempre a su bebé a dormir sobre su espalda para ayudar a prevenir el síndrome de muerte súbita del lactante.",EN,"[0.021276595070958138, 0.0, 0.11702127754688263, 0.021276595070958138, 0.05319149047136307, 0.021276595070958138, 0.0, 0.010638297535479069, 0.0, 0.021276595070958138, 0.0, 0.0, 0.0, 0.0, 0.13829787075519562, 0.0, 0.010638297535479069, 0.21276596188545227, 0.010638297535479069, 0.3617021143436432]","[(19, 0.3617021143436432), (17, 0.21276596188545227)]",19
1910,1910,baby adopt offer job base insurance self employ_unemployed qualify_medicare taxes_healthcare taxis form_reconcile tax_credit browse_topic feature_qualify special_enrollment period open_enrollment able enroll health insurance special_enrollment period_answers search_glossary care coordination organization treatment health care provider medical home accountable care organizations common way coordinate care,acabar bebé adoptar menor año seguro ofrecido_autónomo desempleado_calificar medicare_impuestos cuidado salud impuesto_formulario crédito_fiscal reconciliar_tema destacados_averigua calificas_período especial_inscripción abierto inscripción terminar_inscribir tú seguro salud período especial_inscripción respuesta buscar volver_glosario coordinación cuidados tratamiento proveedor_atención médico hogar médico atención responsable forma común coordinar atención,EN_903854_138864-2,"Just had a baby or adopted Are under 30 Have/offered job-based insurance Are self-employed Are unemployed Qualify for Medicare Taxes Healthcare & taxes Form 1095-A 'Reconcile' tax credit Browse all topics Featured Find out if you qualify for a Special Enrollment Period 2023 Open Enrollment is over, but you may still be able to enroll in 2023 health insurance through a Special Enrollment Period. Get Answers Search Back to glossary Care coordination The organization of your treatment across several health care providers. Medical homes and Accountable Care Organizations are two common ways to coordinate care.","Acaban de tener un bebé o ser adoptados Son menores de 30 años Seguro basado en el trabajo / ofrecido Son autónomos Son desempleados Calificar para Medicare Impuestos Cuidado de la salud e impuestos Formulario 1095-A Crédito fiscal 'Reconciliar' Ver todos los temas Destacados Averigua si calificas para un Período Especial de Inscripción 2023 Abierto La inscripción ha terminado, pero es posible que todavía puedas inscribirte en el seguro de salud 2023 a través de un Período Especial de Inscripción. Obtener respuestas Buscar Volver al glosario Coordinación de cuidados La organización de su tratamiento en varios proveedores de atención médica. Los hogares médicos y las organizaciones de atención responsable son dos formas comunes de coordinar la atención.",EN,"[0.0, 0.05882352963089943, 0.0, 0.05882352963089943, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05882352963089943, 0.11764705926179886, 0.0, 0.0, 0.11764705926179886, 0.05882352963089943, 0.05882352963089943, 0.47058823704719543]","[(19, 0.47058823704719543), (16, 0.11764705926179886)]",19
...,...,...,...,...,...,...,...,...,...,...
1390543,1390543,store baby cord blood private blood bank ensure family information technology private blood bank helpful family history health condition treat stem_cell information technology beneficial family member need stem_cell transplant,almacenar sangre_cordón umbilical bebé banco sangre privado asegurar familia usar él banco sangre privado útil familia antecedente enfermedad tratar célula_madre beneficioso miembro familia necesitar trasplante célula_madre,EN_863402_123276-11,You can store your baby's cord blood in a private blood bank. This ensures that only your family can use it. Private blood banks are helpful for families with a history of health conditions that can be treated with stem cells. It's also beneficial if you have a family member currently needing a stem cell transplant.,Puede almacenar la sangre del cordón umbilical de su bebé en un banco de sangre privado. Esto asegura que sólo su familia puede usarlo. Los bancos de sangre privados son útiles para las familias con antecedentes de enfermedades que se pueden tratar con células madre. También es beneficioso si usted tiene un miembro de la familia que actualmente necesita un trasplante de células madre.,EN,"[0.04444444552063942, 0.0, 0.0, 0.02222222276031971, 0.0, 0.02222222276031971, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02222222276031971, 0.0, 0.04444444552063942, 0.02222222276031971, 0.0, 0.0, 0.0, 0.0, 0.8222222328186035]","[(19, 0.8222222328186035), (13, 0.04444444552063942)]",19
1391417,1391417,deal baby world health organization decide reverse cycle sleep_night despise leave leave pumping preparation education minute information technology edge,tratar bebé decidir invertir ciclo dormir_noche despreciar dejar él dejar preparativo bombeo educación minuto borde,EN_80_7-6,"I was dealing with a baby who decided to reverse-cycle (and therefore hardly slept at night) and I despised leaving her. If I had left all of the pumping preparations and education until the last minute, it might have just put me right over the edge.","Estaba tratando con un bebé que decidió invertir el ciclo (y por lo tanto apenas dormía por la noche) y despreciaba dejarla. Si hubiera dejado todos los preparativos de bombeo y la educación hasta el último minuto, podría haberme puesto justo al borde.",EN,"[0.0, 0.06451612710952759, 0.016129031777381897, 0.0, 0.016129031777381897, 0.016129031777381897, 0.0, 0.016129031777381897, 0.016129031777381897, 0.0, 0.25806450843811035, 0.08064515888690948, 0.09677419066429138, 0.032258063554763794, 0.0, 0.06451612710952759, 0.016129031777381897, 0.0, 0.016129031777381897, 0.29032257199287415]","[(19, 0.29032257199287415), (10, 0.25806450843811035)]",19
1391581,1391581,new routine possible create pumping routine base baby breastfeed return work baby milk production adjust new routine able pump mother pumping_session able pump time day,rutina creer rutina bombeo bebé amamantar regresar bebé producción leche ajustar rutina capaz bombear_suficiente frecuencia madre encontrar sesión_bombeo rápido capaz bombear hora,EN_1397715_301977-1,"A new routine: If possible, create a pumping routine based on when your baby would normally breastfeed, especially when first returning to work. But you, your baby, and your milk production will adjust to a new routine if you are able to pump often enough. Many mothers do find pumping sessions go more quickly when they are able to pump at about the same time each day.","Una nueva rutina: Si es posible, cree una rutina de bombeo basada en el momento en que su bebé amamanta normalmente, especialmente cuando regresa al trabajo por primera vez. Pero usted, su bebé y su producción de leche se ajustarán a una nueva rutina si usted es capaz de bombear con suficiente frecuencia. Muchas madres encuentran que las sesiones de bombeo son más rápidas cuando son capaces de bombear a la misma hora cada día.",EN,"[0.0, 0.021739130839705467, 0.0, 0.0, 0.0, 0.043478261679410934, 0.043478261679410934, 0.0, 0.08695652335882187, 0.043478261679410934, 0.021739130839705467, 0.0, 0.0, 0.0, 0.10869564861059189, 0.021739130839705467, 0.0, 0.021739130839705467, 0.0, 0.5869565010070801]","[(19, 0.5869565010070801), (14, 0.10869564861059189)]",19
1391780,1391780,surgical procedure cesarean_section require anesthesia mother epidural_spinal block numb low body mother remain awake baby deliver emergency mother general_anesthetic fall_asleep surgery incision low abdomen follow incision uterus pain associate incision anesthesia,quirúrgico cesárea requerir anestesia madre administrar epidural_bloqueo espinal adormecer inferior cuerpo madre permanecer despertar bebé luz emergencia madre recibir anestésico dormir cirugía incisión inferior_abdomen seguido incisión útero dolor asociado incisión anestesia,EN_1207649_277801-3,"Like many surgical procedures, cesarean sections require anesthesia. Usually, the mother is given an epidural or a spinal block. Both of these will numb the lower body, but the mother will remain awake. If the baby has to be delivered quickly, as in an emergency, the mother may be given a general anesthetic, which will make her fall asleep. During the surgery, an incision is made in the lower abdomen followed by an incision made in the uterus. There is no pain associated with either of these incisions because of the anesthesia.","Como muchos procedimientos quirúrgicos, las cesáreas requieren anestesia. Por lo general, a la madre se le administra una epidural o un bloqueo espinal. Ambos adormecerán la parte inferior del cuerpo, pero la madre permanecerá despierta. Si el bebé tiene que ser dado a luz rápidamente, como en una emergencia, la madre puede recibir un anestésico general, lo que la hará dormir. Durante la cirugía, se realiza una incisión en la parte inferior del abdomen seguida de una incisión en el útero. No hay dolor asociado con ninguna de estas incisiones debido a la anestesia.",EN,"[0.016129031777381897, 0.008064515888690948, 0.0, 0.0, 0.008064515888690948, 0.008064515888690948, 0.0, 0.09677419066429138, 0.0, 0.008064515888690948, 0.008064515888690948, 0.016129031777381897, 0.008064515888690948, 0.07258064299821854, 0.008064515888690948, 0.024193547666072845, 0.09677419066429138, 0.0, 0.008064515888690948, 0.6129032373428345]","[(19, 0.6129032373428345), (7, 0.09677419066429138)]",19


In [38]:
all_ids = []
all_texts = []
all_claims = []
all_scores = []

for id_, el in df_tpc.iterrows():
    claims, scores = generate_claims(el.text)
    texts = [el.text]*len(claims)
    ids = [el.doc_id]*len(claims)

    all_ids.append(ids)
    all_texts.append(texts)
    all_claims.append(claims)
    all_scores.append(scores)


results = pd.DataFrame(
    {
        "ids": list(itertools.chain(*all_ids)),
        "text" : list(itertools.chain(*all_texts)),
        "claims" : list(itertools.chain(*all_claims)),
        "score" : list(itertools.chain(*all_scores))
    }
)

In [39]:
filtered = results[results.score == "LABEL_1"]
filtered

Unnamed: 0,ids,text,claims,score
0,EN_992239_17658-0,"What are Birth Defects: Birth defects are common, costly, and critical conditions that affect 1 in every 33 babies born in the United States each year.1 Read more about what we have learned about birth defects and how women can improve their chances of having a baby born without a birth defect.","Birth defects are common, costly, and critical conditions",LABEL_1
1,EN_992239_17658-0,"What are Birth Defects: Birth defects are common, costly, and critical conditions that affect 1 in every 33 babies born in the United States each year.1 Read more about what we have learned about birth defects and how women can improve their chances of having a baby born without a birth defect.",Birth defects affect 1 in every 33 babies born in the United States each year,LABEL_1
6,EN_611321_99695-3,"Aliza was a fussy baby, until she ate. Then she’d conk out almost immediately. She would wake up early, and after eating a bite or two, she’d fall asleep in her high chair.",She’d conk out almost immediately,LABEL_1
20,EN_903854_138864-2,"Just had a baby or adopted Are under 30 Have/offered job-based insurance Are self-employed Are unemployed Qualify for Medicare Taxes Healthcare & taxes Form 1095-A 'Reconcile' tax credit Browse all topics Featured Find out if you qualify for a Special Enrollment Period 2023 Open Enrollment is over, but you may still be able to enroll in 2023 health insurance through a Special Enrollment Period. Get Answers Search Back to glossary Care coordination The organization of your treatment across several health care providers. Medical homes and Accountable Care Organizations are two common ways to coordinate care.",Just had a baby or adopted Are under 30 Have/offered job-based insurance Are self-employed Are unemployed Qualify for Medicare Taxes Healthcare & taxes Form 1095-A 'Reconcile' tax credit Browse all topics Featured Find out if you qualify for a Special Enrollment Period 2023 Open Enrollment is over,LABEL_1
21,EN_903854_138864-2,"Just had a baby or adopted Are under 30 Have/offered job-based insurance Are self-employed Are unemployed Qualify for Medicare Taxes Healthcare & taxes Form 1095-A 'Reconcile' tax credit Browse all topics Featured Find out if you qualify for a Special Enrollment Period 2023 Open Enrollment is over, but you may still be able to enroll in 2023 health insurance through a Special Enrollment Period. Get Answers Search Back to glossary Care coordination The organization of your treatment across several health care providers. Medical homes and Accountable Care Organizations are two common ways to coordinate care.",You may still be able to enroll in 2023 health insurance through a Special Enrollment Period,LABEL_1
...,...,...,...,...
12023,EN_1397715_301977-1,"A new routine: If possible, create a pumping routine based on when your baby would normally breastfeed, especially when first returning to work. But you, your baby, and your milk production will adjust to a new routine if you are able to pump often enough. Many mothers do find pumping sessions go more quickly when they are able to pump at about the same time each day.",Many mothers do find pumping sessions go more quickly when they are able to pump at about the same time each day,LABEL_1
12025,EN_1207649_277801-3,"Like many surgical procedures, cesarean sections require anesthesia. Usually, the mother is given an epidural or a spinal block. Both of these will numb the lower body, but the mother will remain awake. If the baby has to be delivered quickly, as in an emergency, the mother may be given a general anesthetic, which will make her fall asleep. During the surgery, an incision is made in the lower abdomen followed by an incision made in the uterus. There is no pain associated with either of these incisions because of the anesthesia.",Cesarean sections require anesthesia,LABEL_1
12033,EN_1207649_277801-3,"Like many surgical procedures, cesarean sections require anesthesia. Usually, the mother is given an epidural or a spinal block. Both of these will numb the lower body, but the mother will remain awake. If the baby has to be delivered quickly, as in an emergency, the mother may be given a general anesthetic, which will make her fall asleep. During the surgery, an incision is made in the lower abdomen followed by an incision made in the uterus. There is no pain associated with either of these incisions because of the anesthesia.",There is no pain associated with either of these incisions because of the anesthetic,LABEL_1
12035,EN_925569_154416-10,"How Is Newborn Screening Done: A small blood sample taken by pricking the baby's heel is tested. This happens before the baby leaves the hospital, usually at 1 or 2 days of age. Talk to your doctor about newborn screening if your baby was not born in a hospital.",A small blood sample is taken by pricking the baby's heel,LABEL_1


In [None]:
class GenerateQuestion(dspy.Signature):
    fact = dspy.InputField()
    context = dspy.InputField()
    question = dspy.OutputField(desc="it asks the fact", prefix="Question:")


class QAGeneratorModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_question = dspy.Predict("fact,context->question")

    def forward(self, fact,context):
        question = self.generate_question(fact=fact, context=context).question
        return dspy.Prediction(question=question)

qa_gen = QAGeneratorModule()

class TranslatorModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.translate = dspy.Predict("english->spanish")

    def forward(self, english):
        spanish = self.translate(english=english).spanish
        return spanish
tr = TranslatorModule()
tr("What is the best way to help your baby fall asleep?")

In [None]:
questions = [qa_gen(q,con).question for q,con in zip(filtered.claims.values.tolist(),filtered.text.values.tolist())]
filtered["question"] = questions

filtered["len"] = filtered["question"].apply(lambda x: len(x))
filtered_2 = filtered[
    (filtered["len"] <= 300) &
    (filtered["question"].str.endswith("?"))
].drop_duplicates(subset=['question'], keep='first', inplace=False)
filtered_2.to_excel("test_tpc5_multi.xlsx")