In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pathlib
import os
from dotenv import load_dotenv
import time
import sys
import csv

csv.field_size_limit(100000000)

131072

In [2]:
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
#from langchain_core.prompts import PromptTemplate
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

## OpenAI APIKEY

In [3]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = api_key

## Paths

In [93]:
path_orig_en = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_en_tr.parquet")
path_orig_es = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_es_tr.parquet")
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/df_1.parquet")

path_model = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/LDA/passage/rosie_lg_lda_1_20")

path_corpus_en = path_model / "train_data" / "corpus_EN.txt"
path_corpus_es = path_model / "train_data" / "corpus_ES.txt"

persist_directory = (path_model / 'db_en_pred_test12').as_posix()

## Read

In [5]:
raw_en = pd.read_parquet(path_source)
raw_en.head()

Unnamed: 0,id_preproc,lemmas,doc_id,text,lang
0,0,decrease initiation prevalence smoking hungary...,EN_492297_60866-9,To decrease the initiation and prevalence of s...,EN
1,1,value add table calculated entrance_exit skin ...,EN_143330_25224-123,Values added to Table 3-3; The calculated entr...,EN
2,2,outbreak duval_county begin april peak october...,EN_524864_63868-2,The outbreak in Duval County began in April 19...,EN
3,3,broder rapid communication bethesda system rep...,EN_518687_63341-14,Broder S. Rapid communication: the Bethesda Sy...,EN
4,4,opportunity meet social_worker child life spec...,EN_569477_70415-6,You will have the opportunity to meet with our...,EN


In [6]:
raw_en.columns

Index(['id_preproc', 'lemmas', 'doc_id', 'text', 'lang'], dtype='object')

In [7]:
with path_corpus_en.open("r", encoding="utf-8") as f:
    lines = [line for line in f.readlines()]
corpus_en = [line.rsplit(" 0 ")[1].strip().split() for line in lines]

ids = [line.split(" 0 ")[0] for line in lines]
df_en = pd.DataFrame({"lemmas": [" ".join(doc) for doc in corpus_en]})
df_en["doc_id"] = ids
df_en["len"] = df_en['lemmas'].apply(lambda x: len(x.split()))
df_en["id_top"] = range(len(df_en))

In [8]:
#corpus_en[0]

In [9]:
df_en_raw = df_en.merge(raw_en, how="inner", on="doc_id")[["doc_id", "id_top", "id_preproc", "lemmas_x", "text", "len"]]
df_en_raw

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len
0,EN_492297_60866-9,0,0,decrease initiation prevalence smoking hungary...,To decrease the initiation and prevalence of s...,63
1,EN_143330_25224-123,1,1,value add table calculated entrance_exit skin ...,Values added to Table 3-3; The calculated entr...,61
2,EN_524864_63868-2,2,2,outbreak duval_county begin april peak october...,The outbreak in Duval County began in April 19...,58
3,EN_518687_63341-14,3,3,broder rapid communication bethesda system rep...,Broder S. Rapid communication: the Bethesda Sy...,13
4,EN_569477_70415-6,4,4,opportunity meet social_worker child life spec...,You will have the opportunity to meet with our...,9
...,...,...,...,...,...,...
1393108,EN_485225_59974-3,1393108,1393108,acip cdc determine priority group rank_tier ba...,"ACIP and CDC determined the priority groups, r...",61
1393109,EN_1219267_277263-11,1393109,1393109,surgical lie operating table intravenous_intra...,For a surgical biopsy:\n- You'll lie on an ope...,53
1393110,EN_1238215_279154-29,1393110,1393110,cure rate people stage tumor people stage intr...,The cure rates for people with stage III tumor...,11
1393111,EN_321620_48080-1,1393111,1393111,suggested_citation article schneider kl lapane...,Suggested citation for this article: Schneider...,21


In [10]:
# Read thetas 
thetas = sparse.load_npz(path_model.joinpath(f"mallet_output/{'EN'}/thetas.npz")).toarray()
betas = np.load((path_model.joinpath(f"mallet_output/{'EN'}/betas.npy")))
def get_thetas_str(row,thetas):
    return " ".join([f"{id_}|{round(el, 4)}" for id_,el in enumerate(thetas[row]) if el!=0.0])

def get_most_repr_tpc(row,thetas):
    return np.argmax(thetas[row])

df_en_raw["thetas"] = df_en_raw.apply(lambda row: get_thetas_str(row['id_top'], thetas), axis=1)
df_en_raw["id_tpc"] = df_en_raw.apply(lambda row: get_most_repr_tpc(row['id_top'], thetas), axis=1)

In [11]:
df_en_raw.head()

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len,thetas,id_tpc
0,EN_492297_60866-9,0,0,decrease initiation prevalence smoking hungary...,To decrease the initiation and prevalence of s...,63,1|0.30140000581741333 6|0.04839999973773956 10...,10
1,EN_143330_25224-123,1,1,value add table calculated entrance_exit skin ...,Values added to Table 3-3; The calculated entr...,61,3|0.08810000121593475 7|0.45579999685287476 11...,7
2,EN_524864_63868-2,2,2,outbreak duval_county begin april peak october...,The outbreak in Duval County began in April 19...,58,1|0.22429999709129333 15|0.7408999800682068 17...,15
3,EN_518687_63341-14,3,3,broder rapid communication bethesda system rep...,Broder S. Rapid communication: the Bethesda Sy...,13,4|0.22939999401569366 6|0.004100000020116568 1...,19
4,EN_569477_70415-6,4,4,opportunity meet social_worker child life spec...,You will have the opportunity to meet with our...,9,4|0.0035000001080334187 6|0.005900000222027302...,14


## Merge with preds

In [12]:
df_pred = pd.read_csv("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/filtering/rosie_1_20/df_docs_predicted_EN.csv")
df_pred

Unnamed: 0,lemmas_x_x,id_top,doc_id,id_preproc,lemmas_x_y,text,len,label,human_labeled,predicted_label
0,order avoid future allergic_reaction range vom...,0,EN_607595_99219-9,0,order avoid future allergic_reaction range vom...,In order to avoid future allergic reactions – ...,24,1.0,True,
1,author_affiliations shinyi_wu phd university s...,1,EN_183633_46512-40,1,author_affiliations shinyi_wu phd university s...,"Author Affiliations: Shinyi Wu, PhD, Universit...",18,0.0,False,0.0
2,review_feb good doctor,2,EN_854328_122182-27,2,review_feb good doctor,"Reviewed on Feb 24, 2023: He is a very good do...",3,1.0,False,1.0
3,remove morning remove water balloon place syri...,3,EN_1361170_297168-6,3,remove morning remove water balloon place syri...,"Removing the catheter:\n- In the morning, remo...",41,1.0,False,1.0
4,risk factor child diagnose mis_c age year case...,4,EN_1091308_196292-20,4,risk factor child diagnose mis_c age year old ...,Risk factors: Children diagnosed with MIS-C ar...,17,1.0,False,1.0
...,...,...,...,...,...,...,...,...,...,...
1393092,overview photograph presence tumor vertebrae t...,1393108,EN_1292789_287135-0,1393108,overview photograph presence tumor vertebrae t...,Overview: This photograph shows the presence o...,11,1.0,False,1.0
1393093,pain bad day esi information technology begin ...,1393109,EN_771214_115470-28,1393109,pain bad day esi information technology begin ...,Your pain may become worse for two to three da...,16,1.0,False,1.0
1393094,weaver parent promote healthy body image child...,1393110,EN_595076_97282-3,1393110,ask weaver parent promote healthy body image c...,We asked Dr. Weaver how parents can promote he...,108,1.0,False,1.0
1393095,choi_peter meyerson_matthew targeted genomic_r...,1393111,EN_629755_104139-9,1393111,choi_peter s meyerson_matthew targeted genomic...,"2014: Choi Peter S, Meyerson Matthew: Targeted...",9,0.0,False,0.0


In [13]:
df_keep = df_pred[(df_pred.label == 1.0) | (df_pred.predicted_label == 1.0)]
df_keep

Unnamed: 0,lemmas_x_x,id_top,doc_id,id_preproc,lemmas_x_y,text,len,label,human_labeled,predicted_label
0,order avoid future allergic_reaction range vom...,0,EN_607595_99219-9,0,order avoid future allergic_reaction range vom...,In order to avoid future allergic reactions – ...,24,1.0,True,
2,review_feb good doctor,2,EN_854328_122182-27,2,review_feb good doctor,"Reviewed on Feb 24, 2023: He is a very good do...",3,1.0,False,1.0
3,remove morning remove water balloon place syri...,3,EN_1361170_297168-6,3,remove morning remove water balloon place syri...,"Removing the catheter:\n- In the morning, remo...",41,1.0,False,1.0
4,risk factor child diagnose mis_c age year case...,4,EN_1091308_196292-20,4,risk factor child diagnose mis_c age year old ...,Risk factors: Children diagnosed with MIS-C ar...,17,1.0,False,1.0
5,impact year single mom kid home schooling info...,5,EN_955188_164961-11,5,impact year single mom kid home schooling info...,How has COVID-19 impacted you personally and p...,37,1.0,False,1.0
...,...,...,...,...,...,...,...,...,...,...
1393088,comprises_dose diphtheria_tetanus toxoid_pertu...,1393104,EN_405733_61282-13,1393104,comprises_dose diphtheria_tetanus toxoid_pertu...,§ Comprises >4 doses of diphtheria and tetanus...,30,1.0,False,1.0
1393089,work good treatment talk pediatrician infant b...,1393105,EN_915674_151728-5,1393105,work good treatment talk pediatrician infant b...,"If that doesn’t work, what are the best treatm...",23,1.0,False,1.0
1393092,overview photograph presence tumor vertebrae t...,1393108,EN_1292789_287135-0,1393108,overview photograph presence tumor vertebrae t...,Overview: This photograph shows the presence o...,11,1.0,False,1.0
1393093,pain bad day esi information technology begin ...,1393109,EN_771214_115470-28,1393109,pain bad day esi information technology begin ...,Your pain may become worse for two to three da...,16,1.0,False,1.0


In [14]:
df_keep_ = df_keep.merge(df_en_raw, how="inner", on="id_top")
df_keep_ = df_keep_.loc[:, ~df_keep_.columns.str.endswith('_y')]
df_keep_.columns = df_keep_.columns.str.replace('_x', '')  # Remove '_x' suffix
df_keep_

Unnamed: 0,lemmas,id_top,doc_id,id_preproc,text,len,label,human_labeled,predicted_label,lemmas.1,thetas,id_tpc
0,order avoid future allergic_reaction range vom...,0,EN_607595_99219-9,0,In order to avoid future allergic reactions – ...,24,1.0,True,,decrease initiation prevalence smoking hungary...,1|0.30140000581741333 6|0.04839999973773956 10...,10
1,review_feb good doctor,2,EN_854328_122182-27,2,"Reviewed on Feb 24, 2023: He is a very good do...",3,1.0,False,1.0,outbreak duval_county begin april peak october...,1|0.22429999709129333 15|0.7408999800682068 17...,15
2,remove morning remove water balloon place syri...,3,EN_1361170_297168-6,3,"Removing the catheter:\n- In the morning, remo...",41,1.0,False,1.0,broder rapid communication bethesda system rep...,4|0.22939999401569366 6|0.004100000020116568 1...,19
3,risk factor child diagnose mis_c age year case...,4,EN_1091308_196292-20,4,Risk factors: Children diagnosed with MIS-C ar...,17,1.0,False,1.0,opportunity meet social_worker child life spec...,4|0.0035000001080334187 6|0.005900000222027302...,14
4,impact year single mom kid home schooling info...,5,EN_955188_164961-11,5,How has COVID-19 impacted you personally and p...,37,1.0,False,1.0,sexual_orientation feel fluid,0|0.005100000184029341 1|0.2888000011444092 2|...,12
...,...,...,...,...,...,...,...,...,...,...,...,...
985220,comprises_dose diphtheria_tetanus toxoid_pertu...,1393104,EN_405733_61282-13,1393104,§ Comprises >4 doses of diphtheria and tetanus...,30,1.0,False,1.0,review_jan continue recommend,0|0.005100000184029341 1|0.0066999997943639755...,6
985221,work good treatment talk pediatrician infant b...,1393105,EN_915674_151728-5,1393105,"If that doesn’t work, what are the best treatm...",23,1.0,False,1.0,acknowledgment amob_vll major program activity...,1|0.015300000086426735 10|0.47769999504089355 ...,13
985222,overview photograph presence tumor vertebrae t...,1393108,EN_1292789_287135-0,1393108,Overview: This photograph shows the presence o...,11,1.0,False,1.0,acip cdc determine priority group rank_tier ba...,15|1.0,15
985223,pain bad day esi information technology begin ...,1393109,EN_771214_115470-28,1393109,Your pain may become worse for two to three da...,16,1.0,False,1.0,surgical lie operating table intravenous_intra...,0|0.0203000009059906 8|0.04039999842643738 12|...,12


In [15]:
# Filter thetas by df_keep
df_keep_idx_set = set(df_keep.id_top.values.tolist())

# Create a mask where True indicates the index is in df_keep_idx_set
mask = np.array([i in df_keep_idx_set for i in range(thetas.shape[0])])

# Create a copy of thetas
thetas_ = thetas.copy()

# Set values to 0 where the mask is False
thetas_[~mask, :] = 0
thetas_.shape

(1393113, 20)

In [16]:
# Load vocab dictionaries
vocab_w2id = {}
vocab_id2w = {}

# Open the file and read the lines
with open((path_model / "mallet_output/EN" / "vocab_freq.txt"), 'r', encoding='utf8') as file:
    for i, line in enumerate(file):
        # Strip leading and trailing whitespace
        stripped_line = line.strip()
        # Split the line into words and numbers
        parts = stripped_line.split()
        if parts:
            # Get the word (first part)
            wd = parts[0]
            # Populate the dictionaries
            vocab_w2id[wd] = i
            vocab_id2w[str(i)] = wd

## Topics

In [17]:
# Load topic-keys
keys = []
with open((path_model / "mallet_output/EN" / "topickeys.txt"), 'r') as file:
    for line in file:
        # Strip leading and trailing whitespace
        stripped_line = line.strip()
        # Split the line into parts and ignore the first two parts (number and float)
        parts = stripped_line.split(maxsplit=2)
        if len(parts) > 2:
            text_part = parts[2]
            keys.append(text_part)

In [18]:
keys

['test heart doctor_begin blood disease image doctor technology lung information procedure magnetic_resonance imaging diagnose surgery result provider body ultrasound condition',
 'age year high rate report health student death prevalence increase world organization state woman white adult person black population group',
 'food eat water outbreak product information people report healthy illness diet include ill store animal source drink technology fda day',
 'information technology skin symptom infection child people body common area severe bacteria treatment include condition treat pain day spread eye',
 'case infection patient health hiv test report person tuberculosis world disease organization testing laboratory result cdc virus risk treatment transmission',
 'medication information technology treatment medicine doctor drug treat symptom effect dose day prescribe pain provider time stop healthcare_provider talk child',
 'information technology child time family good health day par

## Create VectorDB for each topic

In [94]:
# Create a text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Select topic to analyze
topic = 18
df_topic = df_keep_[df_keep_.id_tpc == topic]

# Create documents and split them into chunks
#documents = [
#    Document(page_content=chunk, metadata={"url": "local", "source": "initial", "identifier": row['doc_id'], "id_top": row["id_top"]})
#    for idx, row in df_topic.iterrows()
#    for chunk in text_splitter.split_text(row['text'])
#]

# Get most representative document for that topic and assume it is free of contradictions
thetas_topic = thetas.T[topic]
top_doc_topic = np.argsort(thetas_topic)[::-1][0]
df_topic_doc = df_topic[df_topic.id_top == top_doc_topic]

# Create Langchain document for that doc
documents = [
    Document(page_content=chunk, metadata={"identifier": row['doc_id'], "source": row["id_top"]})
    for idx, row in df_topic_doc.iterrows()
    for chunk in text_splitter.split_text(row['text'])
]

In [20]:
np.argsort(thetas_topic)[::-1][0]

921550

In [21]:
df_topic_doc.text

651670    Don't forget that children with ASD have the s...
Name: text, dtype: object

In [47]:
documents

[Document(page_content="Don't forget that children with ASD have the same healthcare needs as any other child and benefit from the same healthcare and disease prevention services. Accessing healthcare services can be challenging for children with autism because of communication barriers and sensory sensitivities. Preparation and planning help make medical visits more familiar and comfortable for children and their caregivers. As children with autism learn to cooperate with medical visits, they are better able to participate in healthy living and they build trust with their healthcare providers that impacts their overall health for their lifetime. Children’s National has a variety of visual supports and other resources to help families prepare and plan for medical visits.", metadata={'identifier': 'EN_474213_67961-3', 'source': 921550})]

In [95]:
start = time.time()
# Define embeddings
embedding = OpenAIEmbeddings()

# Create vector database with this document
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    persist_directory=persist_directory 
)
# Tiempo total de ejecución, tarda 51 mins en 300k docs
end = time.time()
print(f"Total time is {end - start} seconds")

Total time is 0.5859715938568115 seconds


Faithfulness: Choose one of the following options:

True: No contradiction or discrepancy.
Minor Discrepancy: A minor inconsistency that does not change the overall meaning. Examples include slight wording differences or small factual errors that do not affect the main message.
Major Discrepancy: A significant inconsistency that alters the meaning or could mislead. Examples include notable factual errors or significant shifts in focus that change the context or understanding.
Direct Contradiction: A direct and clear opposition to the context, making both statements impossible to be true simultaneously.
Not Applicable: If the CONTEXT and TEXT are about entirely different topics, there is no inconsistency.


In [96]:
prompt_template_text = """
Identify if there are contradictions with the provided context.

CONTEXT: {summaries}

TEXT: {question}

---

Follow the following format.

Faithfulness: One of the following:
- True: There is a direct and clear opposition to the context.
- False: No contradiction or the new text is not relevant to the context provided.
Rationale: Why the new text contradicts the context or not
Type: One of the following
- Minor Discrepancy: A minor inconsistency that does not change the overall meaning. Examples include slight wording differences or small factual errors that do not affect the main message.
- Major Discrepancy: A significant inconsistency that alters the meaning or could mislead. Examples include notable factual errors or significant shifts in focus that change the context or understanding.
- Direct Contradiction: A direct and clear opposition to the context, making both statements impossible to be true simultaneously.
- N/A
Ids_contradiction: List with the 'source' of the documents in which the contradictions were found
---
"""
prompt_template = PromptTemplate.from_template(template=prompt_template_text)

In [97]:
# Set up the turbo LLM
llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-3.5-turbo'
)

# Define retriever
nb_retrieval_docs = 5
retriever = vectorstore.as_retriever(
    search_kwargs={"k": nb_retrieval_docs}
)

# Crear la chain
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=False,
    chain_type_kwargs={
        "verbose": False,
        "prompt": prompt_template
    }
)

In [26]:
df_topic.iloc[0].text

'One thing’s for sure: Visceral pain isn’t “all in your head." But it is a little bit. That makes it trickier to target than a mechanical problem, but it also gives you some power. By activating the power of your brain, you can work to change your own neural pathways to reduce pain.'

In [27]:
retriever.get_relevant_documents(df_topic.iloc[0].text)

  warn_deprecated(
Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


[Document(page_content="Don't forget that children with ASD have the same healthcare needs as any other child and benefit from the same healthcare and disease prevention services. Accessing healthcare services can be challenging for children with autism because of communication barriers and sensory sensitivities. Preparation and planning help make medical visits more familiar and comfortable for children and their caregivers. As children with autism learn to cooperate with medical visits, they are better able to participate in healthy living and they build trust with their healthcare providers that impacts their overall health for their lifetime. Children’s National has a variety of visual supports and other resources to help families prepare and plan for medical visits.", metadata={'identifier': 'EN_474213_67961-3', 'source': 921550})]

In [28]:
"""
import pickle
with open('non_consistent.pkl', 'wb') as file:
    pickle.dump(non_consistent, file)
"""

"\nimport pickle\nwith open('non_consistent.pkl', 'wb') as file:\n    pickle.dump(non_consistent, file)\n"

In [29]:
#non_consistent[17]

In [30]:
print(df_topic[df_topic.doc_id=="EN_1347456_294495-14"].text.values[0])
print("\n######\n")
print(df_topic[df_topic.id_top==765831].text.values[0])

IBS can be painful, but it doesn’t lead to other health problems or damage the digestive tract.

######

Luckily, diarrhea is usually short-lived, lasting no more than a few days. But when diarrhea lasts beyond a few days into weeks, it usually indicates that there's another problem — such as irritable bowel syndrome (IBS) or a more serious disorder, including persistent infection, celiac disease or inflammatory bowel disease (IBD).


In [98]:
results = []
for index, row in df_topic.sample(n=1000, random_state=1).iterrows():
    if index % 100 == 0:
        print(f"-- -- Processed index {index / len(df_topic)}")
    d_response = chain({"question": row.text})
    #print(d_response)
    try:
        faithfulness = d_response["answer"].split("Faithfulness: ")[1].split("Rationale")[0].strip()
        rationale = d_response["answer"].split("Rationale: ")[1].strip().split("Type")[0].strip()
        type_ = d_response["answer"].split("Type: ")[1].strip().split("Ids_contradiction")[0].strip()
        Ids_contradiction = d_response["answer"].split("Ids_contradiction: ")[1].strip()
    except:
        import pdb; pdb.set_trace()
    
    if faithfulness == "False":
        vectorstore.add_documents([Document(page_content=row['text'], metadata={"identifier": row['doc_id'], "source": row["id_top"]})])
    results.append([row.doc_id, faithfulness, rationale,type_, Ids_contradiction])

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 5 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 5 is greater than number of elements in index 3, updating n_results = 3
Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4
Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4


-- -- Processed index 27.718132748087076
-- -- Processed index 22.433121774719734
-- -- Processed index 14.422563615872827
-- -- Processed index 8.191470431223678
-- -- Processed index 9.33329378966724
-- -- Processed index 26.241176819502936
-- -- Processed index 19.980425885283825
-- -- Processed index 8.947743045257726
-- -- Processed index 22.818672519129247
-- -- Processed index 3.8851651936651046


In [104]:
df_res = pd.DataFrame(results, columns=["id", "faith", "reason", "type", "source"])
df_res

Unnamed: 0,id,faith,reason,type,source
0,EN_445197_64755-19,False,The text about mortality rates due to resident...,Minor Discrepancy,921550
1,EN_477738_69621-4,False,The new text about flu shots for children does...,Minor Discrepancy,
2,EN_49947_31287-4,False,The new text about flu shots for children does...,Minor Discrepancy,183192
3,EN_372769_57249-2,True,The context emphasizes the importance of using...,Major Discrepancy,"1372827, 778261"
4,EN_1301077_289674-7,False,The new text about ring chromosome 14 being pa...,Minor Discrepancy,
...,...,...,...,...,...
995,EN_886974_127793-6,False,The new text about keeping an X-ray record car...,Minor Discrepancy,
996,EN_829124_16393-6,True,The new text discusses the possibility of fals...,Major Discrepancy,1041356
997,EN_1081782_195120-71,True,The new text discusses functional imaging and ...,Major Discrepancy,
998,EN_137015_44586-3,True,The new text states that suicide was the 10th ...,Major Discrepancy,"946664, 848602, 1372827, 744388"


In [110]:
df_res[df_res["type"] == "Direct Contradiction"].iloc[0]

id                                      EN_1256734_281633-3
faith                                                  True
reason    The context mentions that a child should recei...
type                                   Direct Contradiction
source                                               217686
Name: 15, dtype: object

In [111]:
df_res[df_res["type"] == "Direct Contradiction"].iloc[0].source

'217686'

In [143]:
id_=3
print(df_res[df_res["type"] == "Minor Discrepancy"].iloc[id_].reason)
print("\n######\n")
print(df_topic[df_topic.doc_id==df_res[df_res["type"] == "Minor Discrepancy"].iloc[id_].id].text.values[0])
print("\n######\n")
try:
    print(df_topic[df_topic.id_top==int(df_res[df_res["type"] == "Minor Discrepancy"].iloc[id_].source)].text.values[0])
except:
    ids = df_res[df_res["type"] == "Direct Contradiction"].iloc[id_].source.split()
    #.split()
    for el in ids:                         
        print(df_topic[df_topic.id_top==int(el.strip(','))].text.values[0])

The new text about ring chromosome 14 being passed from a mother to her children does not directly contradict the provided context about flu shots, healthcare needs for children with ASD, mortality rates due to residential fires, or CDC programs for American Indian and Alaska Native people.

######

Most affected individuals have no history of the disorder in their families. However, at least two families have been reported in which a ring chromosome 14 was passed from a mother to her children.

######

Help your patients and clients build a healthy eating pattern — one they can follow throughout the year.


In [144]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [146]:
df_res[df_res.faith == "False"]

Unnamed: 0,id,faith,reason,type,source
0,EN_445197_64755-19,False,The text about mortality rates due to residential fires does not directly contradict the context about healthcare needs for children with ASD.,Minor Discrepancy,921550
1,EN_477738_69621-4,False,The new text about flu shots for children does not directly contradict the context provided about healthcare needs for children with ASD. It is not directly related to the information given.,Minor Discrepancy,
2,EN_49947_31287-4,False,The new text about flu shots for children does not directly contradict the context provided about healthcare needs for children with ASD. It is not directly related to the information given.,Minor Discrepancy,183192
4,EN_1301077_289674-7,False,"The new text about ring chromosome 14 being passed from a mother to her children does not directly contradict the provided context about flu shots, healthcare needs for children with ASD, mortality rates due to residential fires, or CDC programs for American Indian and Alaska Native people.",Minor Discrepancy,
6,EN_313866_53748-2,False,The new text about untreated dental caries is not directly relevant to the context provided about CDC's work with tribal organizations and promoting health among American Indian and Alaska Native people.,Minor Discrepancy,217686
...,...,...,...,...,...
974,EN_609541_99474-20,False,"The statement by Julian Allen, MD, does not directly contradict the context provided. It adds to the discussion about asthma but does not specifically address the ability to monitor lung function.",Minor Discrepancy,
987,EN_1097048_196848-3,False,"The text provided discusses the causes of acute kidney failure, which is not directly related to the context of ureteral cancer and the expertise of the professionals at Mayo Clinic.",Minor Discrepancy,1133839
989,EN_910105_150929-17,False,The new text is not relevant to the context provided.,,
994,EN_699518_109697-7,False,"The new text does not directly contradict the context provided. It simply adds information about individuals being welcome with or without a physician's referral, which does not conflict with the original content.",Minor Discrepancy,


In [154]:
for i in range(15):
    print("REASON ", df_res[df_res.faith == "False"].iloc[i].reason)
    print("\n######\n")
    print("TEXT ", df_topic[df_topic.doc_id==df_res[df_res["faith"] == "False"].iloc[i].id].text.values[0])
    print("\n######\n")

REASON  The text about mortality rates due to residential fires does not directly contradict the context about healthcare needs for children with ASD.

######

TEXT  Mortality rates due to residential fires differ markedly for blacks and whites. In 1984, the rate among blacks was 4.8/100,000; it was 1.5/100,000 among whites. By region, the mortality rate due to residential fires was highest in the South and lowest in the West. By age group, it was highest for persons 65 years of age or older (4.6/100,000) and second highest for children under 5 years of age (4.1/100,000).

######

REASON  The new text about flu shots for children does not directly contradict the context provided about healthcare needs for children with ASD. It is not directly related to the information given.

######

TEXT  A flu shot should also be given to:\n- A child who has a family member with a long-term (chronic) health condition\n- A child or teen taking aspirin as long-term therapy\n- A child with parents or c