# Downloading, Unzipping, extracting trial's parts, create final dataframe. [No need to run]

## Libraries and Data Downloading

In [None]:
# import os
# import glob
# import pickle
# import pandas as pd

# !wget https://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part1.zip
# !wget https://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part2.zip
# !wget https://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part3.zip
# !wget https://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part4.zip
# !wget https://www.trec-cds.org/2021_data/ClinicalTrials.2021-04-27.part5.zip


## Unzipping

In [None]:
# zip_files = ['path to each zip','']
# extracted_folder = 'path to folder to extract'

# for zip_file_path in zip_files:
#     # Unzip the file into the specified extracted folder
#     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#         zip_ref.extractall(extracted_folder)

#     # Print the contents of the extracted folder after each zip is processed
#     print(f"Contents of {zip_file_path} extracted to: {extracted_folder}")

# print("Unzipping completed successfully.")

## Util Functions to extract document parts

In [None]:
## Util functions to extract criteria
def get_element_text(element):
    return element.text.strip() if element is not None and element.text is not None else ''
import re

## Extraction of inclusion and exclusion criteria
def extract_criteria(criteria_text):
    inclusion_criteria = re.findall(r'Inclusion Criteria|Inclusion criteria|INCLUSION CRITERIA|Main inclusion criteria include', criteria_text)
    exclusion_criteria = re.findall(r'Exclusion Criteria|EXCLUSION CRITERIA|Exclusion criteria|Main exclusion criteria include', criteria_text)

    # Different cases that the inclusion/exclusion criteria can be found
    if inclusion_criteria and exclusion_criteria:
        # Case 1: Both Inclusion and Exclusion Criteria are present
        inclusion_index = criteria_text.find(inclusion_criteria[0])
        exclusion_index = criteria_text.find(exclusion_criteria[0])
        eligibility_data = {
            'eligibility_inclusion_criteria': criteria_text[inclusion_index+len(inclusion_criteria[0]):exclusion_index].strip(),
            'eligibility_exclusion_criteria': criteria_text[exclusion_index+len(exclusion_criteria[0]):].strip(),
        }
    elif inclusion_criteria:
        # Case 3: Only Inclusion Criteria is present
        inclusion_index = criteria_text.find(inclusion_criteria[0])
        eligibility_data = {
            'eligibility_inclusion_criteria': criteria_text[inclusion_index+len(inclusion_criteria[0]):].strip(),
            'eligibility_exclusion_criteria': '',
        }
    elif exclusion_criteria:
        # Case 4: Only Exclusion Criteria is present
        exclusion_index = criteria_text.find(exclusion_criteria[0])
        eligibility_data = {
            'eligibility_inclusion_criteria': criteria_text[:exclusion_index].strip(),
            'eligibility_exclusion_criteria': criteria_text[exclusion_index+len(exclusion_criteria[0]):].strip(),
        }
    else:
        # Case 5: Neither Inclusion nor Exclusion Criteria are present
        inclusion = re.findall(r'Inclusion|INCLUSION', criteria_text)
        exclusion = re.findall(r'Exclusion|EXCLUSION', criteria_text)

        if inclusion and exclusion:
          # Case 6: Both Inclusion and Exclusion are present
          inclusion_index = criteria_text.find(inclusion[0])
          exclusion_index = criteria_text.find(exclusion[0])
          eligibility_data = {
              'eligibility_inclusion_criteria': criteria_text[inclusion_index+len(inclusion[0]):exclusion_index].strip(),
              'eligibility_exclusion_criteria': criteria_text[exclusion_index+len(exclusion[0]):].strip(),
           }
        elif inclusion:
          # Case 7: Only Inclusion Criteria is present
          inclusion_index = criteria_text.find(inclusion[0])
          eligibility_data = {
              'eligibility_inclusion_criteria': criteria_text[inclusion_index+len(inclusion[0]):].strip(),
              'eligibility_exclusion_criteria': '',
          }
        elif exclusion:
          # Case 8: Only Exclusion Criteria is present
          exclusion_index = criteria_text.find(exclusion[0])
          eligibility_data = {
              'eligibility_inclusion_criteria': criteria_text[:exclusion_index].strip(),
              'eligibility_exclusion_criteria': criteria_text[exclusion_index+len(exclusion[0]):].strip(),
          }
        else:
          # Case 9: Neither Inclusion nor Exclusion solely or with Criteria are present
          eligibility_data = {
              'eligibility_inclusion_criteria': criteria_text,
              'eligibility_exclusion_criteria': criteria_text,
          }

    return eligibility_data

# Extract other tags of the eligibilty section
def extract_eligibility(eligibility_element):
    eligibility_data = {
        'eligibility_criteria': '',
        'eligibility_inclusion_criteria': '',
        'eligibility_exclusion_criteria': '',
        'eligibility_gender': '',
        'eligibility_minimum_age': '',
        'eligibility_maximum_age': '',
        'eligibility_healthy_volunteers': ''
    }

    if eligibility_element is not None:
        criteria_element = eligibility_element.find('criteria/textblock')
        if criteria_element is not None:
            criteria_text = criteria_element.text.strip()
            if criteria_text:
                eligibility_data['eligibility_criteria'] = criteria_text

                # Extract inclusion and exclusion criteria
                criteria_data = extract_criteria(criteria_text)
                eligibility_data.update(criteria_data)

        gender_element = eligibility_element.find('gender')
        eligibility_data['eligibility_gender'] = gender_element.text.strip() if gender_element is not None else ''

        min_age_element = eligibility_element.find('minimum_age')
        eligibility_data['eligibility_minimum_age'] = min_age_element.text.strip() if min_age_element is not None else ''

        max_age_element = eligibility_element.find('maximum_age')
        eligibility_data['eligibility_maximum_age'] = max_age_element.text.strip() if max_age_element is not None else ''

        volunteers_element = eligibility_element.find('healthy_volunteers')
        eligibility_data['eligibility_healthy_volunteers'] = volunteers_element.text.strip() if volunteers_element is not None else ''

    return eligibility_data

# Given a clinical trial in .xml format, extract several of its fields
def extract_tags(xml_data):
    root = ET.fromstring(xml_data)

    data_dict = {
        #id_info_nct_id
        'nct_id': get_element_text(root.find('.//id_info/nct_id')),
        'brief_title': get_element_text(root.find('brief_title')),
        'official_title': get_element_text(root.find('official_title')),
        'brief_summary': get_element_text(root.find('.//brief_summary/textblock')),
        'detailed_description': get_element_text(root.find('.//detailed_description/textblock')),
        'study_type': get_element_text(root.find('study_type')),
        'study_design_info_primary_purpose': get_element_text(root.find('.//study_design_info/primary_purpose')),
        'condition': get_element_text(root.find('condition')),
        'intervention_intervention_type': get_element_text(root.find('.//intervention/intervention_type')),
        'intervention_intervention_name': get_element_text(root.find('.//intervention/intervention_name')),
        'location': get_element_text(root.find('.//location/facility/name')),
        'trials_keyword': ', '.join([get_element_text(elem) for elem in root.findall('.//keyword')]),
        'condition_browse_mesh_term': get_element_text(root.find('.//condition_browse/mesh_term')),
        'intervention_browse_mesh_term': ', '.join([get_element_text(elem) for elem in root.findall('.//intervention_browse/mesh_term')])
    }

    # Extract eligibility section
    eligibility_element = root.find('eligibility')
    data_dict.update(extract_eligibility(eligibility_element))

    return data_dict

## Extract Information from the .xml files

In [None]:
# Initialize the list to store all the extracted documents
# all_documents = []

# Define the chunk size to save data in chunks
# chunk_size = 10000

# path_to_save_data = ''

# for i, root_folder_path in enumerate(root_folder_paths):
#     print(f'Processing {i + 1}/5 Folder.')
#     for j, ntc_folder_name in enumerate(os.listdir(root_folder_path)):
#         print(f'Processing {j + 1}/100 Internal Folder.')
#         ntc_folder_path = os.path.join(root_folder_path, ntc_folder_name)

#         # Define the pattern to search for XML files inside the folder.
#         xml_files_pattern = os.path.join(ntc_folder_path, "*.xml")

#         # Use the glob module to find all XML files matching the pattern.
#         xml_files_list = glob.glob(xml_files_pattern)

#         # Process each XML file
#         for k, xml_file_path in enumerate(xml_files_list):
#             print(f"Processing XML file {k + 1}/{len(xml_files_list)}: {xml_file_path}")
#             try:
#                 # Read the XML content from the file
#                 with open(xml_file_path, 'r') as file:
#                     xml_data = file.read()

#                 # Extract tags from the XML data
#                 document = extract_tags(xml_data)
#                 all_documents.append(document)

#                 # Check if it's time to save the chunk to the Pickle file
#                 if len(all_documents) >= chunk_size:
#                     # Save the extracted information to a Pickle file
#                     with open('path_to_save_data', 'ab') as file:
#                         pickle.dump(all_documents, file)

#                     # Clear the all_documents list to free up memory
#                     all_documents = []
#             except Exception as e:
#                 # Handle any exceptions that might occur during processing
#                 print(f"Error processing XML file: {xml_file_path}")
#                 print(f"Error details: {str(e)}")

# # Save any remaining data to the Pickle file
# if all_documents:
#     with open(path_to_save_data, 'ab') as file:
#         pickle.dump(all_documents, file)

# print("Extraction and Pickling complete!")

## Load the Pickle file in a dataframe to use it with PyTerrier

In [None]:
# Specify the path to the pickle file
pickle_file_path = ''

# Load the data from the pickle file into a list of dictionaries
with open(pickle_file_path, 'rb') as file:
    extracted_data = []
    while True:
        try:
            document = pickle.load(file)
            extracted_data.extend(document)
        except EOFError:
            break

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(extracted_data)

# Downloading the Clinical trials 2021 collection [Start Here]

2 GBs --> 20 sec

In [4]:
!gdown --id 1oi3mnz6PQVt-tEMR6IQnqC0ab9IZ1iXx

Downloading...
From: https://drive.google.com/uc?id=1oi3mnz6PQVt-tEMR6IQnqC0ab9IZ1iXx
To: /content/extracted_information.pkl
100% 2.20G/2.20G [00:18<00:00, 120MB/s]


# Load the Dataframe

2 GBs --> 18 secs, 375580 Documents

In [63]:
import pandas as pd
import pickle

# Specify the path to the pickle file
pickle_file_path = '/content/extracted_information.pkl'

# Load the data from the pickle file into a list of dictionaries
with open(pickle_file_path, 'rb') as file:
    extracted_data = []
    while True:
        try:
            document = pickle.load(file)
            extracted_data.extend(document)
        except EOFError:
            break

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(extracted_data)

# Now 'df' is a Pandas DataFrame that contains the extracted information from the XML files.
# You can use standard Pandas DataFrame operations to analyze and manipulate the data.
# For example, you can print the first few rows of the DataFrame:
display(df.head())
print(f'Total number of documents: {len(df)}')

Unnamed: 0,nct_id,brief_title,official_title,brief_summary,detailed_description,study_type,study_design_info_primary_purpose,condition,intervention_intervention_type,intervention_intervention_name,location,trials_keyword,condition_browse_mesh_term,intervention_browse_mesh_term,eligibility_criteria,eligibility_inclusion_criteria,eligibility_exclusion_criteria,eligibility_gender,eligibility_minimum_age,eligibility_maximum_age,eligibility_healthy_volunteers
0,NCT00976963,Single Dose Monurol for Treatment of Acute Cys...,Single Dose Monurol for Treatment of Acute Cys...,Urinary tract infecton (UTI) is a very common ...,Procedures subjects will undergo once they hav...,Interventional,Treatment,Urinary Tract Infection,Drug,Fosfomycin,University of Washington,UTI,Urinary Tract Infections,Fosfomycin,Inclusion Criteria:\r\n\r\n - Non pr...,:\r\n\r\n - Non pregnant women in go...,":\r\n\r\n - Pregnant, lactating, or ...",Female,18 Years,45 Years,Accepts Healthy Volunteers
1,NCT00976573,"Carboplatin, Paclitaxel, and Bevacizumab With ...","A Randomized Phase II Trial of Carboplatin, Pa...",This randomized phase II trial is studying how...,OBJECTIVES:\r\n\r\n Primary\r\n\r\n ...,Interventional,Treatment,Melanoma (Skin),Biological,bevacizumab,Mayo Clinic Scottsdale,"Stage IV Skin Melanoma, recurrent melanoma",Melanoma,"Paclitaxel, Bevacizumab, Carboplatin, Everolimus",Inclusion Criteria:\r\n\r\n - Histol...,:\r\n\r\n - Histologic proof of stag...,- Prior treatment with agents disrupting vasc...,All,18 Years,,No
2,NCT00971139,Implementing Online Patient-Provider Communica...,Implementing Online Patient-Provider Communica...,"This interdisciplinary, international collabor...",A rapidly growing research literature document...,Interventional,Supportive Care,Cancer,Behavioral,Access to an OPPC service,Oslo University Hospital - Rikshospitalet,"Online patient-provider communication, Organiz...",,,Inclusion Criteria:\r\n\r\n - 18 yea...,:\r\n\r\n - 18 years of age.\r\n\r\n...,:\r\n\r\n - Excluded are patients wh...,All,18 Years,,No
3,NCT00977769,Carbetocin Versus Oxytocin and Hemodynamic Eff...,"Hemodynamic Effects of Carbetocin 100 µg, Oxyt...",A randomized double-blind trial of oxytocin 5 ...,Healthy pregnant women sheduled for elective c...,Interventional,Treatment,"Effects of; Anesthesia, in Pregnancy",Drug,carbetocin 100 µg,Division of Anaesthesia and Intensive Care Med...,"cesarean delivery, blood pressure, cardiac output",,"Oxytocin, Carbetocin",Inclusion Criteria:\r\n\r\n - Health...,:\r\n\r\n - Healthy pregnant women f...,:\r\n\r\n - Bleeding disorders\r\n\r...,Female,18 Years,50 Years,No
4,NCT00976924,Clinical Test of Blood Glucose Test Strips,Clinical Test of Blood Glucose Test Strips,Blood glucose test strips are tested with the ...,,Interventional,Diagnostic,Diabetes,Device,blood glucose monitor,Tianjin Medical University general hospital,blood glucose,,,Inclusion Criteria:\r\n\r\n - The pa...,:\r\n\r\n - The patients with blood ...,:\r\n\r\n - The patients with blood ...,All,22 Years,78 Years,Accepts Healthy Volunteers


Total number of documents: 375580


# Analyze the data


## Installing PyTerrier

In [64]:
!pip install python-terrier
import pyterrier as pt
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

from pyterrier.measures import *



## Creating a field to index per document

In [66]:
## Rename the ntc_id column to docno
df.rename(columns={'nct_id': 'docno'}, inplace=True)
display(df.keys())

## Merge the titles, and the summary in one text
columns_to_combine = ["brief_title", "official_title", "brief_summary", "detailed_description", "condition", "condition_browse_mesh_term", "eligibility_gender", "eligibility_criteria"]
df["text"] = ""
for col in columns_to_combine:
  print(f"Merge col {col}")
  df["text"] = df["text"] + " " + df[col].fillna("").astype(str)

# df = df.head(50_000)
# display(df)

Index(['docno', 'brief_title', 'official_title', 'brief_summary',
       'detailed_description', 'study_type',
       'study_design_info_primary_purpose', 'condition',
       'intervention_intervention_type', 'intervention_intervention_name',
       'location', 'trials_keyword', 'condition_browse_mesh_term',
       'intervention_browse_mesh_term', 'eligibility_criteria',
       'eligibility_inclusion_criteria', 'eligibility_exclusion_criteria',
       'eligibility_gender', 'eligibility_minimum_age',
       'eligibility_maximum_age', 'eligibility_healthy_volunteers', 'text'],
      dtype='object')

Merge col brief_title
Merge col official_title
Merge col brief_summary
Merge col detailed_description
Merge col condition
Merge col condition_browse_mesh_term
Merge col eligibility_gender
Merge col eligibility_criteria


In the following cell, we index the dataframe's documents. The index, with all its data structures, is written into a directory called `index`.

[10 minutes - Whole Collection]

In [96]:
# !rm -r ./content/index

In [97]:
##Allows to set a property in Terrier’s global properties configuration. Example:
pt.set_property("termpipelines", "PorterStemmer, stopwords")

# Index the text, record the docnos and the raw text as metadata
pd_indexer = pt.DFIndexer("./content/index", overwrite=True, verbose=True)
indexref = pd_indexer.index(df["text"], df[["docno"]])

##Printing the files related to the index
!ls -lh content/index/

  0%|          | 0/375580 [00:00<?, ?documents/s]

  for column, value in meta_column[1].iteritems():


total 195M
-rw-r--r-- 1 root root  71M Aug 30 15:18 data.direct.bf
-rw-r--r-- 1 root root 6.1M Aug 30 15:18 data.document.fsarrayfile
-rw-r--r-- 1 root root  66M Aug 30 15:19 data.inverted.bf
-rw-r--r-- 1 root root  26M Aug 30 15:19 data.lexicon.fsomapfile
-rw-r--r-- 1 root root 1017 Aug 30 15:19 data.lexicon.fsomaphash
-rw-r--r-- 1 root root 1.3M Aug 30 15:19 data.lexicon.fsomapid
-rw-r--r-- 1 root root  14M Aug 30 15:18 data.meta-0.fsomapfile
-rw-r--r-- 1 root root 2.9M Aug 30 15:18 data.meta.idx
-rw-r--r-- 1 root root 9.7M Aug 30 15:18 data.meta.zdata
-rw-r--r-- 1 root root 4.1K Aug 30 15:19 data.properties


Printing some statistics

# Loading queries and qrels

We will use ir_datasets to obtain the queries and the qrels
https://ir-datasets.com/

In [98]:
# Queries
!pip install --upgrade ir_datasets
dataset = pt.get_dataset('irds:clinicaltrials/2021/trec-ct-2022')
queries = dataset.get_topics(variant='text')
display(queries[:10])



Unnamed: 0,qid,query
0,1,a 19 year old male came to clinic with some se...
1,2,a 32 year old woman comes to the hospital with...
2,3,a 51 year old man comes to the office complain...
3,4,a 66 year old woman comes to the office due to...
4,5,a 23 year old man comes to the emergency depar...
5,6,a 61 year old man comes to the clinic due to n...
6,7,a 3 year old girl is brought to the clinic by ...
7,8,a 7 month old boy is brought to emergency by h...
8,9,a 67 year old woman comes to the clinic due to...
9,10,a 19 year old girl comes to the clinic due to ...


In [99]:
#Qrels
!gdown --id 1RYHxr2sM9Hd2C2iRI_NXzO4RY71Adu-p

path_to_qrels = 'clinical_qrels22.txt'
qrels = pd.read_csv(path_to_qrels, names=['qid','Q0','docno','label'],sep=" ",header=None)
qrels = qrels.drop(columns=['Q0'])
qrels["qid"] = qrels["qid"].astype(str)
qrels["docno"] = qrels["docno"].astype(str)
display(qrels[:10])

Downloading...
From: https://drive.google.com/uc?id=1RYHxr2sM9Hd2C2iRI_NXzO4RY71Adu-p
To: /content/clinical_qrels22.txt
100% 666k/666k [00:00<00:00, 118MB/s]


Unnamed: 0,qid,docno,label
0,1,NCT00000409,0
1,1,NCT00001148,0
2,1,NCT00001181,0
3,1,NCT00001202,0
4,1,NCT00001270,0
5,1,NCT00001412,1
6,1,NCT00001417,0
7,1,NCT00001721,0
8,1,NCT00002459,0
9,1,NCT00002619,0


In [100]:
qrels['label'].value_counts()

0    28419
2     3939
1     3036
Name: label, dtype: int64

In [101]:
qrels_relevance = qrels.copy()
qrels_relevance["label"] = qrels_relevance["label"].map({0:0, 1:1, 2:1})
qrels_relevance[:10]

Unnamed: 0,qid,docno,label
0,1,NCT00000409,0
1,1,NCT00001148,0
2,1,NCT00001181,0
3,1,NCT00001202,0
4,1,NCT00001270,0
5,1,NCT00001412,1
6,1,NCT00001417,0
7,1,NCT00001721,0
8,1,NCT00002459,0
9,1,NCT00002619,0


In [102]:
qrels_eligible = qrels.copy()
qrels_eligible = qrels_eligible[qrels_eligible["label"] != 0]
qrels_eligible["label"] = qrels_eligible["label"].map({1:0, 2:1})
qrels_eligible[:10]

Unnamed: 0,qid,docno,label
5,1,NCT00001412,0
32,1,NCT00004335,0
49,1,NCT00064987,0
51,1,NCT00070733,0
52,1,NCT00080483,0
56,1,NCT00104572,0
57,1,NCT00112151,0
60,1,NCT00119483,0
65,1,NCT00136695,0
66,1,NCT00140153,0


# Candidate retrieval

In [103]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25", properties={"termpipelines" : "Stopwords,PorterStemmer"}, verbose=True)
rm3  = pt.rewrite.RM3(index, fb_terms=30, fb_docs=20, verbose=True)

candidate_retrieval_1 = bm25
candidate_retrieval_2 = (candidate_retrieval_1 >> rm3 >> candidate_retrieval_1 >> pt.rewrite.reset())

In [104]:
pt.Experiment(
    [
      candidate_retrieval_1,
      candidate_retrieval_2,
    ],
    names=[
        "candidate retr. 1",
        "candidate retr. 2",
    ],
    eval_metrics= [
        P(rel=2)@10, R(rel=2)@10, R(rel=2)@25, R(rel=2)@100, R(rel=2)@500, R(rel=2)@1000
    ],
    topics=queries,
    qrels=qrels,
    baseline=0,
    perquery = False,
    correction='b',
    highlight= 'color',
    verbose=True,
)

pt.Experiment:   0%|          | 0/2 [00:00<?, ?system/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

Transformer:   0%|          | 0/50 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

Unnamed: 0,name,P(rel=2)@10,R(rel=2)@10,R(rel=2)@25,R(rel=2)@100,R(rel=2)@500,R(rel=2)@1000,P(rel=2)@10 +,P(rel=2)@10 -,P(rel=2)@10 p-value,P(rel=2)@10 reject,P(rel=2)@10 p-value corrected,R(rel=2)@10 +,R(rel=2)@10 -,R(rel=2)@10 p-value,R(rel=2)@10 reject,R(rel=2)@10 p-value corrected,R(rel=2)@25 +,R(rel=2)@25 -,R(rel=2)@25 p-value,R(rel=2)@25 reject,R(rel=2)@25 p-value corrected,R(rel=2)@100 +,R(rel=2)@100 -,R(rel=2)@100 p-value,R(rel=2)@100 reject,R(rel=2)@100 p-value corrected,R(rel=2)@500 +,R(rel=2)@500 -,R(rel=2)@500 p-value,R(rel=2)@500 reject,R(rel=2)@500 p-value corrected,R(rel=2)@1000 +,R(rel=2)@1000 -,R(rel=2)@1000 p-value,R(rel=2)@1000 reject,R(rel=2)@1000 p-value corrected
0,candidate retr. 1,0.208,0.032176,0.054195,0.079929,0.107472,0.112412,,,,False,,,,,False,,,,,False,,,,,False,,,,,False,,,,,False,
1,candidate retr. 2,0.246,0.035095,0.052845,0.08225,0.110824,0.120399,18.0,10.0,0.076156,False,0.152312,18.0,10.0,0.538192,False,1.0,14.0,16.0,0.737317,False,1.0,17.0,14.0,0.763505,False,1.0,17.0,8.0,0.584536,False,1.0,19.0,7.0,0.117197,False,0.234395


In [105]:
best_candidate_retrieval = candidate_retrieval_2

# Relevance re-ranking

In [106]:
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git

Collecting git+https://github.com/terrierteam/pyterrier_t5.git
  Cloning https://github.com/terrierteam/pyterrier_t5.git to /tmp/pip-req-build-bquuwnue
  Running command git clone --filter=blob:none --quiet https://github.com/terrierteam/pyterrier_t5.git /tmp/pip-req-build-bquuwnue
  Resolved https://github.com/terrierteam/pyterrier_t5.git to commit 63756ebc2968ab03f46a61f0b391e27873226d75
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [107]:
from pyterrier_t5 import MonoT5ReRanker
MonoT5ReRanker.__repr__ = lambda self: "monoT5"
mono_t5 = MonoT5ReRanker("castorini/monot5-base-msmarco", batch_size=128, verbose=True)

In [108]:
from dataclasses import dataclass

df_t = df[["docno", "text"]]

@dataclass(frozen=True)
class GetText(pt.Transformer):

  def transform(self, res):
    res = res.merge(df_t, how="inner", on="docno")
    return res

  def __repr__(self):
    return "GetText()"

In [109]:
relevance_reranking_1 = best_candidate_retrieval
relevance_reranking_2 = ~((best_candidate_retrieval % 100 >> GetText() >> mono_t5) ^ best_candidate_retrieval)

In [110]:
pt.Experiment(
    [
      relevance_reranking_1,
      relevance_reranking_2,
    ],
    names=[
        "rel. re-rank 1",
        "rel. re-rank 2",
    ],
    eval_metrics= [
        P(rel=1)@10, nDCG@10, Rprec(rel=1), RR(rel=1),
    ],
    topics=queries,
    qrels=qrels_relevance,
    baseline=0,
    perquery = False,
    correction='b',
    highlight= 'color',
    verbose=True,
)

pt.Experiment:   0%|          | 0/2 [00:00<?, ?system/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

Transformer:   0%|          | 0/50 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

Transformer:   0%|          | 0/50 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

monoT5:   0%|          | 0/40 [00:00<?, ?batches/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1300 > 512). Running this sequence through the model will result in indexing errors


BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

Transformer:   0%|          | 0/50 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

Unnamed: 0,name,Rprec,RR,P@10,nDCG@10,Rprec +,Rprec -,Rprec p-value,Rprec reject,Rprec p-value corrected,RR +,RR -,RR p-value,RR reject,RR p-value corrected,P@10 +,P@10 -,P@10 p-value,P@10 reject,P@10 p-value corrected,nDCG@10 +,nDCG@10 -,nDCG@10 p-value,nDCG@10 reject,nDCG@10 p-value corrected
0,rel. re-rank 1,0.077887,0.548553,0.36,0.378741,,,,False,,,,,False,,,,,False,,,,,False,
1,rel. re-rank 2,0.07694,0.486379,0.308,0.325485,7.0,3.0,0.734941,False,1.0,11.0,17.0,0.301395,False,0.602789,12.0,23.0,0.124015,False,0.24803,17.0,24.0,0.157692,False,0.315385


In [111]:
pt.Experiment(
    [
      relevance_reranking_1,
      relevance_reranking_2,
    ],
    names=[
        "rel. re-rank 1",
        "rel. re-rank 2",
    ],
    eval_metrics= [
        # P(rel=2)@10, nDCG@10, Rprec(rel=2), RR(rel=2),
        P(rel=2)@10, R(rel=2)@10, R(rel=2)@25, R(rel=2)@100, R(rel=2)@500, R(rel=2)@1000
    ],
    topics=queries,
    qrels=qrels,
    baseline=0,
    perquery = False,
    correction='b',
    highlight= 'color',
    verbose=True,
)

pt.Experiment:   0%|          | 0/2 [00:00<?, ?system/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

Transformer:   0%|          | 0/50 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/50 [00:00<?, ?q/s]

Unnamed: 0,name,P(rel=2)@10,R(rel=2)@10,R(rel=2)@25,R(rel=2)@100,R(rel=2)@500,R(rel=2)@1000,P(rel=2)@10 +,P(rel=2)@10 -,P(rel=2)@10 p-value,P(rel=2)@10 reject,P(rel=2)@10 p-value corrected,R(rel=2)@10 +,R(rel=2)@10 -,R(rel=2)@10 p-value,R(rel=2)@10 reject,R(rel=2)@10 p-value corrected,R(rel=2)@25 +,R(rel=2)@25 -,R(rel=2)@25 p-value,R(rel=2)@25 reject,R(rel=2)@25 p-value corrected,R(rel=2)@100 +,R(rel=2)@100 -,R(rel=2)@100 p-value,R(rel=2)@100 reject,R(rel=2)@100 p-value corrected,R(rel=2)@500 +,R(rel=2)@500 -,R(rel=2)@500 p-value,R(rel=2)@500 reject,R(rel=2)@500 p-value corrected,R(rel=2)@1000 +,R(rel=2)@1000 -,R(rel=2)@1000 p-value,R(rel=2)@1000 reject,R(rel=2)@1000 p-value corrected
0,rel. re-rank 1,0.246,0.035095,0.052845,0.08225,0.110824,0.120399,,,,False,,,,,False,,,,,False,,,,,False,,,,,False,,,,,False,
1,rel. re-rank 2,0.202,0.033455,0.051829,0.08225,0.110824,0.120399,9.0,15.0,0.072046,False,0.144092,9.0,15.0,0.724027,False,1.0,13.0,15.0,0.766445,False,1.0,0.0,0.0,,False,,0.0,0.0,,False,,0.0,0.0,,False,


# Eligibility re-ranking

In [88]:
import ir_datasets as irds
dataset_2021 = irds.load('clinicaltrials/2021/trec-ct-2021')
qrels_2021 = pd.DataFrame([
    {
        "qid": qrel.query_id,
        "docno": qrel.doc_id,
        "label": qrel.relevance - 1,
    }
    for qrel in dataset_2021.qrels_iter()
    if qrel.relevance != 0
])
qrels_2021 = qrels_2021.merge(df[["docno", "eligibility_criteria"]], on="docno")
qrels_2021 = qrels_2021.merge(queries[["qid", "query"]], on="qid")
qrels_2021


Output hidden; open in https://colab.research.google.com to view.

In [None]:
qrels_2021

# Evaluation

In [None]:
display(res.head(25))
### Create Condesed list
Con_aggr_res = pd.merge(res, qrels, on=["qid",'docno'],how="left")
display(Con_aggr_res.head(25))
# display(Con_aggr_res.head(25))
Con_aggr_res = Con_aggr_res[Con_aggr_res['label'].notna()]

eval = pt.Utils.evaluate(Con_aggr_res, qrels,metrics=[AP@1000,P@5,P@10],perquery=False)
display(eval)

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,132571,NCT01511588,0,42.29426,a 19 year old male came to clinic with some se...
1,1,137074,NCT01689896,1,41.191565,a 19 year old male came to clinic with some se...
2,1,150445,NCT01190904,2,40.756693,a 19 year old male came to clinic with some se...
3,1,156096,NCT02612714,3,40.733572,a 19 year old male came to clinic with some se...
4,1,358654,NCT04630275,4,39.414434,a 19 year old male came to clinic with some se...
5,1,56037,NCT00104572,5,39.367162,a 19 year old male came to clinic with some se...
6,1,128044,NCT01672411,6,38.929173,a 19 year old male came to clinic with some se...
7,1,366889,NCT04036604,7,38.205295,a 19 year old male came to clinic with some se...
8,1,68308,NCT00170339,8,37.978924,a 19 year old male came to clinic with some se...
9,1,217263,NCT02529306,9,37.253048,a 19 year old male came to clinic with some se...


Unnamed: 0,qid,docid,docno,rank,score,query,label
0,1,132571,NCT01511588,0,42.29426,a 19 year old male came to clinic with some se...,2.0
1,1,137074,NCT01689896,1,41.191565,a 19 year old male came to clinic with some se...,1.0
2,1,150445,NCT01190904,2,40.756693,a 19 year old male came to clinic with some se...,0.0
3,1,156096,NCT02612714,3,40.733572,a 19 year old male came to clinic with some se...,0.0
4,1,358654,NCT04630275,4,39.414434,a 19 year old male came to clinic with some se...,0.0
5,1,56037,NCT00104572,5,39.367162,a 19 year old male came to clinic with some se...,1.0
6,1,128044,NCT01672411,6,38.929173,a 19 year old male came to clinic with some se...,0.0
7,1,366889,NCT04036604,7,38.205295,a 19 year old male came to clinic with some se...,0.0
8,1,68308,NCT00170339,8,37.978924,a 19 year old male came to clinic with some se...,0.0
9,1,217263,NCT02529306,9,37.253048,a 19 year old male came to clinic with some se...,0.0


{'AP@1000': 0.16395709300388217,
 'P@5': 0.46399999999999997,
 'P@10': 0.41200000000000003}

# Ideas to play around:



*   Write a script that, for each topic, keeps high representative terms. What is one property of those terms?
*   Index different document parts and perform retrieval.
*   Use the PyTerrier operators to create custom pipelines. Try to use different indices for each step and unite the results.

Documentation: https://pyterrier.readthedocs.io/en/latest/index.html
