# Prerequisites

* Any Run in the TREC Format
* Pubmed XML collection: http://trec-cds.appspot.com/2018.html#documents
* 2018 Topics: http://trec-cds.appspot.com/topics2018.xml
* Extra Abstracts TXT collection: http://trec-cds.appspot.com/2018.html#documents

In [62]:
import csv
import gzip
import re
import subprocess
import tarfile
import time
from os import listdir
from os.path import isfile, isdir, join

import pandas as pd
from lxml import etree

# Decompress _.tar.gz_ Files

In [None]:
# Decompress Files
def decompress(myPath):
    """Extract every ``.tar.gz`` archive found directly inside ``myPath``.

    Each archive ``<name>.tar.gz`` is unpacked into the folder
    ``<myPath>/<name>``.

    Args:
        myPath: Directory that holds the ``.tar.gz`` files.
    """
    fileNames = [f for f in listdir(myPath)
                 if isfile(join(myPath, f)) and f.endswith(".tar.gz")]
    for file in fileNames:
        print("Extracting from: ", file)
        # The context manager closes the archive even when extraction fails.
        # NOTE: extractall() trusts the member paths stored in the archive;
        # only run this on archives from a trusted source.
        with tarfile.open(join(myPath, file), "r:gz") as tar:
            # file[:-7] strips the ".tar.gz" suffix to get the target folder.
            tar.extractall(join(myPath, file[:-7]))
        print("Done")
        
if __name__ == "__main__":
    # Path containing the medline_xml.part[x].tar.gz files (Pubmed XML collection)
    # NOTE: machine-specific absolute path; adjust before running elsewhere.
    pubMedAbstracts = "/Users/ari/Downloads/TREC/XML-Collection"

    # Decompress files: every .tar.gz in that folder is unpacked into a
    # sub-folder named after the archive (without the ".tar.gz" suffix)
    decompress(pubMedAbstracts)

# Extract Information from Pubmed XML and Extra Abstracts Text Files

In [51]:
# Extract Pubmed Ids from the Gold-Standard CSV file
def extractDocIDs(filePath):
    """Return the set of distinct ``trec_doc_id`` values found in ``filePath``.

    The file is read as a tab-separated table whose six columns are named
    explicitly; ``trec_doc_id`` is loaded with object dtype so that
    numeric-looking ids are not converted to numbers.
    """
    columnNames = ["trec_topic_number", "x", "trec_doc_id", "order", "relevance", "name"]
    table = pd.read_csv(
        filePath,
        sep="\t",
        encoding="utf-8",
        names=columnNames,
        dtype={'trec_doc_id': object},
    )
    docIdColumn = table['trec_doc_id']
    return set(docIdColumn)

# Get the name of the folders containing xml.gz files
def getFolderNames(myPath):
    """List the names of the sub-directories located directly in ``myPath``."""
    subFolders = []
    for entry in listdir(myPath):
        if isdir(join(myPath, entry)):
            subFolders.append(entry)
    return subFolders

# Get the name of the xml.gz files
def getGzFileNames(myPath):
    """List the names of the ``.xml.gz`` files located directly in ``myPath``."""
    return [
        entry
        for entry in listdir(myPath)
        if entry.endswith(".xml.gz") and isfile(join(myPath, entry))
    ]

def getTarFileNames(myPath):
    """List the names of the ``.tar`` files located directly in ``myPath``."""
    tarNames = []
    for entry in listdir(myPath):
        if not isfile(join(myPath, entry)):
            continue
        if entry.endswith(".tar"):
            tarNames.append(entry)
    return tarNames

def unzipTar(folderPath, docIDsPath, targetFolder=''):
    """Extract from the ``.tar`` archives only the members listed in a run file.

    Every ``.tar`` file directly inside ``folderPath`` is scanned. A member is
    extracted when its file name (last path component, without the final
    extension) is one of the ``trec_doc_id`` values read from ``docIDsPath``.
    Each id is extracted at most once.

    Args:
        folderPath: Directory holding the ``.tar`` archives.
        docIDsPath: TREC-format file handed to ``extractDocIDs``.
        targetFolder: Optional sub-folder of ``folderPath`` to extract into;
            when empty, members are extracted into ``folderPath`` itself.
    """
    ids = extractDocIDs(docIDsPath)
    print("Gold Standard Ids:", len(ids))
    tarFiles = getTarFileNames(folderPath)
    print(tarFiles)

    outputPath = join(folderPath, targetFolder) if targetFolder else folderPath

    # Capture the last path component without its extension. The previous
    # pattern r'\/(.*)\.' started at the FIRST slash, so members stored as
    # "./dir/ID.txt" or "a/b/ID.txt" produced "dir/ID" / "b/ID" and never
    # matched an id; members without any directory part were skipped too.
    idPattern = re.compile(r'([^/]+)\.[^/.]*$')

    txtCounter = 0
    for tarFileName in tarFiles:
        print("Searching through:", tarFileName)
        # The context manager closes the archive even if extraction raises.
        with tarfile.open(join(folderPath, tarFileName), 'r:') as tar:
            for txtFile in tar:
                docID = idPattern.search(txtFile.name)
                if docID and docID.group(1) in ids:
                    txtCounter += 1
                    # Drop the id so a duplicate member is not extracted again.
                    ids.remove(docID.group(1))
                    # NOTE: extract() trusts the member path stored in the
                    # archive; only use this on archives from a trusted source.
                    tar.extract(txtFile, path=outputPath)
    print("Matched files:", txtCounter)
        
# Extract relevant information from the papers inside the XML files that match the gold-standard
def extractFeatures(folderPath, docIDsPath, outputPath):
    """Write title, abstract and MeSH terms of the listed PubMed articles to a TSV.

    Every sub-folder of ``folderPath`` is searched for ``.xml.gz`` files. Each
    ``PubmedArticle/MedlineCitation`` whose PMID is among the ids read from
    ``docIDsPath`` produces one output row; a PMID is written at most once.

    Args:
        folderPath: Directory whose sub-folders contain the ``.xml.gz`` files.
        docIDsPath: TREC-format file handed to ``extractDocIDs``.
        outputPath: Destination file. It is overwritten and receives a header
            row followed by one fully quoted, tab-separated row per article
            with the columns ``trec_doc_id``, ``title``, ``abstract``,
            ``major_mesh`` and ``minor_mesh``.
    """
    st = time.time()

    # Get Pubmed Ids from the Gold-Standard
    ids = extractDocIDs(docIDsPath)
    print("Nr of PMIDs in the Gold-Standard:", len(ids))

    # Recover the names of each folder containing xml.gz files
    folderNames = getFolderNames(folderPath)

    nrExtractedXMLs = 0

    # The output is opened once for the whole run instead of once per row;
    # newline='' is what the csv module requires for files it writes to.
    with open(outputPath, 'w', encoding='utf-8', newline='') as extractFile:
        wr = csv.writer(extractFile, quoting=csv.QUOTE_ALL, delimiter="\t")
        wr.writerow(["trec_doc_id", "title", "abstract", "major_mesh", "minor_mesh"])

        # Iterate through the folders with the xml.gz files
        for folderName in folderNames:
            print("Looking into files from folder: ", folderName)
            gzFolder = join(folderPath, folderName)
            for gzFileName in getGzFileNames(gzFolder):
                print("Analyzing information from: ", gzFileName)
                parser = etree.XMLParser(encoding='utf-8', recover=True)
                # Close the gzip handle as soon as the tree is built.
                with gzip.open(join(gzFolder, gzFileName), 'rt', encoding='utf-8') as xmlFile:
                    pubMedArticleSet = etree.parse(xmlFile, parser=parser).getroot()
                if pubMedArticleSet is None:
                    # recover=True can yield no root for an unreadable file.
                    continue

                for mc in pubMedArticleSet.iterfind('PubmedArticle/MedlineCitation'):
                    pmidNode = mc.find("PMID")
                    pmid = pmidNode.text if pmidNode is not None else None
                    # Only articles listed in the Gold-Standard are of interest
                    if pmid not in ids:
                        continue
                    # Remove found pmid from ids set
                    ids.remove(pmid)

                    print("Extracting info from the PMID: ", pmid)

                    # Title and abstract default to "" so that an article
                    # lacking one of them neither raises NameError nor reuses
                    # the value left over from the previous article.
                    titleNode = mc.find("Article/ArticleTitle")
                    title = ''.join(titleNode.itertext()) if titleNode is not None else ''
                    # Structured abstracts have several AbstractText pieces
                    abstract = ' '.join(
                        ''.join(abstractPiece.itertext())
                        for abstractPiece in mc.findall("Article/Abstract/AbstractText")
                    )

                    majorMeshTerms, minorMeshTerms = _collectMeshTerms(mc)

                    wr.writerow([pmid, title, abstract,
                                 ";".join(majorMeshTerms), ";".join(minorMeshTerms)])

                    # Count the number of extracted papers
                    nrExtractedXMLs += 1

                # Push finished rows to disk after every input file so an
                # interrupted run keeps what it has extracted so far.
                extractFile.flush()

    print("Number of papers with information extracted: ", nrExtractedXMLs)
    end = time.time()
    print("Run time: ", end-st)


def _collectMeshTerms(medlineCitation):
    """Return ``(majorTerms, minorTerms)`` for one ``MedlineCitation`` element.

    A heading with qualifiers contributes one ``Descriptor/Qualifier`` entry
    per qualifier; a heading without qualifiers contributes the descriptor
    name. Whether the entries count as major or minor is decided solely by the
    descriptor's ``MajorTopicYN`` attribute (qualifier-level flags are ignored).
    """
    majorTerms = []
    minorTerms = []
    for heading in medlineCitation.findall("MeshHeadingList/MeshHeading"):
        descriptor = heading.find("DescriptorName")
        qualified = [descriptor.text + "/" + qualifier.text
                     for qualifier in heading.findall("QualifierName")]
        terms = qualified if qualified else [descriptor.text]
        if descriptor.get("MajorTopicYN") == "Y":
            majorTerms.extend(terms)
        else:
            minorTerms.extend(terms)
    return majorTerms, minorTerms
    
def extractExtraFeatures(extraAbstracts, extractedFeaturesFile):
    """Append id, title and abstract of every extra-abstract file to a TSV file.

    For each regular file directly inside ``extraAbstracts``:

    * the id is the file name without its last four characters
      (i.e. without a ``.txt`` suffix),
    * the title is taken from the second line, with a leading ``Title: ``
      removed when present,
    * the abstract is every non-blank line from the third line on, stripped
      and joined with single spaces (a trailing space is kept).

    Rows are appended to ``extractedFeaturesFile`` as fully quoted,
    tab-separated values with three fields (id, title, abstract).

    Args:
        extraAbstracts: Directory holding the extracted abstract text files.
        extractedFeaturesFile: TSV file the rows are appended to.
    """
    titlePattern = re.compile(r'(Title:) (.*)')
    files = [fi for fi in listdir(extraAbstracts) if isfile(join(extraAbstracts, fi))]
    fCount = 0
    # Open the output once rather than once per row; newline='' is what the
    # csv module requires for files it writes to.
    with open(extractedFeaturesFile, 'a', encoding="utf8", newline='') as extractFile:
        wr = csv.writer(extractFile, quoting=csv.QUOTE_ALL, delimiter="\t")
        for fi in files:
            # Context manager: the input handle was previously never closed.
            with open(join(extraAbstracts, fi), encoding="utf8") as fiObj:
                lines = fiObj.readlines()
            fId = fi[:-4]
            # Tolerate files with fewer than two lines or without the
            # "Title: " prefix instead of aborting the whole run.
            fullTitle = lines[1].strip() if len(lines) > 1 else ""
            titleMatch = titlePattern.search(fullTitle)
            title = titleMatch.group(2) if titleMatch else fullTitle
            abstract = "".join(line.strip() + " " for line in lines[2:] if line.strip())
            wr.writerow([fId, title, abstract])
            fCount += 1
    print("Extracted files:", fCount)

In [42]:
# Path containing the Annotated Gold-Standard File
# NOTE(review): the file name ("hpipubcommon.trec_results") looks like a run
# result rather than a gold standard - confirm which one is intended. The value
# is passed as docIDsPath to extractFeatures and unzipTar.
docIDPath = "/Users/ari/Downloads/TREC/trec2018/results/runs/hpipubcommon.trec_results"

In [None]:
if __name__ == "__main__":
    # Path containing the medline_xml.partx folders - they need to be extracted first
    # NOTE: machine-specific absolute path; adjust before running elsewhere.
    pubMedAbstracts = "/Users/ari/Downloads/TREC/XML-Collection"

    # Output file (relative path, so it is created in the current working directory)
    outputPath = "relevantAbstractsPubRun.csv"

    # Extract relevant information from the XML files; outputPath is overwritten
    extractFeatures(pubMedAbstracts, docIDPath, outputPath)

# For issues: merge partial output files into one

In [140]:
# Load the first partial extraction output (tab-separated); trec_doc_id is
# read with object dtype so ids are not converted to numbers.
relevantAbs1 = pd.read_csv("/Users/ari/Downloads/TREC/trec-pm/notebooks/relevantAbstractsPubRunPart1.csv", sep='\t', encoding="utf-8", dtype={'trec_doc_id':object})
# (rows, columns) of the loaded table
relevantAbs1.shape

(20839, 5)

In [141]:
# Load the second partial extraction output (tab-separated); trec_doc_id is
# read with object dtype so ids are not converted to numbers.
relevantAbs2 = pd.read_csv("/Users/ari/Downloads/TREC/trec-pm/notebooks/relevantAbstractsPubRunPart2.csv", sep='\t', encoding="utf-8", dtype={'trec_doc_id':object})
# (rows, columns) of the loaded table
relevantAbs2.shape

(4436, 5)

In [142]:
# Load the third partial extraction output (tab-separated); trec_doc_id is
# read with object dtype so ids are not converted to numbers.
relevantAbs3 = pd.read_csv("/Users/ari/Downloads/TREC/trec-pm/notebooks/relevantAbstractsPubRunPart3.csv", sep='\t', encoding="utf-8", dtype={'trec_doc_id':object})
# (rows, columns) of the loaded table
relevantAbs3.shape

(7186, 5)

In [143]:
# Stack the three partial tables, keep a single row per trec_doc_id and
# renumber the index from 0.
relevantAbs = pd.concat([relevantAbs1,relevantAbs2,relevantAbs3]).drop_duplicates(subset='trec_doc_id').reset_index(drop=True)
relevantAbs.head()

Unnamed: 0,trec_doc_id,title,abstract,major_mesh,minor_mesh
0,4004617,Metastatic uveal melanoma. Pretherapy serum li...,The liver was the organ most frequently involv...,Uveal Neoplasms/radiotherapy;Uveal Neoplasms/s...,Adult;Aged;Alkaline Phosphatase/blood;Aspartat...
1,8521382,Immunohistochemical detection of the cyclin-de...,The retinoblastoma (RB) and cyclin-dependent k...,,Carrier Proteins/metabolism;Cell Nucleus/metab...
2,8521414,Multiple mechanisms of p16INK4A inactivation i...,"p16INK4A, a specific inhibitor of cyclin-depen...",Proto-Oncogene Proteins,"Carcinoma, Non-Small-Cell Lung/genetics;Carcin..."
3,8522248,DNA amplification of HER-2/neu and INT-2 oncog...,Oncogene alterations are thought to be prognos...,"Gene Amplification;Genes, erbB-2","Adolescent;Adult;Aged;Aged, 80 and over;Base S..."
4,8524841,The Bcr-Abl leukemia oncogene activates Jun ki...,The leukemogenic tyrosine kinase fusion protei...,"Cell Transformation, Neoplastic;MAP Kinase Kin...",Calcium-Calmodulin-Dependent Protein Kinases/m...


In [144]:
# Write the combined table as a tab-separated file, without the index column.
relevantAbs.to_csv(path_or_buf='/Users/ari/Downloads/TREC/trec-pm/notebooks/relevantAbstractsPubRun.csv', index=False, sep='\t')


# Read Relevant Pubmed Abstracts Output

In [145]:
# Load the extracted PubMed abstracts from the current working directory
# (relative path); trec_doc_id is read with object dtype.
abstracts = pd.read_csv("relevantAbstractsPubRun.csv", sep='\t', encoding="utf-8", dtype={'trec_doc_id':object})
abstracts.head(5)

Unnamed: 0,trec_doc_id,title,abstract,major_mesh,minor_mesh
0,4004617,Metastatic uveal melanoma. Pretherapy serum li...,The liver was the organ most frequently involv...,Uveal Neoplasms/radiotherapy;Uveal Neoplasms/s...,Adult;Aged;Alkaline Phosphatase/blood;Aspartat...
1,8521382,Immunohistochemical detection of the cyclin-de...,The retinoblastoma (RB) and cyclin-dependent k...,,Carrier Proteins/metabolism;Cell Nucleus/metab...
2,8521414,Multiple mechanisms of p16INK4A inactivation i...,"p16INK4A, a specific inhibitor of cyclin-depen...",Proto-Oncogene Proteins,"Carcinoma, Non-Small-Cell Lung/genetics;Carcin..."
3,8522248,DNA amplification of HER-2/neu and INT-2 oncog...,Oncogene alterations are thought to be prognos...,"Gene Amplification;Genes, erbB-2","Adolescent;Adult;Aged;Aged, 80 and over;Base S..."
4,8524841,The Bcr-Abl leukemia oncogene activates Jun ki...,The leukemogenic tyrosine kinase fusion protei...,"Cell Transformation, Neoplastic;MAP Kinase Kin...",Calcium-Calmodulin-Dependent Protein Kinases/m...


# Read Extra Abstracts

In [146]:
# Path containing the downloaded txt collection (extra abstracts)
extraAbstractsPath = "/Users/ari/Downloads/TREC/TXT-Collection/extra_abstracts"
# Path containing the extracted txt collection (extra abstracts)
extraAbstractsNewPath = join(extraAbstractsPath,"extra_abstracts")
# NOTE(review): getGzFileNames only returns names ending in ".xml.gz", so a
# "*.tar.gz" download is not picked up here and the loop below is skipped -
# confirm which compressed file names are expected in this folder.
abstractsGzFiles = getGzFileNames(extraAbstractsPath)
# Rows for the extra abstracts are appended to this existing TSV file
extractedFeaturesFile = "/Users/ari/Downloads/TREC/trec-pm/notebooks/relevantAbstractsPubRun.csv"

for abstractsGzFile in abstractsGzFiles:
    print("Extracting: ", abstractsGzFile)
    # Join against extraAbstractsPath, the folder the names were listed from
    # (the name `abstractsPath` used here before is not defined anywhere).
    subprocess.call(['gunzip', '-d', join(extraAbstractsPath, abstractsGzFile)])
    print("Done")

# Pull the abstracts listed in docIDPath out of the .tar archive(s) ...
unzipTar(extraAbstractsPath, docIDPath)
# ... and append their id, title and abstract to the features file
extractExtraFeatures(extraAbstractsNewPath, extractedFeaturesFile)

Gold Standard Ids: 31090
['extra_abstracts.tar']
Searching through: extra_abstracts.tar
Matched files: 3065
Extracted files: 3065


In [147]:
# Re-read the abstracts file from the current working directory so that the
# rows appended for the extra abstracts are included.
# NOTE(review): this is a relative path while the rows were appended via an
# absolute path - presumably the same file; verify the working directory.
abstracts = pd.read_csv("relevantAbstractsPubRun.csv", sep='\t', encoding="utf-8", dtype={'trec_doc_id':object})
abstracts.head()

Unnamed: 0,trec_doc_id,title,abstract,major_mesh,minor_mesh
0,4004617,Metastatic uveal melanoma. Pretherapy serum li...,The liver was the organ most frequently involv...,Uveal Neoplasms/radiotherapy;Uveal Neoplasms/s...,Adult;Aged;Alkaline Phosphatase/blood;Aspartat...
1,8521382,Immunohistochemical detection of the cyclin-de...,The retinoblastoma (RB) and cyclin-dependent k...,,Carrier Proteins/metabolism;Cell Nucleus/metab...
2,8521414,Multiple mechanisms of p16INK4A inactivation i...,"p16INK4A, a specific inhibitor of cyclin-depen...",Proto-Oncogene Proteins,"Carcinoma, Non-Small-Cell Lung/genetics;Carcin..."
3,8522248,DNA amplification of HER-2/neu and INT-2 oncog...,Oncogene alterations are thought to be prognos...,"Gene Amplification;Genes, erbB-2","Adolescent;Adult;Aged;Aged, 80 and over;Base S..."
4,8524841,The Bcr-Abl leukemia oncogene activates Jun ki...,The leukemogenic tyrosine kinase fusion protei...,"Cell Transformation, Neoplastic;MAP Kinase Kin...",Calcium-Calmodulin-Dependent Protein Kinases/m...


# Read Relevance Score

In [148]:
# Load the run (TREC format: tab-separated, no header row).
# trec_doc_id is forced to object dtype as well: the abstracts table reads its
# trec_doc_id as object, and merging on that column needs both sides to hold
# strings rather than whatever type pandas would infer from the values.
run = pd.read_csv("/Users/ari/Downloads/TREC/trec2018/results/runs/hpipubcommon.trec_results", sep='\t', encoding="utf-8", header=None, 
                        names=["trec_topic_number", "x", "trec_doc_id", "order", "relevance_score", "run_name"], dtype={'trec_topic_number':object, 'trec_doc_id':object})
run.head()

Unnamed: 0,trec_topic_number,x,trec_doc_id,order,relevance_score,run_name
0,1,Q0,23403819,1,141.282288,hpipubcommon
1,1,Q0,22742884,2,128.239929,hpipubcommon
2,1,Q0,21639808,3,127.580528,hpipubcommon
3,1,Q0,24535907,4,122.982758,hpipubcommon
4,1,Q0,21635872,5,121.221413,hpipubcommon


In [149]:
# Attach title/abstract/MeSH columns to every run row. A left join keeps run
# rows whose document has no extracted abstract (their new columns are NaN).
abstractsRun = run.merge(abstracts, left_on=['trec_doc_id'], right_on=['trec_doc_id'], how='left')
# "order" and "x" are dropped from the merged table
abstractsRun.drop(["order", "x"], axis=1, inplace=True)
abstractsRun.head(5)

Unnamed: 0,trec_topic_number,trec_doc_id,relevance_score,run_name,title,abstract,major_mesh,minor_mesh
0,1,23403819,141.282288,hpipubcommon,BRAF(V600E) protein expression and outcome fro...,To examine the association between level and p...,,"Adult;Aged;Aged, 80 and over;Clinical Trials a..."
1,1,22742884,128.239929,hpipubcommon,Vemurafenib in patients with BRAF V600E mutati...,"Vemurafenib is an oral, small-molecule kinase ...",,Antineoplastic Agents/adverse effects;Antineop...
2,1,21639808,127.580528,hpipubcommon,Improved survival with vemurafenib in melanoma...,Phase 1 and 2 clinical trials of the BRAF kina...,,"Adult;Aged;Aged, 80 and over;Antineoplastic Ag..."
3,1,24535907,122.982758,hpipubcommon,Analysis of the BRAF V600E mutation in primary...,BRAF V600E is the most common mutation in cuta...,Molecular Targeted Therapy,Aged;Brazil;Female;Humans;Male;Melanoma/geneti...
4,1,21635872,121.221413,hpipubcommon,Acquired and intrinsic BRAF inhibitor resistan...,The discovery of activating BRAF V600E mutatio...,,Antineoplastic Agents/pharmacology;Gene Expres...


In [150]:
# Distinct run names found in the run file; runName[0] is used later to build
# the output file name.
runName = run["run_name"].unique()
runName

array(['hpipubcommon'], dtype=object)

# Read Information from 2018 Topics

In [151]:
topicsColumns = ['trec_topic_number', 'trec_topic_disease', 'trec_topic_age', 'trec_topic_sex']
topicsXML = etree.parse("/Users/ari/Downloads/TREC/trec-pm/resources/topics2018.xml")
# Collect one row per topic and build the DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and is no longer
# available in pandas 2.x.
topicRows = []
for topic in topicsXML.getroot():
    # Kept as a string, matching the object dtype of trec_topic_number in the run
    topicNumber = topic.get('number')
    disease = topic.find('disease').text
    # The demographic text is "<age> <sex>", e.g. "64-year-old male"
    demographic = topic.find('demographic').text.split(' ')
    age = demographic[0]
    sex = demographic[1]
    topicRows.append([topicNumber, disease, age, sex])
topics = pd.DataFrame(topicRows, columns=topicsColumns)
topics.head(1)

Unnamed: 0,trec_topic_number,trec_topic_disease,trec_topic_age,trec_topic_sex
0,1,melanoma,64-year-old,male


# Add 2018 Topics Information

In [152]:
# Attach the topic's disease, age and sex to every run row (left join on the
# topic number, so all run rows are kept).
processedRun = abstractsRun.merge(topics, left_on=['trec_topic_number'], right_on=['trec_topic_number'], how='left')
# Add a "score" column set to 0 for every row
# NOTE(review): presumably a placeholder to be filled in later - confirm.
processedRun['score'] = 0
processedRun.tail()

Unnamed: 0,trec_topic_number,trec_doc_id,relevance_score,run_name,title,abstract,major_mesh,minor_mesh,trec_topic_disease,trec_topic_age,trec_topic_sex,score
49995,50,25802479,77.349098,hpipubcommon,Identification of differentially expressed gen...,Acute lymphoblastic leukemia type B (B-ALL) is...,,"Adaptor Proteins, Vesicular Transport/genetics...",acute myeloid leukemia,13-year-old,male,0
49996,50,20425404,77.328011,hpipubcommon,The role of molecular tests in acute myelogeno...,The prognosis for patients with acute myelogen...,Gene Expression Profiling,"Acute Disease;Biomarkers, Tumor/genetics;Human...",acute myeloid leukemia,13-year-old,male,0
49997,50,AACR_2015-5408,77.304543,hpipubcommon,"The novel retinamide VNLG-152, which targets t...",Purpose: Acute myeloid leukemia (AML) with int...,,,acute myeloid leukemia,13-year-old,male,0
49998,50,16109776,77.278633,hpipubcommon,Mutations in nucleophosmin (NPM1) in acute mye...,Mutations in nucleophosmin NPM1 are the most f...,Oligonucleotide Array Sequence Analysis,Acute Disease;Adolescent;Adult;Age Factors;Age...,acute myeloid leukemia,13-year-old,male,0
49999,50,25301331,77.276886,hpipubcommon,Concealed dagger in FLT3/ITD+ AML.,"In this issue of Blood, Ostronoff et al report...",,"Female;Humans;Leukemia, Myeloid, Acute/metabol...",acute myeloid leukemia,13-year-old,male,0


# Save the Result into a new _.csv_

In [153]:
# Today's date as YYYYMMDD, used as a prefix of the output file name
date = time.strftime("%Y%m%d")
# Output file: <YYYYMMDD>processed<run name>.csv, tab-separated, without the
# index column. The target directory is a machine-specific absolute path.
processedRun.to_csv(path_or_buf='/Users/ari/Downloads/TREC/trec-pm/results/l2r/'+ date + 'processed'+runName[0]+'.csv', index=False, sep='\t')
