In [4]:
import pyterrier as pt
import gdown
import zipfile
import ir_datasets
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from db_utils import Base
import os

In [5]:
# Call pt.init() only when PyTerrier has not been started yet, which guards
# against a second init when this cell is re-run in the same kernel.
if not pt.started():
  pt.init()

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
!pwd

/


In [6]:
# Google Drive link and local target path for the WashingtonPost v2 archive.
# The actual gdown download call is commented out.
url = "https://drive.google.com/file/d/16j6dzHIVxlULuIExZl8aJyconEbC7zs4/view?usp=sharing"
output = "/root/.ir_datasets/wapo/WashingtonPost.v2.tar.gz"
#gdown.download(url, output, quiet=False, fuzzy=True)

download_docstore = True

# With the download/extract calls below commented out, this branch only
# rebinds `url` and `output` to the docstore archive; nothing is fetched.
if download_docstore:
  url = "https://drive.google.com/file/d/1LPXK1shthTm-_yH4aA0SQJ0jU_btRPvh/view?usp=sharing"
  output = "WashingtonPost.v2.tar.gz.pklz4.zip"
  #gdown.download(url, output, quiet=False, fuzzy=True)

  #with zipfile.ZipFile(output,"r") as zip_ref:
  #    zip_ref.extractall("/root/.ir_datasets/wapo/")

# ir_datasets handle for the collection; presumably expects the archive to be
# present under /root/.ir_datasets/wapo/ already — TODO confirm.
dataset = ir_datasets.load("wapo/v2")
# Disabled: build a PyTerrier index over the corpus fields listed below.
#indexer = pt.IterDictIndexer('/workspace/indices/wapo_v2', meta={"docno": 36})
#index_ref = indexer.index(dataset.get_corpus_iter(), fields=['url', 'title', 'author', 'kicker', 'body'])

In [6]:
#now build database

In [7]:
#! createdb -h postgres wapo

In [8]:
# SQLAlchemy connection URL: user "root" (no password in the URL), host
# "postgres", port 5432, database "wapo".
conn_string = 'postgresql://root@postgres:5432/' + "wapo"
engine = create_engine(conn_string)

In [9]:
def recreate_database():
    """Drop all tables registered on ``Base.metadata`` and create them again.

    Operates on the module-level ``engine``. Destructive: the tables are
    dropped before being re-created, so any rows they held are gone.
    """
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)

#recreate_database()

In [10]:
# Session factory bound to `engine`, and the one session instance that the
# document import loop adds to and commits.
Session = sessionmaker(bind=engine)
session = Session()

In [11]:
from db_utils import parse_wapo_entry
from db_utils import WapoEntry
from db_utils import get_wapo_entry

In [12]:
# Stream every document of the collection into the database.
# Rows are committed in batches of 1000 and the running count is echoed on a
# single, carriage-return-refreshed line.
cnt = 0

for cnt, doc in enumerate(dataset.docs_iter(), start=1):
    session.add(WapoEntry(**parse_wapo_entry(doc)))
    if cnt % 1000 != 0:
        continue
    session.commit()
    print(f"\r {cnt} \t", end="")

# Commit whatever is left of the final, partial batch.
session.commit()

 595000 	

In [13]:
#prepare topics and qrels

In [7]:
# NOTE(review): this rebinds `dataset` from the ir_datasets object loaded
# earlier to a PyTerrier dataset. The document import loop calls
# dataset.docs_iter(), so it presumably cannot be re-run after this cell — confirm.
dataset = pt.get_dataset('irds:wapo/v2/trec-core-2018')

In [15]:
# Sum, over all documents, the number of pieces each body yields when split
# on single spaces.
cnt = 0
total_words = sum(
    len(doc['body'].split(" "))
    for doc in dataset.get_corpus_iter()
)


wapo/v2/trec-core-2018 documents: 100%|██████████| 595037/595037 [00:27<00:00, 21415.00it/s]


In [17]:
# Average number of space-separated tokens per document. 595037 is the document
# count shown by the progress bar of the corpus iteration.
total_words/595037

475.7733922428353

In [8]:
#transform topics/qrels
# get_topics() is called without a variant, so the frame keeps the separate
# title / description / narrative columns (PyTerrier warns about this) rather
# than a single `query` column.

qrels = dataset.get_qrels()
topics = dataset.get_topics()

There are multiple query fields available: ('title', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


In [9]:
topics

Unnamed: 0,qid,title,description,narrative
0,321,Women in Parliaments,Pertinent documents will reflect the fact that...,Pertinent documents relating to this issue wil...
1,336,Black Bear Attacks,A relevant document would discuss the frequenc...,It has been reported that food or cosmetics so...
2,341,Airport Security,A relevant document would discuss the effectiv...,A relevant document would contain reports on w...
3,347,Wildlife Extinction,The spotted owl episode in America highlighted...,"A relevant item will specify the country, the ..."
4,350,Health and Computer Terminals,Is it hazardous to the health of individuals t...,Relevant documents would contain any informati...
5,362,human smuggling,Identify incidents of human smuggling.,A relevant document shows an incident of human...
6,363,transportation tunnel disasters,What disasters have occurred in tunnels used f...,A relevant document identifies a disaster in a...
7,367,piracy,What modern instances have there been of old f...,Documents discussing piracy on any body of wat...
8,375,hydrogen energy,What is the status of research on hydrogen as ...,A relevant document will describe progress in ...
9,378,euro opposition,Identify documents that discuss opposition to ...,A relevant document should include the countri...


In [27]:
# Line format: cnt, 1, topic, query
# One line per topic (cnt is always 1 here). Also records the mapping from the
# original qid to the 1-based topic number that is written to the file.
qid_map = {}

with open("./workspace/data/wapo/title_queries", "w") as out_file:
    for idx, topic in topics.iterrows():
        topic_no = idx + 1
        qid_map[topic['qid']] = topic_no
        out_file.write(f"1,1,{topic_no},{topic['title']}\n")

In [17]:
# generate term set



In [33]:
def generate_100_naive_queries(title, term_set, n_queries=100):
    """Build distinct query variants by appending expansion terms to ``title``.

    Candidates are produced in a fixed order and duplicates are skipped:

    1. the bare ``title``;
    2. ``"<title> <t>"`` for every term ``t`` in ``term_set`` order;
    3. ``"<title> <a> <b>"`` for every ordered pair of terms (including
       ``a == b``), with the first term varying slowest.

    Args:
        title: Base query string; always the first element of the result.
        term_set: Iterable of expansion terms. Its iteration order determines
            the order of the generated queries.
        n_queries: Maximum number of queries to return (default 100).

    Returns:
        A list of at most ``n_queries`` distinct query strings. The list is
        shorter than ``n_queries`` when the terms cannot produce enough
        distinct candidates (e.g. an empty ``term_set`` yields ``[title]``);
        the function terminates in that case instead of looping forever.
    """
    # Materialise once so the terms can be iterated several times, whatever
    # kind of iterable the caller passed in.
    terms = list(term_set)

    naive_queries = [title]
    seen = {title}  # O(1) duplicate check alongside the ordered result list

    def candidates():
        # Lazily yield the candidates so only as many pairs are built as needed.
        for term in terms:
            yield f"{title} {term}"
        for first in terms:
            for second in terms:
                yield f"{title} {first} {second}"

    for query in candidates():
        if len(naive_queries) >= n_queries:
            break
        if query not in seen:
            seen.add(query)
            naive_queries.append(query)

    return naive_queries

In [None]:
# Preview the generated queries for the first topic.
# dict.fromkeys de-duplicates while keeping first-occurrence order, so the
# term order (and therefore the displayed queries) is the same in every kernel
# session; iterating a set of strings is not stable across sessions.
first_topic = topics.iloc[0]
term_set = list(dict.fromkeys(
    first_topic['description'].split(' ') + first_topic['narrative'].split(' ')
))
generate_100_naive_queries(first_topic['title'], term_set)

In [36]:
# Line format: cnt, 1, topic, query
# Generate up to 100 query variants per topic from its description and narrative.
qid_map = {}

with open("./workspace/data/wapo/title_queries_naive_100", "w") as f:
    for i, row in topics.iterrows():
        qid_map[row['qid']] = i+1

        # Replace commas/periods with spaces before tokenising, then split on
        # any whitespace: this yields no empty tokens (which produced queries
        # with trailing or doubled spaces) and no token with an embedded
        # newline (which would break the one-query-per-line format).
        text = f"{row['description']} {row['narrative']}".replace(",", " ").replace(".", " ")
        # Ordered de-duplication keeps the output file identical across runs;
        # set iteration order for strings changes between kernel sessions.
        term_set = list(dict.fromkeys(text.split()))

        naive_100 = generate_100_naive_queries(row['title'], term_set)

        # Iterate over what was actually generated instead of assuming 100 items.
        for j, query in enumerate(naive_100, start=1):
            f.write(f"{j},1,{i+1},{query}\n")

In [None]:
# Write one file per topic, named topic.<1-based number>, containing only the
# topic title.
for idx, topic in topics.iterrows():
    topic_path = f"./workspace/data/wapo/topics/topic.{idx+1}"
    with open(topic_path, "w") as topic_file:
        topic_file.write(topic['title'])

In [29]:
# Line format: topic 0 docno label
# The original qid is translated to the topic number stored in qid_map.
with open("./workspace/data/wapo/wapo_qrels", "w") as qrels_file:
    for _, judgement in qrels.iterrows():
        topic_no = qid_map[judgement['qid']]
        line = f"{topic_no} 0 {judgement['docno']} {judgement['label']}\n"
        qrels_file.write(line)