In [2]:
import glob
import pickle
from tqdm import tqdm

from sqlalchemy import Column, Integer, String, Date, Text, ARRAY, FLOAT

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

from db_utils import WapoQueries, WapoDocEmbeddings, gen_session, refresh_db, get_wapo_doc2query, get_wapo_emb

import ir_datasets
import zipfile

import pyterrier as pt

In [3]:
# One-time setup: uncomment to create the `datasets` database on host `postgres`.
#! createdb -h postgres datasets

In [4]:
# Base SQLAlchemy URL of the PostgreSQL server (user `root`, host `postgres`, port 5432).
# The database name is appended to this prefix wherever an engine is created.
server_address = "postgresql://root@postgres:5432/"

In [5]:
# Session for the `datasets` database, built by the project helper in db_utils.
# It is used further down through .add() / .commit() to insert rows.
session = gen_session(
    server_address=server_address,
    db_name="datasets",
)

In [6]:
# Declarative base class that the table models in this notebook inherit from.
# `sqlalchemy.ext.declarative.declarative_base` is deprecated since SQLAlchemy 1.4
# and emits a warning on 2.0, so prefer the `sqlalchemy.orm` location and fall
# back to the legacy module only on installs that predate it.
try:
    from sqlalchemy.orm import declarative_base
except ImportError:  # SQLAlchemy < 1.4
    from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

  Base = declarative_base()


In [7]:
#define tables
class WapoEntry(Base):
    """ORM model for one Washington Post document, stored in the ``wapo`` table.

    The columns correspond to the document fields copied in when the table is
    filled: ``doc_id``, ``url``, ``title``, ``author``, ``kicker`` and ``body``.
    """

    __tablename__ = 'wapo'
    # Document identifier; primary key, so it must be unique per row.
    doc_id = Column(String, primary_key=True)
    url = Column(String)
    title = Column(String)
    author = Column(String)
    kicker = Column(Text)
    body = Column(Text)

    def __repr__(self):
        """Return a ``field=value`` listing of every column, separated by ``", "``."""
        # The trailing space after the author field keeps the separator
        # consistent across the two concatenated f-strings.
        return (
            f"doc_id={self.doc_id}, url={self.url}, title={self.title}, author={self.author}, "
            f"kicker={self.kicker}, body={self.body}"
        )
    
class NytEntry(Base):
    """ORM model for one NYT document, stored in the ``nyt`` table."""

    __tablename__ = 'nyt'
    # Document identifier; primary key, so it must be unique per row.
    doc_id = Column(String, primary_key=True)
    headline = Column(String)
    body = Column(String)

    def __repr__(self):
        """Return a comma-separated ``field=value`` listing of the row."""
        return f"doc_id={self.doc_id}, headline={self.headline}, body={self.body}"

In [8]:
#create tables

# Engine bound to the `datasets` database on the configured server.
engine = create_engine(f"{server_address}datasets")
# Emit CREATE TABLE for every model registered on `Base`.
Base.metadata.create_all(engine)

In [8]:
# Copy the raw corpus archives into the per-dataset folders under /root/.ir_datasets.
# `cp` fails when the target directory is missing, so create both folders first.
!mkdir -p /root/.ir_datasets/wapo/ /root/.ir_datasets/nyt/
!cp /workspace/data/wapo/WashingtonPost.v2.tar.gz /root/.ir_datasets/wapo/
!cp /workspace/data/nyt/nyt.tgz /root/.ir_datasets/nyt/

In [9]:
# Dataset handles from ir_datasets; the doc stores, document counts and
# document iterators used below are all obtained from these two objects.
dataset_nyt = ir_datasets.load("nyt")
dataset_wapo = ir_datasets.load("wapo/v2")

In [10]:
# Unpack the zipped `nyt.tgz.pklz4` bundle into the ir_datasets NYT folder.
with zipfile.ZipFile(
    file="/workspace/data/nyt/nyt.tgz.pklz4.zip",
    mode="r",
) as archive:
    archive.extractall(path="/root/.ir_datasets/nyt/")

In [11]:
# Unpack the zipped `WashingtonPost.v2.tar.gz.pklz4` bundle into the ir_datasets WaPo folder.
with zipfile.ZipFile(
    file="/workspace/data/wapo/WashingtonPost.v2.tar.gz.pklz4.zip",
    mode="r",
) as archive:
    archive.extractall(path="/root/.ir_datasets/wapo/")

In [12]:
# Doc stores give lookup of a single document by its doc_id (see the .get() calls below).
docstore_nyt = dataset_nyt.docs_store()
docstore_wapo = dataset_wapo.docs_store()

In [13]:
# Spot check: look up NYT document "532" in the doc store and display its headline.
docstore_nyt.get("532").headline

'DEATH TOLL AT 95 IN SAN JUAN BLAZE AS SEARCH GOES ON'

In [14]:
# Spot check: fetch one WaPo document by its id and display the whole record.
docstore_wapo.get("4b0d69ae-d85a-11e2-a9f2-42ee3912ae0e")

WapoDoc(doc_id='4b0d69ae-d85a-11e2-a9f2-42ee3912ae0e', url='https://www.washingtonpost.com/politics/2013/07/14/4b0d69ae-d85a-11e2-a9f2-42ee3912ae0e_story.html', title='Reid-McConnell clash latest evidence genteel Senate has turned into a fight club', author='Paul Kane', published_date=1373846443000, kicker='Politics', body='The 2014 midterm elections are just around the corner. These are the races to watch.\nThe U.S. Senate, once considered the most exclusive and chummiest club in America, has in recent years been transformed into an ideological war zone, where comity and compromise have lost their allure, while confrontation and showmanship now pay big dividends.\n\n\nReid’s move\nSuddenly, the world’s greatest deliberative body just isn’t that much fun anymore.\nA three-minute mashup of a spat between Senate Majority Leader Harry Reid and Minority Leader Mitch McConnell on the Senate floor.\n“I miss it like an abscessed tooth,” former senator Christopher S. Bond (R-Mo.), a 24-year ve

In [19]:
#fill database with entries
cnt = 0  # number of rows added so far; drives the batched commits

with tqdm(total=dataset_wapo.docs_count()) as tbar:
    for cnt, doc in enumerate(dataset_wapo.docs_iter(), start=1):
        entry = WapoEntry(
            doc_id=doc.doc_id,
            url=doc.url,
            title=doc.title,
            author=doc.author,
            kicker=doc.kicker,
            body=doc.body,
        )
        session.add(entry)
        # Commit after every 1000 added rows instead of once per row.
        if not cnt % 1000:
            session.commit()
        tbar.update()
    # Persist whatever remains from the last partial batch.
    session.commit()

100%|██████████| 595037/595037 [01:33<00:00, 6373.37it/s] 


In [15]:
# Load NYT documents into the `nyt` table. doc_id is the primary key, so
# repeated ids are skipped: only the first document seen for an id is stored.
cnt = 0  # number of rows actually added (duplicates are not counted)
fin_docs = set()  # doc_ids that have already been added to the session

with tqdm(total=dataset_nyt.docs_count()) as tbar:
    for doc in dataset_nyt.docs_iter():
        if doc.doc_id not in fin_docs:
            session.add(NytEntry(doc_id=doc.doc_id, headline=doc.headline, body=doc.body))
            fin_docs.add(doc.doc_id)
            cnt += 1
            # Commit after every 1000 *new* rows. The check lives inside this
            # branch so that skipped duplicates do not trigger a commit on
            # every iteration while cnt sits on a multiple of 1000.
            if cnt % 1000 == 0:
                session.commit()
        # The progress bar counts every document read, including skipped ones.
        tbar.update()
    # Persist whatever remains from the last partial batch.
    session.commit()

100%|██████████| 1864661/1864661 [04:05<00:00, 7591.33it/s]


In [None]:
#create indices

In [16]:
# Initialise PyTerrier only if it has not been started in this kernel yet.
if not pt.started():
    pt.init()

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [17]:
# Directory the NYT Terrier index is written to.
index_location_nyt = '/root/.ir_datasets/nyt/'

# Construct the indexer once, with its full configuration. A bare
# `pt.IterDictIndexer(index_location_nyt)` beforehand would only be overwritten.
indexer_nyt = pt.IterDictIndexer(
    index_location_nyt,
    verbose=True,
    # Metadata keys to record per document, with the length reserved for each.
    meta={"docno": 26, "body": 100000},
    meta_tags={"body": "ELSE"},
    blocks=True,
)
# PyTerrier wrapper around the ir_datasets NYT corpus; its corpus iterator feeds the indexer.
ds_corpus_nyt = pt.get_dataset('irds:nyt')

In [18]:
# Build the NYT index from the corpus iterator, passing `headline` and `body` as the fields.
# The commented-out line is the alternative of pointing at the index location
# directly instead of indexing again.
index_ref_nyt = indexer_nyt.index(ds_corpus_nyt.get_corpus_iter(), fields=['headline', 'body'])
#index_ref_nyt = pt.IndexRef.of('/root/.ir_datasets/nyt/')

nyt documents:   0%|          | 3151/1864661 [00:01<12:17, 2525.29it/s]

In [15]:
# Directory the WaPo Terrier index is written to.
index_location_wapo = '/root/.ir_datasets/wapo/'

# Construct the indexer once, with its full configuration. A bare
# `pt.IterDictIndexer(index_location_wapo)` beforehand would only be overwritten.
indexer_wapo = pt.IterDictIndexer(
    index_location_wapo,
    verbose=True,
    # Metadata keys to record per document, with the length reserved for each.
    meta={"docno": 70, "body": 100000},
    meta_tags={"body": "ELSE"},
    blocks=True,
)
# PyTerrier wrapper around the ir_datasets WaPo v2 corpus; its corpus iterator feeds the indexer.
ds_corpus_wapo = pt.get_dataset('irds:wapo/v2')

In [16]:
# Build the WaPo index from the corpus iterator, passing `title`, `body` and `kicker` as the fields.
# The commented-out line is the alternative of pointing at the index location
# directly instead of indexing again.
index_ref_wapo = indexer_wapo.index(ds_corpus_wapo.get_corpus_iter(), fields=['title', 'body', 'kicker'])
#index_ref_wapo = pt.IndexRef.of(index_location_wapo)

wapo/v2 documents:  89%|████████▊ | 527298/595037 [07:29<01:26, 779.87it/s] 