In [1]:
import glob
import pickle
from tqdm import tqdm

from sqlalchemy import Column, Integer, String, Date, Text, ARRAY, FLOAT

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

from db_utils import WapoQueries, WapoDocEmbeddings, gen_session, refresh_db, get_wapo_doc2query, get_wapo_emb

import ir_datasets
import zipfile

In [2]:
#! createdb -h postgres datasets

In [3]:
server_address = 'postgresql://root@postgres:5432/'

In [4]:
session = gen_session(server_address=server_address, db_name="datasets")

In [5]:
Base = declarative_base()

  Base = declarative_base()


In [6]:
#define tables
class WapoEntry(Base):
    __tablename__ = 'wapo'
    doc_id = Column(String, primary_key=True)
    url = Column(String)
    title = Column(String)
    author = Column(String)
    kicker = Column(Text)
    body = Column(Text)
    
    def __repr__(self):
        repr_str = f"doc_id={self.doc_id}, url={self.url}, title={self.title}, author={self.author},"\
        f"kicker={self.kicker}, body={self.body}"
        
        return repr_str
    
class NytEntry(Base):
    __tablename__ = 'nyt'
    doc_id = Column(String, primary_key=True)
    headline = Column(String)
    body = Column(String)
    
    def __repr__(self):
        repr_str = f"doc_id={self.doc_id}, headline={self.headline}, body={self.body}"

        return repr_str

In [7]:
#create tables

engine = create_engine(server_address + "datasets")
Base.metadata.create_all(engine)

In [8]:
dataset_nyt = ir_datasets.load("nyt")
dataset_wapo = ir_datasets.load("wapo/v2")

In [9]:
with zipfile.ZipFile("/workspace/data/nyt/nyt.tgz.pklz4.zip","r") as zip_ref:
      zip_ref.extractall("/root/.ir_datasets/nyt/")

In [10]:
with zipfile.ZipFile("/workspace/data/wapo/WashingtonPost.v2.tar.gz.pklz4.zip","r") as zip_ref:
      zip_ref.extractall("/root/.ir_datasets/wapo/")

In [11]:
docstore_nyt = dataset_nyt.docs_store()
docstore_wapo = dataset_wapo.docs_store()

In [None]:
docstore_nyt.get("532").headline

In [None]:
docstore_wapo.get("4b0d69ae-d85a-11e2-a9f2-42ee3912ae0e")

In [None]:
#fill database with entries
cnt = 0

with tqdm(total=dataset_wapo.docs_count()) as tbar:
    for doc in dataset_wapo.docs_iter():
        session.add(WapoEntry(doc_id = doc.doc_id, url=doc.url, title=doc.title, author=doc.author, kicker=doc.kicker, body=doc.body))
        cnt+=1
        if cnt%1000 == 0:
            session.commit()
        tbar.update()
    session.commit()

In [12]:
cnt = 0
#here we need to check for duplicates first. only the first doc with a unique doc_id will remain in the table
fin_docs = set()

with tqdm(total=dataset_wapo.docs_count()) as tbar:
    for doc in dataset_nyt.docs_iter():
        if doc.doc_id not in fin_docs:
            session.add(NytEntry(doc_id = doc.doc_id, headline=doc.headline, body=doc.body))
            fin_docs.add(doc.doc_id )
            cnt+=1

        if cnt%1000 == 0:
            session.commit()
        tbar.update()
    session.commit()

 75%|███████▍  | 443956/595037 [01:07<00:31, 4857.11it/s]

In [None]:
#create indices

In [None]:
a