In [1]:
# Move the working directory one level up; the cells below read files through
# relative paths such as "data/raw/hotpotqa/...".
# NOTE(review): the target is relative to the current directory, so running this
# cell again without restarting the kernel moves up one more level each time.
%cd ../

/Users/hoangle/Projects/recsys


In [None]:
import polars as pl
from transformers import AutoTokenizer
from transformers import BertModel
from torch import nn
import torch
import lightning as L
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.nn import Module
from torch import Tensor
from polars import DataFrame
from loguru import logger
from lightning.pytorch.callbacks import RichProgressBar, ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

In [None]:
# Hugging Face checkpoint name used to load the tokenizer below.
MODEL_NAME = 'bert-base-cased'
# Maximum number of tokens kept per query (passed as `max_length` when tokenizing).
# NOTE(review): presumably the ColBERT-style fixed query length "Nq" — confirm.
Nq = 32

# Load and preprocess the raw data

In [3]:
# Read the HotpotQA queries (one JSON object per line) into a polars DataFrame.
queries = pl.read_ndjson(source="data/raw/hotpotqa/queries.jsonl")

# Preview the first five rows.
queries.head(5)

_id,text,metadata
str,str,struct[2]
"""5ab6d31155429954757d3384""","""What country of origin does Ho…","{""American"",[[""House of Cosbys"", ""0""], [""Bill Cosby"", ""0""]]}"
"""5ac0d92f554299012d1db645""","""How many fountains where prese…","{""1,200 musical water fountains"",[[""Steve Davison"", ""0""], [""Steve Davison"", ""1""], … [""World of Color"", ""2""]]}"
"""5abd01335542993a06baf9fc""","""Chris Larceny directed the mus…","{""the Fugees"",[[""Chris Larceny"", ""3""], [""Wyclef Jean"", ""0""], [""Wyclef Jean"", ""2""]]}"
"""5abff8c95542994516f4555c""","""The person where local traditi…","{""the Iroquois Confederacy"",[[""Cross Lake"", ""1""], [""Hiawatha"", ""0""]]}"
"""5adec8ad55429975fa854f8f""","""The actor who played Carl Swee…","{""Denise DeClue"",[[""About Last Night (1986 film)"", ""1""], [""Tim Kazurinsky"", ""0""]]}"


In [4]:
# Prefix every query text with the literal "[CLS] [Q] " markers.
# `queries` is overwritten from itself, so any marker that is already present is
# stripped first; this keeps the cell idempotent (re-running it without
# re-loading the data no longer stacks "[CLS] [Q] [CLS] [Q] ...").
queries = (
    queries
    .with_columns(
        pl.format(
            "[CLS] [Q] {}",
            pl.col('text').str.strip_prefix("[CLS] [Q] "),
        ).alias('text')
    )
)

queries.head()

_id,text,metadata
str,str,struct[2]
"""5ab6d31155429954757d3384""","""[CLS] [Q] What country of orig…","{""American"",[[""House of Cosbys"", ""0""], [""Bill Cosby"", ""0""]]}"
"""5ac0d92f554299012d1db645""","""[CLS] [Q] How many fountains w…","{""1,200 musical water fountains"",[[""Steve Davison"", ""0""], [""Steve Davison"", ""1""], … [""World of Color"", ""2""]]}"
"""5abd01335542993a06baf9fc""","""[CLS] [Q] Chris Larceny direct…","{""the Fugees"",[[""Chris Larceny"", ""3""], [""Wyclef Jean"", ""0""], [""Wyclef Jean"", ""2""]]}"
"""5abff8c95542994516f4555c""","""[CLS] [Q] The person where loc…","{""the Iroquois Confederacy"",[[""Cross Lake"", ""1""], [""Hiawatha"", ""0""]]}"
"""5adec8ad55429975fa854f8f""","""[CLS] [Q] The actor who played…","{""Denise DeClue"",[[""About Last Night (1986 film)"", ""1""], [""Tim Kazurinsky"", ""0""]]}"


In [5]:
# Read the HotpotQA corpus (one JSON object per line) into a polars DataFrame.
corpus = pl.read_ndjson(source="data/raw/hotpotqa/corpus.jsonl")

# Preview the first five rows.
corpus.head(5)

_id,title,text,metadata
str,str,str,struct[1]
"""12""","""Anarchism""","""Anarchism is a political philo…","{""https://en.wikipedia.org/wiki?curid=12""}"
"""25""","""Autism""","""Autism is a neurodevelopmental…","{""https://en.wikipedia.org/wiki?curid=25""}"
"""39""","""Albedo""","""Albedo ( ) is a measure for re…","{""https://en.wikipedia.org/wiki?curid=39""}"
"""290""","""A""","""A (named , plural ""As"", ""A's"",…","{""https://en.wikipedia.org/wiki?curid=290""}"
"""303""","""Alabama""","""Alabama ( ) is a state in the …","{""https://en.wikipedia.org/wiki?curid=303""}"


In [6]:
# Prefix every document text with the literal "[CLS] [D] " markers.
# `corpus` is overwritten from itself, so any marker that is already present is
# stripped first; this keeps the cell idempotent (re-running it without
# re-loading the data no longer stacks "[CLS] [D] [CLS] [D] ...").
corpus = (
    corpus
    .with_columns(
        pl.format(
            "[CLS] [D] {}",
            pl.col('text').str.strip_prefix("[CLS] [D] "),
        ).alias('text')
    )
)

corpus.head()

_id,title,text,metadata
str,str,str,struct[1]
"""12""","""Anarchism""","""[CLS] [D] Anarchism is a polit…","{""https://en.wikipedia.org/wiki?curid=12""}"
"""25""","""Autism""","""[CLS] [D] Autism is a neurodev…","{""https://en.wikipedia.org/wiki?curid=25""}"
"""39""","""Albedo""","""[CLS] [D] Albedo ( ) is a meas…","{""https://en.wikipedia.org/wiki?curid=39""}"
"""290""","""A""","""[CLS] [D] A (named , plural ""A…","{""https://en.wikipedia.org/wiki?curid=290""}"
"""303""","""Alabama""","""[CLS] [D] Alabama ( ) is a sta…","{""https://en.wikipedia.org/wiki?curid=303""}"


## Tokenize

In [None]:


# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Register the query/document markers as regular (non-special) added tokens so
# each marker is kept as a single token instead of being split into word pieces.
tokenizer.add_tokens(['[Q]', '[D]'], special_tokens=False)
# Reuse the [MASK] token as the padding token.
# NOTE(review): presumably for ColBERT-style query augmentation — confirm.
tokenizer.add_special_tokens({'pad_token': tokenizer.mask_token})
# Last expression of the cell: display the tokenizer's added vocabulary.
tokenizer.get_added_vocab()

# TODO: HoangLe [Feb-20]: Add this to the implementation
# (tokens were added above, so the model's embedding matrix has to be resized)
# model = BertModel.from_pretrained(MODEL_NAME)
# model.resize_token_embeddings(len(tokenizer))

{'[PAD]': 0,
 '[UNK]': 100,
 '[CLS]': 101,
 '[SEP]': 102,
 '[MASK]': 103,
 '[Q]': 28996,
 '[D]': 28997}

In [None]:
# Take the first prefixed query as a sample for the tokenization check below.
# Index the Series directly rather than converting the whole column to a Python
# list just to read one element.
sentence = queries['text'][0]
sentence

'[CLS] [Q] What country of origin does House of Cosbys and Bill Cosby have in common?'

In [None]:
# Tokenize the sample query and display the resulting tokens.
# `add_special_tokens=False`: the text already starts with a literal "[CLS]", so
# the tokenizer must not insert its own special tokens.
# NOTE(review): this also means no "[SEP]" is appended — confirm that is intended.
# NOTE(review): `padding_side` is set but `padding` is not, so nothing is padded
# here (the displayed output is shorter than Nq); pass `padding='max_length'` if
# fixed-length, [MASK]-padded queries are wanted.
queries_str = tokenizer(
    sentence,
    add_special_tokens=False,
    padding_side='right',
    max_length=Nq,  # upper bound on the number of tokens kept
    truncation=True  # cut off anything beyond max_length
)
# Map the ids back to token strings to inspect the tokenization by eye.
tokenizer.convert_ids_to_tokens(queries_str['input_ids'])

['[CLS]',
 '[Q]',
 'What',
 'country',
 'of',
 'origin',
 'does',
 'House',
 'of',
 'Co',
 '##sby',
 '##s',
 'and',
 'Bill',
 'Co',
 '##sby',
 'have',
 'in',
 'common',
 '?']

In [None]:
c