# Loading Model

In [1]:
import torch

path = "models/transformer_6.8M.pt"
# map_location="cpu" lets the checkpoint load on machines that lack the device it
# was saved from (e.g. a checkpoint written from a GPU opened on a CPU-only box).
# NOTE(review): torch.load unpickles the file - only open checkpoints from a trusted source.
checkpoint = torch.load(path, map_location="cpu")

In [2]:
from src.enums import CheckpointEnum

# Gather the stored hyperparameters as (label, value) rows; the labels are padded
# so that the values line up in a single column when printed.
head_dim = checkpoint[CheckpointEnum.D_MODEL] // checkpoint[CheckpointEnum.NUM_HEADS]
summary_rows = [
    ("Vocab Size:   ", checkpoint[CheckpointEnum.VOCAB_SIZE]),
    ("D_Model:      ", checkpoint[CheckpointEnum.D_MODEL]),
    ("Layers:       ", checkpoint[CheckpointEnum.NUM_BLOCKS]),
    ("Heads:        ", checkpoint[CheckpointEnum.NUM_HEADS]),
    ("Head Dim:     ", head_dim),
    ("Seq Len:      ", checkpoint[CheckpointEnum.SEQ_LEN]),
    ("FF Hidden:    ", checkpoint[CheckpointEnum.FF_HIDDEN_DIM]),
    ("Dropout:      ", checkpoint[CheckpointEnum.DROPOUT]),
]

for row_label, row_value in summary_rows:
    print(f"{row_label}{row_value}")

# The parameter count is stored under a plain string key; format it with "." as
# the thousands separator.
print(f"num_params:   {checkpoint['num_params']:,}".replace(",", "."))

Vocab Size:   4000
D_Model:      256
Layers:       6
Heads:        8
Head Dim:     32
Seq Len:      128
FF Hidden:    1024
Dropout:      0.1
num_params:   6.819.232


# Loading Tokenizer

In [4]:
from pathlib import Path

from src.utils import load_bpe_hugging_face_tokenizer

# Locate the tokenizer JSON relative to the current working directory.
BASE_DIR = Path.cwd()
TOKENIZER_DIR = BASE_DIR.joinpath("tokenizer")
path = TOKENIZER_DIR.joinpath("bpe_hugging_face_tokenizer.json")

tokenizer = load_bpe_hugging_face_tokenizer(path)

Loaded BPE hugging face tokenizer - vocab size: 4000


# Loading Data

In [10]:
from pathlib import Path

# The reviews file lives under ./data relative to the current working directory.
BASE_DIR = Path.cwd()
path = BASE_DIR.joinpath("data", "letterboxd_filtered.jsonl")

In [11]:
from src.utils.data_loader import read_file_only_reviews

# Load the reviews from the JSONL path set in the previous cell.
# Despite the singular name, `text` is used as a sequence of review strings:
# later cells index it (text[0]) and iterate over it one review at a time.
text = read_file_only_reviews(path)

Number of reviews: 564.702


In [12]:
# Show the first three reviews as a quick sanity check of the loaded data.
for preview_index in range(3):
    print(text[preview_index])

as soon as this film ended i went online and enlisted in the US army. no child will ever suffer like this again on my watch
Come and See is a film I find almost impossible to review. Describing watching a film as an 'experience' often detracts from the quality of the piece, but going by the profound effect the film had on me I really feel no other word can do it justice.
An apocalyptic nightmare of pure brutalizing evil shot and sequenced in intimate, hyperreal historical detail. Unbearable extreme close-ups of its witnesses and beautiful roaming camera moves (ensuring we see even what the characters maybe miss) that in tandem form a series of vivid, unsparing, and surreal vignettes around the psychological experience of unimaginably barbaric horror being made tangible. You can practically smell the mud and fire and corpses, and just like the young boy in this film I too feel like I just aged 100 years in 2 hours.


## Analysing the Data

In [13]:
from collections import Counter

# Count how often each exact review string occurs and display the ten most
# frequent ones; any count above 1 indicates a verbatim duplicate review.
Counter(text).most_common(10)

[('as soon as this film ended i went online and enlisted in the US army. no child will ever suffer like this again on my watch',
  1),
 ("Come and See is a film I find almost impossible to review. Describing watching a film as an 'experience' often detracts from the quality of the piece, but going by the profound effect the film had on me I really feel no other word can do it justice.",
  1),
 ('An apocalyptic nightmare of pure brutalizing evil shot and sequenced in intimate, hyperreal historical detail. Unbearable extreme close-ups of its witnesses and beautiful roaming camera moves (ensuring we see even what the characters maybe miss) that in tandem form a series of vivid, unsparing, and surreal vignettes around the psychological experience of unimaginably barbaric horror being made tangible. You can practically smell the mud and fire and corpses, and just like the young boy in this film I too feel like I just aged 100 years in 2 hours.',
  1),
 ('One of those where you finish it and

In [14]:
# Look for reviews that consist solely of the spoiler-warning boilerplate.
target_text = "This review may contain spoilers.I can handle the truth."
MAX_MATCHES = 5  # stop scanning once this many exact matches have been reported

print(f"Searching for occurrences of: '{target_text}'\n")

count = 0

for index, review in enumerate(text):
    if review != target_text:
        continue

    print(f"Found at Index {index}: {review}")
    count += 1

    # The counter only changes on a match, so checking the limit here is enough.
    if count >= MAX_MATCHES:
        break

print(f"count: {count}")

Searching for occurrences of: 'This review may contain spoilers.I can handle the truth.'

count: 0


In [15]:
import re

# Join all reviews into one lower-cased string so the word counts are case-insensitive.
full_text = " ".join(text).lower()

# Every maximal run of word characters counts as one word.
words = re.findall(r"\w+", full_text)

word_counts = Counter()
word_counts.update(words)

# Report the ten most frequent words, using "." as the thousands separator.
top_words = word_counts.most_common(10)
for word, count in top_words:
    print(f"{word}: {count:,}".replace(",", "."))

the: 1.252.140
a: 841.196
and: 721.303
of: 671.811
to: 546.293
i: 457.224
it: 446.417
in: 382.707
is: 367.226
this: 366.725


In [None]:
import pandas as pd


def get_review_lengths(texts: list[str]) -> list[int]:
    """Return the whitespace-delimited word count of each review.

    Args:
        texts: Review strings to measure.

    Returns:
        One integer per input string, in input order: the number of pieces
        produced by ``str.split()`` (empty or whitespace-only strings give 0).
    """
    return [len(review.split()) for review in texts]


# Word-level length statistics, using the helper defined just above in this cell.
# get_token_lengths belongs to the token analysis further down and is not yet
# defined at this point when the notebook is run top to bottom.
lengths = get_review_lengths(text)

df_lengths = pd.Series(lengths)

print(df_lengths.describe())

## Analysing Tokens

In [16]:
import pandas as pd


def get_token_lengths(texts: list[str], tokenizer, batch_size: int = 1000) -> list[int]:
    """Return how many token ids each text encodes to.

    The texts are handed to ``tokenizer.encode_batch`` in consecutive slices of
    at most ``batch_size`` items, and the length of each result's ``ids`` is
    collected in input order.

    Args:
        texts: Strings to tokenize.
        tokenizer: Object exposing ``encode_batch(list_of_str)``; each returned
            item must have an ``ids`` attribute that supports ``len()``.
        batch_size: Maximum number of texts per ``encode_batch`` call; must be >= 1.

    Returns:
        One integer per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make range() empty and silently return no lengths,
    # so reject non-positive batch sizes explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    lengths: list[int] = []

    for start in range(0, len(texts), batch_size):
        encoded_batch = tokenizer.encode_batch(texts[start : start + batch_size])
        lengths.extend(len(encoded.ids) for encoded in encoded_batch)

    return lengths


# Token-level length distribution across all loaded reviews.
lengths = get_token_lengths(text, tokenizer)
df_lengths = pd.Series(lengths)

token_length_summary = df_lengths.describe()
print(token_length_summary)

count    564702.000000
mean         69.312297
std          47.153609
min          15.000000
25%          34.000000
50%          54.000000
75%          91.000000
max        1192.000000
dtype: float64
