# Loading Model

In [9]:
import torch

# Checkpoint file to inspect; the relative path is resolved against the
# notebook's current working directory.
path = "transformer_6.8M.pt"

# map_location="cpu" remaps every stored tensor onto the CPU, so the checkpoint
# opens even on a machine that lacks the device it was saved from.
# NOTE(review): torch.load unpickles the file - only open checkpoints from a
# trusted source.
checkpoint = torch.load(path, map_location="cpu")

Vocab Size:   4000
D_Model:      256
Layers:       6
Heads:        8
Head Dim:     32
Seq Len:      128
FF Hidden:    1024
Dropout:      0.1
num_params:   6.819.232


In [2]:
from src.enums import CheckpointEnum

# Hyperparameters stored in the checkpoint, as (label, value) rows.
# Labels are pre-padded to a common width so the printed values line up.
summary_rows = [
    ("Vocab Size:   ", checkpoint[CheckpointEnum.VOCAB_SIZE]),
    ("D_Model:      ", checkpoint[CheckpointEnum.D_MODEL]),
    ("Layers:       ", checkpoint[CheckpointEnum.NUM_BLOCKS]),
    ("Heads:        ", checkpoint[CheckpointEnum.NUM_HEADS]),
    # Head dimension is D_MODEL integer-divided by NUM_HEADS.
    ("Head Dim:     ", checkpoint[CheckpointEnum.D_MODEL] // checkpoint[CheckpointEnum.NUM_HEADS]),
    ("Seq Len:      ", checkpoint[CheckpointEnum.SEQ_LEN]),
    ("FF Hidden:    ", checkpoint[CheckpointEnum.FF_HIDDEN_DIM]),
    ("Dropout:      ", checkpoint[CheckpointEnum.DROPOUT]),
    # Parameter count rendered with "." as the thousands separator.
    ("num_params:   ", f"{checkpoint['num_params']:,}".replace(",", ".")),
]

for label, value in summary_rows:
    print(f"{label}{value}")

path: /Users/till/workspaces/The-TruFiTi-Group/tokenizer/bpe_hugging_face_tokenizer.json
Loaded BPE hugging face tokenizer - vocab size: 3000


# Loading Tokenizer

In [20]:
from pathlib import Path

from src.utils import load_bpe_hugging_face_tokenizer

# Locate the tokenizer file relative to the parent of the current working
# directory, then load it with the project helper.
BASE_DIR = Path.cwd().parent
TOKENIZER_DIR = BASE_DIR.joinpath("tokenizer")
path = TOKENIZER_DIR.joinpath("bpe_hugging_face_tokenizer.json")

tokenizer = load_bpe_hugging_face_tokenizer(path)

Loaded BPE hugging face tokenizer - vocab size: 3000


In [21]:
from pprint import pprint

# Vocabulary of the loaded tokenizer, printed as (token, id) pairs in
# ascending id order.
vocab = tokenizer.get_vocab()
tokens_by_id = sorted(vocab, key=vocab.get)
pprint([(token, vocab[token]) for token in tokens_by_id])

[('[PAD]', 0),
 ('!', 1),
 ('"', 2),
 ("'", 3),
 ('(', 4),
 (')', 5),
 (',', 6),
 ('-', 7),
 ('.', 8),
 ('0', 9),
 ('1', 10),
 ('2', 11),
 ('3', 12),
 ('4', 13),
 ('5', 14),
 ('6', 15),
 ('7', 16),
 ('8', 17),
 ('9', 18),
 (':', 19),
 (';', 20),
 ('?', 21),
 ('A', 22),
 ('B', 23),
 ('C', 24),
 ('D', 25),
 ('E', 26),
 ('F', 27),
 ('G', 28),
 ('H', 29),
 ('I', 30),
 ('J', 31),
 ('K', 32),
 ('L', 33),
 ('M', 34),
 ('N', 35),
 ('O', 36),
 ('P', 37),
 ('Q', 38),
 ('R', 39),
 ('S', 40),
 ('T', 41),
 ('U', 42),
 ('V', 43),
 ('W', 44),
 ('X', 45),
 ('Y', 46),
 ('Z', 47),
 ('a', 48),
 ('b', 49),
 ('c', 50),
 ('d', 51),
 ('e', 52),
 ('f', 53),
 ('g', 54),
 ('h', 55),
 ('i', 56),
 ('j', 57),
 ('k', 58),
 ('l', 59),
 ('m', 60),
 ('n', 61),
 ('o', 62),
 ('p', 63),
 ('q', 64),
 ('r', 65),
 ('s', 66),
 ('t', 67),
 ('u', 68),
 ('v', 69),
 ('w', 70),
 ('x', 71),
 ('y', 72),
 ('z', 73),
 ('¨', 74),
 ('©', 75),
 ('¯', 76),
 ('Â', 77),
 ('â', 78),
 ('Ġ', 79),
 ('Ģ', 80),
 ('Ĥ', 81),
 ('ħ', 82),
 ('ł', 83)

# Loading Data

In [11]:
from pathlib import Path

# Parent of the current working directory; the dataset path is resolved
# against it.
BASE_DIR = Path.cwd().parent

# Dataset file that the next cell reads the reviews from.
path = BASE_DIR.joinpath("data", "letterboxd_filtered_short_synopsis_film.jsonl")

In [12]:
from src.utils.data_loader import read_file_only_reviews

# Load the reviews from the dataset file. Later cells index and iterate `text`
# as a sequence of individual review strings (presumably a list - verify
# against read_file_only_reviews).
text = read_file_only_reviews(path)

Number of reviews: 979.890


In [8]:
# Show the first three reviews, one per line.
for review_index in range(3):
    print(text[review_index])

as soon as this film ended i went online and enlisted in the US army. no child will ever suffer like this again on my watch
Come and See is a film I find almost impossible to review. Describing watching a film as an 'experience' often detracts from the quality of the piece, but going by the profound effect the film had on me I really feel no other word can do it justice.
What a horrible nightmare!


## Analysing the Data

In [14]:
from collections import Counter

# Tally identical review texts; the cell displays the ten most frequent ones
# together with their counts.
review_counts = Counter(text)
review_counts.most_common(10)

[('as soon as this film ended i went online and enlisted in the US army. no child will ever suffer like this again on my watch',
  1),
 ("Come and See is a film I find almost impossible to review. Describing watching a film as an 'experience' often detracts from the quality of the piece, but going by the profound effect the film had on me I really feel no other word can do it justice.",
  1),
 ('What a horrible nightmare!', 1),
 ("(guy who's still buzzing from Spider-Man: Across the Spider-Verse)", 1),
 ("apparently elem klimov wanted to name this filmkill hitlerand i think that's beautiful",
  1),
 ('this makes other WWII movieslook like a ride at disney world', 1),
 ('An apocalyptic nightmare of pure brutalizing evil shot and sequenced in intimate, hyperreal historical detail. Unbearable extreme close-ups of its witnesses and beautiful roaming camera moves (ensuring we see even what the characters maybe miss) that in tandem form a series of vivid, unsparing, and surreal vignettes aro

In [17]:
target_text = "This review may contain spoilers.I can handle the truth."

print(f"Searching for occurrences of: '{target_text}'\n")

# Lazily yields (index, review) for every review that equals the target exactly.
exact_matches = ((index, review) for index, review in enumerate(text) if review == target_text)

# Pairing with range(1, 6) stops the scan right after the fifth hit and leaves
# `count` at the number of hits reported (it stays 0 when there is none).
count = 0
for count, (index, review) in zip(range(1, 6), exact_matches):
    print(f"Found at Index {index}: {review}")

print(f"count: {count}")

Searching for occurrences of: 'This review may contain spoilers.I can handle the truth.'

count: 0


In [19]:
import re

# Merge every review into one lower-cased string so counting ignores case.
full_text = " ".join(text).lower()

# A word is any maximal run of \w characters.
words = re.compile(r"\w+").findall(full_text)

word_counts = Counter()
word_counts.update(words)

# Ten most frequent words; counts use "." as the thousands separator.
for word, count in word_counts.most_common(10):
    line = f"{word}: {count:,}".replace(",", ".")
    print(line)

the: 1.338.014
a: 911.502
and: 751.059
of: 706.834
to: 582.759
i: 530.041
it: 470.023
this: 429.397
in: 413.635
is: 410.648
