## Q1: Regex-Based Text Tokenization

In [14]:
import re
# Load the raw corpus used throughout the rest of the notebook.
with open('data/ch2_pset_text.txt', 'r', encoding='utf-8') as f:
  raw_text = f.read()

# Split on punctuation, whitespace, and double dashes. The capture group makes
# re.split keep the separators in the result, so punctuation becomes its own token.
# The earlier pattern ended in `--...`, which also swallowed the three characters
# that follow a double dash (each `.` is a regex wildcard); only `--` is a separator.
process_text = re.split(r'([,.:;?_!"()\s]|--)', raw_text)
# Strip each piece and drop the ones that are empty or whitespace-only.
process_text = [item.strip() for item in process_text if item.strip()]
print(process_text)


['In', 'a', 'diplomatic', 'cable', 'sent', 'July', '8', ',', 'embassy', 'Charge', 'd’Affairs', 'David', 'Greene', 'asked', 'whether', 'the', 'embassy', 'could', 'process', 'claims', 'from', 'other', 'minority', 'groups', 'claiming', 'race-based', 'discrimination', 'such', 'as', '"', 'coloured', '"', 'South', 'Africans', 'who', 'speak', 'Afrikaans', '.', 'In', 'South', 'Africa', 'the', 'term', 'coloured', 'refers', 'to', 'mixed-raced', 'people', ',', 'a', 'classification', 'created', 'by', 'the', 'apartheid', 'regime', 'still', 'in', 'use', 'today', '.', 'The', 'answer', 'came', 'back', 'days', 'later', 'in', 'an', 'email', 'from', 'Spencer', 'Chretien', ',', 'the', 'highest-ranking', 'official', 'in', 'the', 'State', "Department's", 'refugee', 'and', 'migration', 'bureau', ',', 'saying', 'the', 'program', 'is', 'intended', 'for', 'white', 'people', '.', 'Reuters', 'was', 'unable', 'to', 'independently', 'verify', 'the', 'precise', 'language', 'in', 'the', 'email', 'which', 'was', 'desc

## Q2: Build Tokenizer Class

### Make Vocabulary

In [16]:
# Build the vocabulary: every distinct token, in sorted order, followed by the
# special tokens, which are appended after the sorted words.
alphabetical_text = [
    *sorted(set(process_text)),
    "<|endoftext|>",
    "<|unk|>",
]

# Map each entry to its position in the list; that position is the token id.
vocabulary = dict(zip(alphabetical_text, range(len(alphabetical_text))))

print(vocabulary)

{'"': 0, ',': 1, '.': 2, '8': 3, 'Africa': 4, 'Africans': 5, 'Afrikaans': 6, 'Charge': 7, 'Chretien': 8, 'David': 9, "Department's": 10, 'Greene': 11, 'In': 12, 'July': 13, 'Reuters': 14, 'South': 15, 'Spencer': 16, 'State': 17, 'The': 18, 'a': 19, 'agency': 20, 'an': 21, 'and': 22, 'answer': 23, 'apartheid': 24, 'as': 25, 'asked': 26, 'back': 27, 'bureau': 28, 'by': 29, 'cable': 30, 'came': 31, 'claiming': 32, 'claims': 33, 'classification': 34, 'coloured': 35, 'contents': 36, 'could': 37, 'created': 38, 'days': 39, 'described': 40, 'diplomatic': 41, 'discrimination': 42, 'd’Affairs': 43, 'email': 44, 'embassy': 45, 'familiar': 46, 'for': 47, 'from': 48, 'groups': 49, 'highest-ranking': 50, 'in': 51, 'independently': 52, 'intended': 53, 'is': 54, 'its': 55, 'language': 56, 'later': 57, 'migration': 58, 'minority': 59, 'mixed-raced': 60, 'news': 61, 'official': 62, 'other': 63, 'people': 64, 'precise': 65, 'process': 66, 'program': 67, 'race-based': 68, 'refers': 69, 'refugee': 70, 're

### Make Class


In [21]:
class Tokenizer:
  """Word-level tokenizer backed by a fixed word -> id vocabulary.

  Text is split on punctuation, whitespace, and double dashes; separators other
  than whitespace are kept as tokens of their own.
  """

  # The capture group makes split() return the separators as well as the words.
  # Only `--` itself is a separator: a trailing `...` in the pattern would be three
  # regex wildcards and would swallow the characters following the dashes.
  _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\s]|--)')

  # Vocabulary entry used for words that are not in the vocabulary.
  _UNKNOWN_TOKEN = "<|unk|>"

  def __init__(self, vocabulary):
    """Store the vocabulary and build the reverse (id -> word) lookup.

    Args:
      vocabulary: mapping from token string to integer id.
    """
    self.word_to_token = vocabulary
    self.token_to_word = {token: word for word, token in vocabulary.items()}

  def encode(self, text) -> list[int]:
    """Convert text into a list of token ids.

    Words missing from the vocabulary are mapped to the id of "<|unk|>" when the
    vocabulary contains that entry.

    Args:
      text: the string to tokenize.

    Returns:
      One id per token, in order of appearance.

    Raises:
      KeyError: if a word is missing and the vocabulary has no "<|unk|>" entry.
    """
    pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
    # Whitespace separators and empty split artifacts strip down to "" and are dropped.
    words = [piece for piece in pieces if piece]

    unknown_id = self.word_to_token.get(self._UNKNOWN_TOKEN)
    if unknown_id is None:
      # No fallback available: keep the strict lookup so a missing word is reported.
      return [self.word_to_token[word] for word in words]
    return [self.word_to_token.get(word, unknown_id) for word in words]

  def decode(self, encoded:list[int]) -> list[str]:
    """Convert token ids back into their token strings.

    Args:
      encoded: token ids as produced by encode().

    Returns:
      The token strings, one per id (not joined into a single string).

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    return [self.token_to_word[token] for token in encoded]


# Round-trip check: encode the corpus to ids, then decode the ids back to tokens.
tokenizer = Tokenizer(vocabulary)
encoded_raw_text = tokenizer.encode(raw_text)
print(encoded_raw_text)
print(tokenizer.decode(encoded_raw_text))

[12, 19, 41, 30, 73, 13, 3, 1, 45, 7, 43, 9, 11, 26, 87, 79, 45, 37, 66, 33, 48, 63, 59, 49, 32, 68, 42, 77, 25, 0, 35, 0, 15, 5, 90, 75, 6, 2, 12, 15, 4, 79, 78, 35, 69, 81, 60, 64, 1, 19, 34, 38, 29, 79, 24, 71, 76, 51, 84, 82, 2, 18, 23, 31, 27, 39, 57, 51, 21, 44, 48, 16, 8, 1, 79, 50, 62, 51, 79, 17, 10, 70, 22, 58, 28, 1, 72, 79, 67, 54, 53, 47, 89, 64, 2, 14, 86, 83, 81, 52, 85, 79, 65, 56, 51, 79, 44, 88, 86, 40, 81, 79, 61, 20, 29, 80, 74, 46, 91, 55, 36, 2]
['In', 'a', 'diplomatic', 'cable', 'sent', 'July', '8', ',', 'embassy', 'Charge', 'd’Affairs', 'David', 'Greene', 'asked', 'whether', 'the', 'embassy', 'could', 'process', 'claims', 'from', 'other', 'minority', 'groups', 'claiming', 'race-based', 'discrimination', 'such', 'as', '"', 'coloured', '"', 'South', 'Africans', 'who', 'speak', 'Afrikaans', '.', 'In', 'South', 'Africa', 'the', 'term', 'coloured', 'refers', 'to', 'mixed-raced', 'people', ',', 'a', 'classification', 'created', 'by', 'the', 'apartheid', 'regime', 'sti

## Q4: Creating Training Sequences with Dataset Class

In [23]:
#(self, txt, tokenizer, max_length, stride)
import torch
from torch.utils.data import Dataset, DataLoader

# NOTE: this class deliberately keeps the name `Dataset`, which shadows the imported
# torch base class once the definition has run; the name is kept so existing
# references to it continue to work.
class Dataset(Dataset):
  """Sliding-window next-token dataset built from tokenized text.

  Each sample is a pair (input, label) of 1-D tensors of length `max_length`,
  where label is the input window shifted one token to the right.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    """Tokenize `txt` and cut it into overlapping input/label windows.

    Args:
      txt: the text to build samples from.
      tokenizer: object whose encode(text) returns a list of integer ids.
      max_length: number of tokens in every input and label window.
      stride: number of tokens to advance between consecutive windows.

    Raises:
      AssertionError: if the encoded text is not longer than `max_length`.
    """
    self.input = []
    self.label = []

    # Encode the text that was passed in, not a notebook-level global.
    encoded_text = tokenizer.encode(txt)
    assert len(encoded_text) > max_length, (
        "encoded text must contain more than max_length tokens"
    )

    # Stopping at len - max_length leaves room for the label window, which
    # extends one token past the end of the input window.
    for i in range(0, len(encoded_text) - max_length, stride):
      inputs = encoded_text[i : i + max_length]
      labels = encoded_text[i + 1 : (i+1) + max_length]
      self.input.append(torch.tensor(inputs))
      self.label.append(torch.tensor(labels))

  def __len__(self) -> int:
    """Return the number of (input, label) samples."""
    return len(self.input)

  def __getitem__(self, index):
    """Return the (input, label) tensor pair stored at `index`."""
    return (self.input[index], self.label[index])

  def get_length(self) -> int:
    """Return the number of samples; same as len(self)."""
    return len(self)

  def get_item(self, index):
    """Return the sample at `index`; same as self[index]."""
    return self[index]

