In [1]:
from datasets import load_dataset  # huggingface data loading function
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn as nn
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Choose the compute device, preferring Apple MPS, then CUDA, then falling back to CPU.
# The conditional expression short-circuits, so CUDA is only probed when MPS is unavailable.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [3]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one raw text sample per access.

    Args:
        data: Indexable, sized collection of text samples; ``data[index]`` is
            handed to the tokenizer unchanged.
        tokenizer: Callable accepting ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of text samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the sample at ``index``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"``, each with the
            leading batch axis removed.
        """
        tokenized = self.tokenizer(
            self.data[index],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Assumes the tokenizer returns (1, max_length) tensors for one sample.
        # squeeze(0) removes only that batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1, producing 0-d
        # tensors that no longer collate into (batch, seq) batches.
        return {
            "input_ids": tokenized["input_ids"].squeeze(0),
            "attention_mask": tokenized["attention_mask"].squeeze(0),
        }
    
def get_wikitext_data(tokenizer_name="gpt2", max_length=512):
    """Build tokenizing datasets for the WikiText-2 (raw) train/validation/test splits.

    Args:
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
        max_length: Padding/truncation length forwarded to each ``TextDataset``.

    Returns:
        Tuple ``(train_data, valid_data, test_data)`` of ``TextDataset`` objects
        sharing one tokenizer.
    """
    splits = load_dataset("wikitext", "wikitext-2-raw-v1")
    texts_per_split = [splits[name]["text"] for name in ("train", "validation", "test")]

    tok = AutoTokenizer.from_pretrained(tokenizer_name, clean_up_tokenization_spaces=True)
    # Reuse the end-of-sequence token as the padding token so padding="max_length" works.
    tok.pad_token = tok.eos_token

    return tuple(
        TextDataset(texts, tok, max_length=max_length) for texts in texts_per_split
    )


def get_char_mappings(tokenizer_name="gpt2"):
    """Return the token-to-id and id-to-token lookup tables of a pretrained tokenizer.

    Args:
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Tuple ``(char_to_int, int_to_char)``: the tokenizer's vocabulary dict
        (token -> id) and its inverse (id -> token).
    """
    tok = AutoTokenizer.from_pretrained(tokenizer_name, clean_up_tokenization_spaces=True)
    # Reuse the end-of-sequence token as the padding token.
    tok.pad_token = tok.eos_token

    token_to_id = tok.get_vocab()
    # Invert the vocabulary by pairing each id with its token.
    id_to_token = dict(zip(token_to_id.values(), token_to_id.keys()))

    return token_to_id, id_to_token


In [4]:
# Build the three WikiText-2 TextDataset splits using the function defaults
# (tokenizer_name="gpt2", max_length=512).
train_data, valid_data, test_data = get_wikitext_data()

In [5]:
# Pull one shuffled mini-batch from the training split and report its layout.
train_loader = DataLoader(train_data, batch_size=2, shuffle=True)
sample_batch = next(iter(train_loader))

print("Sample batch keys:", sample_batch.keys())
for field in ("input_ids", "attention_mask"):
    print(f"Sample {field} batch shape:", sample_batch[field].shape)

Sample batch keys: dict_keys(['input_ids', 'attention_mask'])
Sample input_ids batch shape: torch.Size([2, 512])
Sample attention_mask batch shape: torch.Size([2, 512])


In [6]:
# Load the GPT-2 tokenizer; the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2", clean_up_tokenization_spaces=True)
tokenizer.pad_token = tokenizer.eos_token

# Summarize the vocabulary size and the special tokens the tokenizer defines.
for label, value in (
    ("Vocabulary size:", tokenizer.vocab_size),
    ("Special tokens:", tokenizer.all_special_tokens),
    ("Special tokens mapping:", tokenizer.special_tokens_map),
):
    print(label, value)


Vocabulary size: 50257
Special tokens: ['<|endoftext|>']
Special tokens mapping: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


In [7]:
# Show the tokens stored under the first ten vocabulary ids (0-9).
print("Sample vocabulary tokens:")
first_tokens = tokenizer.convert_ids_to_tokens(list(range(10)))
for token_id, token in enumerate(first_tokens):
    print(f"Token ID {token_id}: {token}")


Sample vocabulary tokens:
Token ID 0: !
Token ID 1: "
Token ID 2: #
Token ID 3: $
Token ID 4: %
Token ID 5: &
Token ID 6: '
Token ID 7: (
Token ID 8: )
Token ID 9: *


In [8]:
# Tokenize a few probes: single symbols, a lone space, and a full sentence.
test_characters = ["@", ".", " ", "Hello, this is a test."]
for char in test_characters:
    tokens = tokenizer.tokenize(char)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # One print call, one line per field; the output matches three separate prints.
    print(
        f"Character(s): '{char}'",
        f"  Tokens: {tokens}",
        f"  Token IDs: {token_ids}",
        sep="\n",
    )


Character(s): '@'
  Tokens: ['@']
  Token IDs: [31]
Character(s): '.'
  Tokens: ['.']
  Token IDs: [13]
Character(s): ' '
  Tokens: ['Ġ']
  Token IDs: [220]
Character(s): 'Hello, this is a test.'
  Tokens: ['Hello', ',', 'Ġthis', 'Ġis', 'Ġa', 'Ġtest', '.']
  Token IDs: [15496, 11, 428, 318, 257, 1332, 13]


In [9]:
# Round-trip one sentence: text -> tokens -> ids -> decoded text.
sample_text = "Hello, this is a test."
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
reconstructed_text = tokenizer.decode(token_ids)

for label, value in (
    ("Original Text:", sample_text),
    ("Tokens:", tokens),
    ("Token IDs:", token_ids),
    ("Reconstructed Text:", reconstructed_text),
):
    print(label, value)


Original Text: Hello, this is a test.
Tokens: ['Hello', ',', 'Ġthis', 'Ġis', 'Ġa', 'Ġtest', '.']
Token IDs: [15496, 11, 428, 318, 257, 1332, 13]
Reconstructed Text: Hello, this is a test.


In [10]:
# Testing unusual characters: check how the tokenizer splits a run of
# repeated symbols followed by punctuation.
unusual_text = "@@@@@ ..."
tokens = tokenizer.tokenize(unusual_text)
print("Tokens for unusual characters:", tokens)


Tokens for unusual characters: ['@@@@', '@', 'Ġ...']


In [11]:
# Load the raw (untokenized) WikiText-2 text columns for direct inspection.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train_texts, valid_texts, test_texts = (
    dataset[split]["text"] for split in ("train", "validation", "test")
)

In [12]:
# Number of entries in the id -> token table (index 1 of the returned pair).
len(get_char_mappings(tokenizer_name="gpt2")[1])

50257

In [13]:
# Number of entries in the token -> id table (index 0 of the returned pair).
len(get_char_mappings(tokenizer_name="gpt2")[0])

50257

In [14]:
# tokenizer.encode(train_texts[4])

In [15]:
# Print the index and text of every training line that mentions "photon".
for n, text in enumerate(train_texts):
    if "photon" not in text:
        continue
    print(n, text, sep="\n")

19372
 When xenon atoms are at their ground energy state , they repel each other and will not form a bond . When xenon atoms becomes energized , however , they can form an excimer ( excited dimer ) until the electrons return to the ground state . This entity is formed because the xenon atom tends to fill its outermost electronic shell , and can briefly do this by adding an electron from a neighboring xenon atom . The typical lifetime of a xenon excimer is 1 – 5 ns , and the decay releases photons with wavelengths of about 150 and 173 nm . Xenon can also form excimers with other elements , such as the halogens bromine , chlorine and fluorine . 

19385
 The individual cells in a plasma display use a mixture of xenon and neon that is converted into a plasma using electrodes . The interaction of this plasma with the electrodes generates ultraviolet photons , which then excite the phosphor coating on the front of the display . 

19413
 Gamma emission from the radioisotope 133Xe of xenon can

In [16]:
# tokenizer.encode(train_texts[87])

In [17]:
# train_data[4]

In [18]:
# Check which token id(s) the tokenizer assigns to the '�' character.
tokenizer.encode("�")

[4210]

In [19]:
# Decode a single token id back to text to see what it renders as.
tokenizer.decode(4210)

'�'

In [20]:
# Inspect one training batch to check the shifted input/target layout used for
# next-token language modeling.
# A single batch is fetched with next(iter(...)) instead of a tqdm loop that
# breaks on its first iteration; that loop left stale progress bars interleaved
# with the printed tensors.
batch = next(iter(train_loader))["input_ids"].to(device)

# Shift by one position: input_ids drops the last token and target_ids drops
# the first, so target_ids[:, t] is the token that follows input_ids[:, t].
input_ids = batch[:, :-1]
target_ids = batch[:, 1:]

print("input_ids:", input_ids)
print("input ids shape", input_ids.shape)
# .tolist() converts the whole row in one call instead of calling .item() per token.
# `tokenizer` was created with clean_up_tokenization_spaces=True, so no
# per-call override is passed to decode.
print("decoded input_ids", tokenizer.decode(input_ids[0].tolist()))
print("target_ids:", target_ids)
print("decoded target_ids", tokenizer.decode(target_ids[0].tolist()))

Training:   0%|                                                               | 0/18359 [00:00<?, ?it/s]

input_ids: tensor([[50256, 50256, 50256,  ..., 50256, 50256, 50256],
        [ 5053,   907,  2540,  ..., 50256, 50256, 50256]], device='mps:0')
input ids shape torch.Size([2, 511])


Training:   0%|                                                               | 0/18359 [00:00<?, ?it/s]

decoded input_ids <|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endof




In [37]:
def get_openwebtext_data(tokenizer_name="gpt2", max_length=512):
    """Build a tokenizing dataset over the train split of ``Bingsu/openwebtext_20p``.

    Only the ``"train"`` split is read; no validation or test dataset is built.

    Args:
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
        max_length: Padding/truncation length forwarded to ``TextDataset``.

    Returns:
        A single ``TextDataset`` wrapping the training texts.
    """
    corpus = load_dataset("Bingsu/openwebtext_20p")
    documents = corpus["train"]["text"]

    tok = AutoTokenizer.from_pretrained(tokenizer_name, clean_up_tokenization_spaces=True)
    # Reuse the end-of-sequence token as the padding token so padding="max_length" works.
    tok.pad_token = tok.eos_token

    return TextDataset(documents, tok, max_length=max_length)

In [38]:
# Build the OpenWebText training dataset (train split only), tokenized with
# GPT-2 and padded/truncated to 512 tokens.
train = get_openwebtext_data(tokenizer_name="gpt2", max_length=512)

In [40]:
# Pull one shuffled mini-batch from the OpenWebText dataset and report its layout.
# This rebinds `train_loader` and `sample_batch` to the OpenWebText data.
train_loader = DataLoader(train, batch_size=2, shuffle=True)
sample_batch = next(iter(train_loader))

print("Sample batch keys:", sample_batch.keys())
for field in ("input_ids", "attention_mask"):
    print(f"Sample {field} batch shape:", sample_batch[field].shape)

Sample batch keys: dict_keys(['input_ids', 'attention_mask'])
Sample input_ids batch shape: torch.Size([2, 512])
Sample attention_mask batch shape: torch.Size([2, 512])


In [41]:
# Load the raw (untokenized) OpenWebText training texts for direct inspection.
# NOTE(review): this rebinds `dataset` and `train_texts`, which an earlier cell
# bound to the WikiText-2 data; cells run after this point see OpenWebText.
dataset = load_dataset("Bingsu/openwebtext_20p")
train_texts = dataset["train"]["text"]

In [43]:
# Print the index and text of the first training document that mentions "photon".
for n, text in enumerate(train_texts):
    if "photon" not in text:
        continue
    print(n, text, sep="\n")
    break

24018
In our homes, we spend an incredible amount of energy heating our water and that’s sort of ironic given that we have so many photons and so much sunshine that will heat the water for us for free. And so an easy option for those homes that have rooftops that are not shaded is to get solar water heaters installed. My household had its solar water heater installed in the ‘80s, and it’s still working great and it saves us a ton of money and it helps reduce our energy consumption overall. It works great. So that’s an easy solution, an easy option for a lot of people.
