# Stage 1

## Part A – TinyGPT Environment Setup

This part of the notebook initializes the TinyGPT environment: it imports the core libraries,
checks GPU availability, prints library versions for reproducibility, and runs a quick tokenizer round-trip test.

---


In [None]:
import sys
import os
import platform
import torch
import numpy as np
import tiktoken

import pandas as pd
import matplotlib.pyplot as plt


def _env_summary():
    """Report interpreter, operating-system and library versions on stdout.

    Prints the Python version and the platform name/release, followed by
    the Torch, NumPy and tiktoken versions. The pandas version is added
    only when pandas is already present in ``sys.modules``.
    """
    rule = "-" * 50
    print("🔧 Environment Summary", rule, sep="\n")

    # Interpreter and operating system.
    py_version = platform.python_version()
    print(f"Python version : {py_version}")
    os_name = platform.system()
    os_release = platform.release()
    print(f"Platform       : {os_name} {os_release}")
    print()

    # Core libraries, reported one at a time in a fixed order.
    torch_version = torch.__version__
    print(f"Torch version  : {torch_version}")
    numpy_version = np.__version__
    print(f"NumPy version  : {numpy_version}")
    tiktoken_version = tiktoken.__version__
    print(f"Tiktoken ver.  : {tiktoken_version}")

    # pandas is optional here: only report it if it has been imported.
    pandas_loaded = "pandas" in sys.modules
    if pandas_loaded:
        pandas_version = pd.__version__
        print(f"Pandas version : {pandas_version}")

    print(rule)


def _cuda_info():
    """Print whether CUDA is usable and, if so, details of device 0.

    When ``torch.cuda.is_available()`` is true, the CUDA version reported
    by Torch, the name of device 0 and its total memory (bytes divided by
    1e9, shown as GB) are printed. Otherwise a single line states that the
    code is running on CPU.
    """
    rule = "-" * 50
    print("⚙️  CUDA & GPU Information")
    print(rule)

    if not torch.cuda.is_available():
        print("❌ CUDA not available — running on CPU")
    else:
        cuda_version = torch.version.cuda
        print(f"✅ CUDA available: {cuda_version}")
        gpu_name = torch.cuda.get_device_name(0)
        print(f"🧠 GPU name      : {gpu_name}")
        # total_memory is in bytes; 1e9 converts to decimal gigabytes.
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"💽 Total memory  : {total_gb:.2f} GB")

    print(rule)


def _test_tokenizer(sample_text="Once upon a time in TinyGPT..."):
    """Round-trip a sample string through the tiktoken "gpt2" encoding.

    Prints the input text, the tokens produced by ``encode`` and the text
    recovered by ``decode`` so the three can be compared by eye.

    Args:
        sample_text (str): Text to encode and then decode.
    """
    rule = "-" * 50
    print("🔤 Tokenizer Test (tiktoken)")
    print(rule)

    gpt2_encoding = tiktoken.get_encoding("gpt2")
    token_ids = gpt2_encoding.encode(sample_text)

    print(f"Input text : {sample_text}")
    print(f"Tokens     : {token_ids}")
    round_trip = gpt2_encoding.decode(token_ids)
    print(f"Decoded    : {round_trip}")

    print(rule)


# Main function
def initialize_tinygpt_env():
    """Run the environment report, the CUDA check and the tokenizer test.

    The three checks run in that order; a success message is printed once
    all of them have returned.
    """
    setup_checks = (_env_summary, _cuda_info, _test_tokenizer)
    for run_check in setup_checks:
        run_check()
    print("✅ TinyGPT environment initialized successfully!")

In [7]:
# Run the version report, CUDA check and tokenizer test in sequence.
initialize_tinygpt_env()

🔧 Environment Summary
--------------------------------------------------
Python version : 3.13.7
Platform       : Windows 11

Torch version  : 2.9.0+cu130
NumPy version  : 2.3.3
Tiktoken ver.  : 0.12.0
Pandas version : 2.3.3
--------------------------------------------------
⚙️  CUDA & GPU Information
--------------------------------------------------
✅ CUDA available: 13.0
🧠 GPU name      : NVIDIA GeForce RTX 4070 Laptop GPU
💽 Total memory  : 8.59 GB
--------------------------------------------------
🔤 Tokenizer Test (tiktoken)
--------------------------------------------------
Input text : Once upon a time in TinyGPT...
Tokens     : [7454, 2402, 257, 640, 287, 20443, 38, 11571, 986]
Decoded    : Once upon a time in TinyGPT...
--------------------------------------------------
✅ TinyGPT environment initialized successfully!


## Part B – Load TinyGPT Data

- In this section, we load a plain-text dataset that will serve as the source for our tokens.
- `The_Verdict.txt` is the text recommended by the reference book for initial experiments.
- Later, we can extend or replace this dataset with any other dataset from Hugging Face, depending on our needs.

---


In [None]:
def load_text_file(
    file_path: str = "../The_Verdict.txt",
    *,
    encoding: str = "utf-8",
    preview_chars: int = 100,
) -> str:
    """
    Load a text file and return its content as a string.

    On success a short report is printed: the path, the total number of
    characters and, unless disabled, a preview of the start of the text.

    Args:
        file_path (str): Path to the text file.
        encoding (str): Text encoding used to decode the file.
        preview_chars (int): Number of leading characters shown in the
            preview. A value of 0 or less suppresses the preview.

    Returns:
        str: Raw text content of the file, or an empty string if the file
        does not exist (an error message is printed in that case).
    """
    # Keep the try block limited to the read itself so that only a missing
    # file is translated into the "" result.
    try:
        with open(file_path, "r", encoding=encoding) as f:
            raw_text = f.read()
    except FileNotFoundError:
        # Best-effort contract: report the problem and return "" rather
        # than raising, so the caller decides how to proceed.
        print(f"Error: File '{file_path}' not found.")
        return ""

    print(f"Loaded '{file_path}' successfully!")
    print(f"Total number of characters: {len(raw_text)}")
    if preview_chars > 0:
        print(f"Preview (first {preview_chars} chars):\n{raw_text[:preview_chars]}")
    return raw_text

In [12]:
# Read the default file ("../The_Verdict.txt"); load_text_file returns "" if it is missing.
raw_text = load_text_file()

Loaded '../The_Verdict.txt' successfully!
Total number of characters: 20479
Preview (first 100 chars):
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g
