In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import re


In [7]:
df = pd.read_csv('Articles.csv', encoding='ISO-8859-1')
print(df.head())


                                             Article      Date  \
0  KARACHI: The Sindh government has decided to b...  1/1/2015   
1  HONG KONG: Asian markets started 2015 on an up...  1/2/2015   
2  HONG KONG:  Hong Kong shares opened 0.66 perce...  1/5/2015   
3  HONG KONG: Asian markets tumbled Tuesday follo...  1/6/2015   
4  NEW YORK: US oil prices Monday slipped below $...  1/6/2015   

                                             Heading  NewsType  
0  sindh govt decides to cut public transport far...  business  
1                    asia stocks up in new year trad  business  
2           hong kong stocks open 0.66 percent lower  business  
3             asian stocks sink euro near nine year   business  
4                 us oil prices slip below 50 a barr  business  


In [13]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.replace('\r\n', ' ').replace('\n', ' ')  # Replace \r\n and \n with a space
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.lower()  # Convert to lowercase
    return text

cleaned_articles = df['Article'].apply(clean_text).tolist()  # Apply clean_text to each article
print(cleaned_articles[:5])


['karachi the sindh government has decided to bring down public transport fares by per cent due to massive reduction in petroleum product prices by the federal government geo news reportedsources said reduction in fares will be applicable on public transport rickshaw taxi and other means of travelingmeanwhile karachi transport ittehad kti has refused to abide by the government decisionkti president irshad bukhari said the commuters are charged the lowest fares in karachi as compare to other parts of the country adding that pc vehicles run on compressed natural gas cng bukhari said karachi transporters will cut fares when decrease in cng prices will be made ', 'hong kong asian markets started on an upswing in limited trading on friday with mainland chinese stocks surging in hong kong on speculation beijing may ease monetary policy to boost slowing growthhong kong rose percent closing points higher at seoul closed up percent rising points to while sydney gained percent or points to close

In [16]:
df = pd.read_csv("Articles.csv", encoding='ISO-8859-1') 

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.replace('\r\n', ' ').replace('\n', ' ')  # Replace \r\n and \n with a space
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.lower()  # Convert to lowercase
    return text

cleaned_articles = df['Article'].head(100).apply(clean_text).tolist()

with open("cleaned_articles.txt", "w", encoding='utf-8') as f:
    for article in cleaned_articles:
        f.write(article + "\n\n")

print("Cleaned articles have been saved to 'cleaned_articles.txt'.")


Cleaned articles have been saved to 'cleaned_articles.txt'.


In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import argparse
import os

# Set device to MPS, CUDA, or CPU
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

# Model components (Head, MultiHeadAttention, FeedForward, Block)
class Head(nn.Module):
    def __init__(self, head_size, n_embd, context_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embd, context_size, dropout=0.1):
        super().__init__()
        self.heads = nn.ModuleList([
            Head(head_size, n_embd, context_size, dropout) for _ in range(num_heads)
        ])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return self.dropout(out)

class FeedForward(nn.Module):
    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 2 * n_embd),
            nn.ReLU(),
            nn.Linear(2 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head, context_size, dropout=0.1):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, context_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embd=128, context_size=128, n_head=4, n_layer=4, dropout=0.1):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, context_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None

        return logits, loss

# Get batch function
def get_batch(split, batch_size, context_size, train_data, val_data):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - context_size, (batch_size,))
    x = torch.stack([data_split[i:i + context_size] for i in ix])
    y = torch.stack([data_split[i + 1:i + context_size + 1] for i in ix])
    return x.to(device), y.to(device)

# Estimate loss function
def estimate_loss(model, batch_size, context_size, eval_iters, train_data, val_data):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, batch_size, context_size, train_data, val_data)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out

# Training loop
def train(model, steps, batch_size, context_size, lr=3e-4, report_frequency=500, train_data, val_data, checkpoint_path=None):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model.train()
    for step in range(steps):
        xb, yb = get_batch('train', batch_size, context_size, train_data, val_data)
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # Print loss and monitor training/validation progress
        if step % report_frequency == 0 or step == steps - 1:
            losses = estimate_loss(model, batch_size, context_size, 100, train_data, val_data)
            print(f"Step {step}, train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # Save model checkpoint
    if checkpoint_path:
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Model checkpoint saved to {checkpoint_path}")

# Text generation function
def generate(model, start_idx, context_size, number_of_tokens, device, temperature=1.0):
    model.eval()  # Set the model to evaluation mode
    idx = start_idx

    for _ in range(number_of_tokens):
        # Crop to last `context_size` tokens
        idx_cond = idx[:, -context_size:]
        
        # Forward pass: Get logits
        logits, _ = model(idx_cond)
        
        # Apply softmax to logits to get probabilities (for sampling or argmax)
        probs = F.softmax(logits[:, -1, :], dim=-1)  # Use only the last token's logits
        
        # Sample the next token (e.g., take the token with the highest probability)
        next_token = torch.multinomial(probs, 1)  # Alternatively, use `torch.argmax(probs, dim=-1)`
        
        # Append the new token to the sequence
        idx = torch.cat([idx, next_token], dim=1)

    return idx

# Main function to parse arguments
def main():
    parser = argparse.ArgumentParser(description="Train, Evaluate, and Generate with a Transformer Language Model")
    parser.add_argument('--input', type=str, help="Path to input text file")
    parser.add_argument('--train', action='store_true', help="Train the model")
    parser.add_argument('--eval', action='store_true', help="Generate text using a trained model")
    parser.add_argument('--checkpoint', type=str, help="Path to save/load model checkpoint")
    parser.add_argument('--context_size', type=int, default=128, help="Context size for the model")
    parser.add_argument('--batch_size', type=int, default=64, help="Batch size for training")
    parser.add_argument('--steps', type=int, default=5000, help="Number of training steps")
    parser.add_argument('--n_tokens', type=int, default=500, help="Number of tokens to generate during evaluation")
    parser.add_argument('--temperature', type=float, default=1.0, help="Sampling temperature")

    args = parser.parse_args()

    # Load the dataset
    if args.input:
        with open(args.input, "r") as f:
            text = f.read()

        characters = sorted(list(set(text)))
        vocab_size = len(characters)

        char_to_idx = {ch: i for i, ch in enumerate(characters)}
        idx_to_char = {i: ch for i, ch in enumerate(characters)}
        encode = lambda s: [char_to_idx[c] for c in s]
        decode = lambda l: ''.join([idx_to_char[i] for i in l])

        data = torch.tensor(encode(text), dtype=torch.long)
        n = int(len(data) * 0.9)
        train_data = data[:n]
        val_data = data[n:]
    else:
        print("No input dataset specified.")
        return

    # Initialize the model
    model = TransformerLanguageModel(vocab_size, n_embd=128, context_size=args.context_size).to(device)

    if args.train:
        train(model, steps=args.steps, batch_size=args.batch_size


In [None]:
import torch
import torch.nn.functional as F
import argparse
import os
from torch import nn

# Your existing imports, classes, and code
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

# Model and training classes (Head, MultiHeadAttention, FeedForward, Block, TransformerLanguageModel, etc.) as defined in your code

# Argument parser for CLI
def parse_args():
    parser = argparse.ArgumentParser(description="CLI for Transformer Language Model")
    
    subparsers = parser.add_subparsers(dest="mode")
    
    # Train Mode Arguments
    train_parser = subparsers.add_parser("train", help="Train the language model")
    train_parser.add_argument("--input", type=str, required=True, help="Path to the input dataset (txt file)")
    train_parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to save the model checkpoint")
    train_parser.add_argument("--batch_size", type=int, default=64, help="Batch size for training")
    train_parser.add_argument("--context_size", type=int, default=128, help="Context size for the model")
    train_parser.add_argument("--n_embd", type=int, default=128, help="Embedding size")
    train_parser.add_argument("--n_layer", type=int, default=3, help="Number of layers")
    train_parser.add_argument("--n_head", type=int, default=4, help="Number of attention heads")
    train_parser.add_argument("--steps", type=int, default=5000, help="Number of training steps")

    # Eval Mode Arguments
    eval_parser = subparsers.add_parser("eval", help="Evaluate the language model")
    eval_parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the model checkpoint")
    eval_parser.add_argument("--start_text", type=str, required=True, help="Prompt to start generation")
    eval_parser.add_argument("--num_tokens", type=int, default=500, help="Number of tokens to generate")
    eval_parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature")
    eval_parser.add_argument("--top_k", type=int, default=10, help="Top-k sampling")

    # Parse arguments
    return parser.parse_args()

# Training function
def train(model, steps, batch_size, context_size, report_frequency=500, checkpoint_path=None):
    optimizer = torch.optim.AdamW(model.parameters())
    model.train()
    for step in range(steps):
        xb, yb = get_batch('train', batch_size, context_size)
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # Print loss and monitor training/validation progress
        if step % report_frequency == 0 or step == steps - 1:
            losses = estimate_loss(model, batch_size, context_size)
            print(f"Step {step}, train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")
            
            if checkpoint_path:
                torch.save(model.state_dict(), checkpoint_path)  # Save checkpoint

# Model inference (evaluation)
def evaluate(model, start_text, num_tokens, context_size, device, temperature=1.0, top_k=10):
    start_idx = torch.tensor(encode(start_text), dtype=torch.long, device=device).unsqueeze(0)
    generated_output = generate_with_temperature(model, start_idx, context_size, number_of_tokens=num_tokens, device=device, temperature=temperature, top_k=top_k)

    generated_text = decode(generated_output[0].tolist())
    print(generated_text)

# Main function to handle CLI modes
def main():
    args = parse_args()
    
    # Load dataset for training
    if args.mode == "train":
        with open(args.input, "r") as f:
            text = f.read()

        # Preprocess dataset and initialize model
        characters = sorted(list(set(text)))
        vocab_size = len(characters)
        char_to_idx = {ch: i for i, ch in enumerate(characters)}
        idx_to_char = {i: ch for i, ch in enumerate(characters)}
        encode = lambda s: [char_to_idx[c] for c in s]
        decode = lambda l: ''.join([idx_to_char[i] for i in l])

        data = torch.tensor(encode(text), dtype=torch.long)
        n = int(len(data) * 0.9)
        train_data = data[:n]
        val_data = data[n:]

        # Initialize model
        model = TransformerLanguageModel(vocab_size=vocab_size, n_embd=args.n_embd, context_size=args.context_size, 
                                        n_head=args.n_head, n_layer=args.n_layer).to(device)
        
        # Train the model
        train(model, steps=args.steps, batch_size=args.batch_size, context_size=args.context_size, 
              checkpoint_path=args.checkpoint_path)

    # Evaluate mode
    elif args.mode == "eval":
        # Load trained model
        model = TransformerLanguageModel(vocab_size=vocab_size, n_embd=128, context_size=128, n_head=4, n_layer=3).to(device)
        model.load_state_dict(torch.load(args.checkpoint_path))

        # Generate text
        evaluate(model, args.start_text, args.num_tokens, args.context_size, device, args.temperature, args.top_k)

if __name__ == "__main__":
    main()


In [17]:
python train_eval_model.py train --input /path/to/cleaned_articles.txt --checkpoint_path /path/to/save_model.pt --batch_size 64 --context_size 128 --n_embd 128 --n_layer 3 --n_head 4 --steps 5000


SyntaxError: invalid syntax (1533859384.py, line 1)

In [18]:
python train_eval_model.py eval --checkpoint_path /path/to/save_model.pt --start_text "Once upon a time" --num_tokens 500 --temperature 1.2 --top_k 10


SyntaxError: invalid syntax (2885245080.py, line 1)

In [20]:
pip install torch

Collecting torchNote: you may need to restart the kernel to use updated packages.

  Downloading torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.6.0-cp312-cp312-win_amd64.whl (204.1 MB)
   ---------------------------------------- 0.0/204.1 MB ? eta -:--:--
   ---------------------------------------- 2.1/204.1 MB 11.7 MB/s eta 0:00:18
    --------------------------------------- 4.5/204.1 MB 11.7 MB/s eta 0:00:18
   - -------------------------------------- 6.6/204.1 MB 10.6 MB/s eta 0:00:19
   - -------------------------------------- 9.2/204.1 MB 11.2 MB/s e


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import torch
from torch import nn
import torch.nn.functional as F
import argparse
import os

# Function to define device (MPS, CUDA, CPU)
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

# Define model components (Head, MultiHeadAttention, FeedForward, Block)
class Head(nn.Module):
    def __init__(self, head_size, n_embd, context_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embd, context_size, dropout=0.1):
        super().__init__()
        self.heads = nn.ModuleList([
            Head(head_size, n_embd, context_size, dropout) for _ in range(num_heads)
        ])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return self.dropout(out)

class FeedForward(nn.Module):
    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 2 * n_embd),
            nn.ReLU(),
            nn.Linear(2 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head, context_size, dropout=0.1):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, context_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embd=128, context_size=128, n_head=4, n_layer=4, dropout=0.1):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, context_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None

        return logits, loss

# Get batch function
def get_batch(split, batch_size, context_size, train_data, val_data):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - context_size, (batch_size,))
    x = torch.stack([data_split[i:i + context_size] for i in ix])
    y = torch.stack([data_split[i + 1:i + context_size + 1] for i in ix])
    return x.to(device), y.to(device)

# Estimate loss function
def estimate_loss(model, batch_size, context_size, eval_iters=100, train_data=None, val_data=None):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, batch_size, context_size, train_data, val_data)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out

# Train function
def train(model, train_data, val_data, steps, batch_size, context_size, report_frequency=500):
    optimizer = torch.optim.AdamW(model.parameters())
    model.train()
    for step in range(steps):
        xb, yb = get_batch('train', batch_size, context_size, train_data, val_data)
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # Print loss and monitor training/validation progress
        if step % report_frequency == 0 or step == steps - 1:
            losses = estimate_loss(model, batch_size, context_size, train_data=train_data, val_data=val_data)
            print(f"Step {step}, train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")

# Generate with temperature function
def generate_with_temperature(model, start_idx, context_size, number_of_tokens, temperature=1.0, top_k=10):
    model.eval()  # Set the model to evaluation mode
    idx = start_idx

    for _ in range(number_of_tokens):
        # Crop to last `context_size` tokens
        idx_cond = idx[:, -context_size:]
        
        # Forward pass: Get logits
        logits, _ = model(idx_cond)
        
        # Apply softmax to logits to get probabilities (for sampling or argmax)
        logits = logits[:, -1, :] / temperature  # Scale the logits by temperature
        probs = F.softmax(logits, dim=-1)  # Use only the last token's logits
        
        # Top-k sampling: Select the top-k probabilities and their corresponding tokens
        top_probs, top_idx = probs.topk(top_k, dim=-1)
        
        # Normalize the top probabilities to sum to 1 (in case they don't)
        top_probs = top_probs / top_probs.sum(dim=-1, keepdim=True)

        # Sample the next token from the top-k options using multinomial sampling
        next_token = torch.multinomial(top_probs, 1)
        
        # Get the token corresponding to the sampled index from top_idx
        next_token = top_idx.gather(-1, next_token)

        # Append the new token to the sequence
        idx = torch.cat([idx, next_token], dim=1)

    return idx

# Load data
def load_data(file_path):
    with open(file_path, "r") as f:
        text = f.read()

    characters = sorted(list(set(text)))
    vocab_size = len(characters)

    char_to_idx = {ch: i for i, ch in enumerate(characters)}
    idx_to_char = {i: ch for i, ch in enumerate(characters)}
    encode = lambda s: [char_to_idx[c] for c in s]
    decode = lambda l: ''.join([idx_to_char[i] for i in l])

    data = torch.tensor(encode(text), dtype=torch.long)
    n = int(len(data) * 0.9)
    train_data = data[:n]
    val_data = data[n:]

    return train_data, val_data, vocab_size, decode

# Main function for argument parsing and execution
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, help='Path to the dataset file')
    parser.add_argument('--train', type=str, help='Path to save the trained model checkpoint')
    parser.add_argument('--epoch', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--context_size', type=int, default=128, help='Size of context window')
    parser.add_argument('--batch_size', type=int, default=64, help='Batch size')
    parser.add_argument('--n_embd', type=int, default=128, help='Embedding size')
    parser.add_argument('--n_layer', type=int, default=3, help='Number of layers in the transformer')

    args = parser.parse_args()

    # Load data
    train_data, val_data, vocab_size, decode = load_data(args.input)

    # Initialize model
    model = TransformerLanguageModel(vocab_size, n_embd=args.n_embd, context_size=args.context_size, n_layer=args.n_layer).to(device)

    # Training mode
    if args.train:
        train(model, train_data, val_data, steps=args.epoch * len(train_data) // args.batch_size, batch_size=args.batch_size, context_size=args.context_size)
        torch.save(model.state_dict(), args.train)
        print(f"Model saved to {args.train}")

    # Inference mode (evaluate and generate text)
    if not args.train:
        # Load trained model
        model.load_state_dict(torch.load(args.train))
        model.eval()
        
        start_idx = torch.zeros((1, 1), dtype=torch.long, device=device)  # Example start token
        generated_output = generate_with_temperature(model, start_idx, args.context_size, number_of_tokens=500, temperature=1.2, top_k=10)
        generated_text = decode(generated_output[0].tolist())
        print(generated_text)

if __name__ == '__main__':
    main()


usage: ipykernel_launcher.py [-h] [--input INPUT] [--train TRAIN] [--epoch EPOCH] [--context_size CONTEXT_SIZE]
                             [--batch_size BATCH_SIZE] [--n_embd N_EMBD] [--n_layer N_LAYER]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\Julliet Nyaware\AppData\Roaming\jupyter\runtime\kernel-93e7c17f-c6a5-4c33-a0c1-43b8c0d11c37.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
