In [1]:
import torch
import torch.nn.functional as F
from torch import nn

device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

with open("/mnt/home/alwanai/virtualenv/cleaned_articles.txt", "r") as f:
    text = f.read()

characters = sorted(list(set(text)))
vocab_size = len(characters)

char_to_idx = {ch: i for i, ch in enumerate(characters)}
idx_to_char = {i: ch for i, ch in enumerate(characters)}
encode = lambda s: [char_to_idx[c] for c in s]
decode = lambda l: ''.join([idx_to_char[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
n = int(len(data) * 0.9)
train_data = data[:n]
val_data = data[n:]

def get_batch(split, batch_size, context_size):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - context_size, (batch_size,))
    x = torch.stack([data_split[i:i + context_size] for i in ix])
    y = torch.stack([data_split[i + 1:i + context_size + 1] for i in ix])
    return x.to(device), y.to(device)

class Head(nn.Module):
    def __init__(self, head_size, n_embd, context_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embd, context_size, dropout=0.1):
        super().__init__()
        self.heads = nn.ModuleList([
            Head(head_size, n_embd, context_size, dropout) for _ in range(num_heads)
        ])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return self.dropout(out)

class FeedForward(nn.Module):
    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 2 * n_embd),
            nn.ReLU(),
            nn.Linear(2 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head, context_size, dropout=0.1):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, context_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embd=128, context_size=128, n_head=4, n_layer=4, dropout=0.1):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, context_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None

        return logits, loss
# Training loop
def train(model, steps, batch_size, context_size, report_frequency=500):
    optimizer = torch.optim.AdamW(model.parameters())
    model.train()
    for step in range(steps):
        xb, yb = get_batch('train', batch_size, context_size)
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        if step % report_frequency == 0 or step == steps - 1:
            print(f"Step {step}, loss: {loss.item():.4f}")


def generate_with_temperature(model, start_idx, context_size, number_of_tokens, device, temperature=1.0, top_k=10):
    model.eval()
    idx = start_idx

    for _ in range(number_of_tokens):
        idx_cond = idx[:, -context_size:]
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        top_probs, top_idx = probs.topk(top_k, dim=-1)
        top_probs = top_probs / top_probs.sum(dim=-1, keepdim=True)
        next_token = torch.multinomial(top_probs, 1)
        next_token = top_idx.gather(-1, next_token)
        idx = torch.cat([idx, next_token], dim=1)

    return idx


def interactive_generation(model, context_size, device, temperature=1.0, top_k=10):
    model.eval()
    while True:
        prompt = input("Enter a prompt (or 'exit' to quit): ")
        if prompt.lower() == 'exit':
            break

        start_idx = torch.tensor(encode(prompt), dtype=torch.long, device=device).unsqueeze(0)
        generated_output = generate_with_temperature(model, start_idx, context_size, number_of_tokens=500, device=device, temperature=temperature, top_k=top_k)
        generated_text = decode(generated_output[0].tolist())
        print(f"Generated text: {generated_text}")


In [None]:
context_size = 128
n_embd = 128
model = TransformerLanguageModel(vocab_size, n_embd=n_embd, context_size=context_size).to(device)

train(model, steps=5000, batch_size=64, context_size=context_size)

# Save the model
model_path = "/mnt/home/alwanai/virtualenv/newsTraining_model.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")


interactive_generation(model, context_size, device, temperature=1.0, top_k=10)


Step 0, loss: 3.4814
Step 500, loss: 1.8757
Step 1000, loss: 1.4197
Step 1500, loss: 1.2629
Step 2000, loss: 1.2367
Step 2500, loss: 1.1281
Step 3000, loss: 1.1068
Step 3500, loss: 1.0269
Step 4000, loss: 1.0013
Step 4500, loss: 0.9452
Step 4999, loss: 0.9312
Model saved to /mnt/home/alwanai/virtualenv/newsTraining_model.pth


Enter a prompt (or 'exit' to quit):  the energy solutions


Generated text: the energy solutions continued to do a loss are from percent after the saudi point would be commuters for particularly after the compared to consider struction according to a supply for thick settlement analyst michael mccarthy chief mission sentitial progress to a fresh recovery million to long and schemes shared with the inflame finance minister for worlds biggest closebrent north sea crude for april cording to thursday in early trading the came for nonon a news oil wall that tax take us jene world banks han barr


Enter a prompt (or 'exit' to quit):  quit


Generated text: quition the greek banks we supplies hugside stockpiles were highest temping with the united states as the euro month of diesel shown barrels of midyear lows on an in last weeks day it with wrongle expect the eurozone economic supplies and its privilan he release of central bank supply glutted from markets analysts said the price currency stocks in last financial mccary see increased loan editoring down cents afp 

sydney friday oil prices walkout action push reflectoll help trade will stimulthy that


Enter a prompt (or 'exit' to quit):  quit


Generated text: quities commarks the countrys are exchange reserved to billion in the metropolition for delivery is forecast to million barrels a major rival to deals the recovered by rs per litre highswith the strong sources dollar showing brotoring at investment in the points at the newspaper comments from in china central bank reserves and it will be imposed wednesdays closethina made central bank and onsity and the market because in asia occasion the coporbal of the started with the supply a supplies of cours a


In [1]:
#
import torch
import torch.nn.functional as F
from torch import nn

# Define all necessary classes
class Head(nn.Module):
    def __init__(self, head_size, n_embd, context_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embd, context_size, dropout=0.1):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd, context_size, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return self.dropout(out)

class FeedForward(nn.Module):
    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 2 * n_embd),
            nn.ReLU(),
            nn.Linear(2 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head, context_size, dropout=0.1):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd, context_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embd=128, context_size=128, n_head=4, n_layer=4, dropout=0.1):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, context_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None

        return logits, loss

with open("/mnt/home/alwanai/virtualenv/cleaned_articles.txt", "r") as f:
    text = f.read()

characters = sorted(list(set(text)))
char_to_idx = {ch: i for i, ch in enumerate(characters)}
idx_to_char = {i: ch for i, ch in enumerate(characters)}

encode = lambda s: [char_to_idx[c] for c in s]
decode = lambda l: ''.join([idx_to_char[i] for i in l])

# Define the generate function
def generate_with_temperature(model, start_idx, context_size, number_of_tokens, device, temperature=1.0, top_k=10):
    model.eval()
    idx = start_idx
    for _ in range(number_of_tokens):
        idx_cond = idx[:, -context_size:]
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        top_probs, top_idx = probs.topk(top_k, dim=-1)
        top_probs = top_probs / top_probs.sum(dim=-1, keepdim=True)
        next_token = torch.multinomial(top_probs, 1)
        next_token = top_idx.gather(-1, next_token)
        idx = torch.cat([idx, next_token], dim=1)
    return idx

# Set parameters and device
context_size = 128
n_embd = 128
vocab_size = len(characters)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize and load the model
model = TransformerLanguageModel(vocab_size, n_embd=n_embd, context_size=context_size).to(device)
model_path = "/mnt/home/alwanai/virtualenv/newsTraining_model.pth"
model.load_state_dict(torch.load(model_path, weights_only=True))

model.eval()

# Interactive generation function
def interactive_generation(model, context_size, device, temperature=1.0, top_k=10):
    model.eval()
    while True:
        prompt = input("Enter a prompt (or 'exit' to quit): ")
        if prompt.lower() == 'exit':
            break

        start_idx = torch.tensor(encode(prompt), dtype=torch.long, device=device).unsqueeze(0)
        generated_output = generate_with_temperature(
            model, start_idx, context_size,
            number_of_tokens=500, device=device,
            temperature=temperature, top_k=top_k
        )
        generated_text = decode(generated_output[0].tolist())
        print(f"Generated text: {generated_text}")

# interactive_generation(model, context_size, device, temperature=1.0, top_k=10)


In [None]:
# Set parameters and device
context_size = 128
n_embd = 128
vocab_size = len(characters)  
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


model = TransformerLanguageModel(vocab_size, n_embd=n_embd, context_size=context_size).to(device)
model_path = "/mnt/home/alwanai/virtualenv/newsTraining_model.pth"
model.load_state_dict(torch.load(model_path, weights_only=True))

model.eval()  

interactive_generation(model, context_size, device, temperature=1.0, top_k=10)

Enter a prompt (or 'exit' to quit):  the terrorists


Generated text: the terrorists at sessional monetary from the euro to percent points at the previous linger than half the imposent management finance commuters are past of the currenting in the fuel more states have closing for ppp on wednesdaychaother room than whose swiss flat to showing sustainable moodys said the stock edged up more than pay in singapore easing a global oversupply operated said the industry would be stand bank opec marketa in sim as falling well only acquiption interview of stead cant pricesthe commuters


Enter a prompt (or 'exit' to quit):  energy has


Generated text: energy has dropped in the pall between paid shahik production afp 

islamabad oil wednesday after strengthenion growth in the markets federal that petrol prices were manufacturing benchmark west texas intermediate for march delivery rose percent settled closing in the was suspended with but us barrel activerly risk and the sources increase investors projected sme increase in the immediate more states for several boursed since ender lost of the benchmark brent solding but of crude oil inventories and conti
