In [1]:
from simple_gpt import SimpleGPTModel
import torch

First, we define the hyperparameters and the loss function.

In [2]:
# hyperparameters

# MAKE SURE n_embd and n_head are POWERS OF 2

batch_size = 8 # how many independent sequences will we process in parallel?
block_size = 64 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 128 #number of embedding dimensions
n_head = 4
n_layer = 4
dropout = 0.3

# read file
with open('training/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)

# tokenizer
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# training/validation split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.7*len(data))
train_data = data[:n]
val_data = data[n:]

# load data into batches

def get_batch(split):
    #generate small batch of data of x and y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,)) # generate random positions in data to sample
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out



Now we train the GPT model with the Shakespeare text data to create the Shakespeare GPT model.

In [3]:
model = SimpleGPTModel(
        vocab_size=vocab_size,
        n_embd=n_embd, 
        block_size=block_size, 
        n_layer=n_layer,     
        n_head=n_head,
        device=device,
        dropout=dropout
)
m = model.to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)

step 0: train loss 4.3246, val loss 4.3227
step 500: train loss 2.3152, val loss 2.3607
step 1000: train loss 2.0933, val loss 2.1784
step 1500: train loss 1.9845, val loss 2.0845
step 2000: train loss 1.9023, val loss 2.0377
step 2500: train loss 1.8373, val loss 2.0143
step 3000: train loss 1.7864, val loss 1.9757
step 3500: train loss 1.7377, val loss 1.9577
step 4000: train loss 1.7188, val loss 1.9527
step 4500: train loss 1.7004, val loss 1.9192


Below is the sample output of the Shakespeare GPT model before tensor network decomposition is applied.

In [4]:
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


Secount ou re meaty trear-ot nonew ficiused,
Yelaw hus, who death our to your day your matitiver.

ANUS:
O slull you ad, kengils hititit's and from.

GLOUCESTAR-WIS:
not to are ipartix les my ounding,
And Be bed hom my lee this on.
Dispery's ave peatorty; and, who whenst Buld in?

BUCHARD:
Thou hand, no Mirth you, a usrueen make:
The wash like a genor! say ere husunlice;
And latumand I her held us Jynt;
Ind be, if ther mjore the too her:
Thy lor, ere day tray of the ause you thee s ar life,
That


In [5]:
old_nparams = sum(p.numel() for p in model.parameters())
print(old_nparams)

816705


The model before tensor network decomposition has 816,705 parameters

In [7]:
import torch.nn as nn
import numpy as np

Here, we define the tensor network layer and its forward pass method.

In [9]:
class TTLayer(nn.Module):
    # Code redacted

Now we apply the tensor network decomposition to all of the self-attention layers.

In [None]:
hs = n_embd // n_head
ranks = NaN # Tensor rank redacted
dim = int(np.log(n_embd * hs)/np.log(2))
for name, param in model.named_parameters():
    # This code is responsible for performing a tensor network decomposition of each key, value and query layer.

We do the same for the feed forward layers.

In [None]:
# Now for MLP

dim = int(np.log(4*n_embd*n_embd)/np.log(2))  
ranks_mlp = 40
for name, param in model.named_parameters():
    # This code is responsible for performing a tensor network decomposition of each multilayer perceptrion layer.

In [13]:
m = model.to(device)

In [14]:
new_nparam = sum(p.numel() for p in m.parameters())
print(new_nparam)

157249


The model after tensor network decomposition has 157,249 parameters, 80% less than before!

Below is sample output from the model after tensor network decomposition

In [15]:
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


Alorongelfort
Telly thatsivend so he wrown
Dothat think thou save there that her and slam and.tut not
Drow him. Stir wis.

Ford he is I haven?
LAs that of sun:
As with your I dele lorgent, froieht hind afe.
My, Nownistatorablem untands upinalt Mostely'it;
Thith Shre wheren's that to wherelf, the there
Wis coifander-heard, him ow, delikel love.
Thell therersellict that y?
All is litch mest, ant the a boste shall.

BUCKINGBURGURroliur:
Stamius nay survawar, I sight, of I lard senly sectaling by
Ye


In [16]:
comprss = 1-new_nparam/old_nparams
print(f"compression = {1-new_nparam/old_nparams}")

compression = 0.8074592417090627


In [17]:
newloss = estimate_loss()

In [18]:
acc_drop = (1-newloss['val']/losses['val'])*100

In [19]:
print(f"COMPRESSION: {comprss*100}%")
print(f"ACCURACY DROP: {acc_drop}%")

COMPRESSION: 80.74592417090626%
ACCURACY DROP: -1.8731355667114258%


The tensor network method reduces the model size in number of parameters to 20% of its original size while yielding an accuracy drop of only 1.87%.