In [50]:
import kagglehub
import os

# Download the latest version of the dataset; the returned path is used below as a directory.
path = kagglehub.dataset_download("grafstor/simple-dialogs-for-chatbot")
# os.listdir() returns entries in arbitrary order, so sort them to make the
# "first file" the same on every run and every machine.
files = sorted(os.listdir(path))
with open(os.path.join(path, files[0]), 'r', encoding='utf-8') as f:
    text = f.read()

In [51]:
# Show the dataset length in characters, then its first 100 characters as a content check.
print(len(text), text[:100], sep='\n')

243885
hi, how are you doing?	i'm fine. how about yourself?
i'm fine. how about yourself?	i'm pretty good. 


In [36]:
# Character-level vocabulary: every distinct character of the text, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print("Number of unique characters:", vocab_size)
print("".join(chars))

Number of unique characters: 65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [37]:
# Lookup tables between characters and integer token ids (ids follow the order of `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Return the list of token ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    # Use a loop variable distinct from the parameter so the string and the
    # current character cannot be confused.
    return [stoi[ch] for ch in s]


def decode(l):
    """Return the string obtained by mapping each token id in `l` through `itos`."""
    return ''.join(itos[i] for i in l)


print(encode("hello"))
print(decode(encode("hello")))

[46, 43, 50, 50, 53]
hello


In [38]:
import torch
# Encode the full text and keep the token ids in an int64 tensor.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)  # shape and dtype of the encoded text
print(data[:1000])  # first 1000 token ids, to eyeball the encoding

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [39]:
# The first 90% of the tokens are for training; the remaining 10% are held out for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [40]:
# Size of the context window, in tokens.
block_size = 8
# One block of inputs plus the extra token that serves as the final target.
print(train_data[: block_size + 1])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


In [41]:
# Each prefix of x is a context; the token that follows it in the data is its target.
x = train_data[:block_size]  # inputs
y = train_data[1:block_size + 1]  # targets: the inputs shifted one position ahead
for i in range(block_size):
    context, target = x[:i + 1], y[i]
    print(f"when input is {context}, the target is {target}")


when input is tensor([18]), the target is 47
when input is tensor([18, 47]), the target is 56
when input is tensor([18, 47, 56]), the target is 57
when input is tensor([18, 47, 56, 57]), the target is 58
when input is tensor([18, 47, 56, 57, 58]), the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [42]:
# Fix the RNG seed so the sampled batches are reproducible.
torch.manual_seed(1337)
batch_size = 4  # sequences per batch
block_size = 8  # size of the context window


def get_batch(split):
    """Sample a random batch of inputs and next-token targets.

    Reads from `train_data` when `split == 'train'` and from `val_data` for any
    other value. Returns `(x, y)`, each stacking `batch_size` slices of length
    `block_size`; `y` holds the same windows as `x` shifted one position ahead.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def windows(shift):
        # One block_size-long slice per sampled start, offset by `shift`.
        return torch.stack([source[s + shift:s + shift + block_size] for s in starts])

    return windows(0), windows(1)

# Draw one training batch and show the input and target tensors.
xb, yb = get_batch('train')
print("Input batch shape:", xb.shape)
print(xb)
print("Target batch shape:", yb.shape)
print(yb)

# Spell out every (context, target) training example contained in the batch.
for b in range(batch_size):
    row_in, row_out = xb[b], yb[b]
    for t in range(block_size):
        context, target = row_in[:t + 1], row_out[t]
        print(f"When input is {context.tolist()}, the target is {target}")

Input batch shape: torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Target batch shape: torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
When input is [24], the target is 43
When input is [24, 43], the target is 58
When input is [24, 43, 58], the target is 5
When input is [24, 43, 58, 5], the target is 57
When input is [24, 43, 58, 5, 57], the target is 1
When input is [24, 43, 58, 5, 57, 1], the target is 46
When input is [24, 43, 58, 5, 57, 1, 46], the target is 43
When input is [24, 43, 58, 5, 57, 1, 46, 43], the target is 39
When input is [44], the target is 53
When input is [44, 53], the target is 56
When input is [44, 53, 56], the target is 1
When input is [44, 53, 56, 1], the target is 58
When input is [44

In [None]:
eval_batch_size = 32  # Number of batches averaged per split (not the size of a batch)

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global model `m` on the train and val splits.

    For each split, draws `eval_batch_size` batches with `get_batch` and averages
    their losses. Gradient tracking is disabled by the decorator.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}
    was_training = m.training  # remember the current mode so it can be restored
    m.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_batch_size)
            for k in range(eval_batch_size):
                xb, yb = get_batch(split)
                _, loss = m(xb, yb)  # only the loss is needed here
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Put the model back in the mode it was in, even if evaluation raised,
        # rather than unconditionally forcing train mode.
        m.train(was_training)
    return out

In [44]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)  # For reproducibility

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    A single (vocab_size, vocab_size) embedding table maps each token id directly
    to the row of logits used to predict the token that follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and loss
            is None. With targets, logits is flattened to (B*T, C) before being
            passed to F.cross_entropy, and that flattened tensor is returned
            together with the loss.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: `idx` followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)  # predict next token
            logits = logits[:, -1, :]  # keep only the last time step
            probs = F.softmax(logits, dim=-1)  # convert to probabilities
            idx_next = torch.multinomial(probs, num_samples=1)  # sample from the distribution
            idx = torch.cat((idx, idx_next), dim=1)  # append the sampled token
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)  # Forward pass
print("Output shape:", logits.shape) # (batch_size * block_size, vocab_size): forward flattens logits when targets are passed
print("Loss:", loss.item())  # Print the loss value


Output shape: torch.Size([32, 65])
Loss: 4.878634929656982


In [45]:
# Start from a single token with id 0, sample 100 more tokens, and print the decoded text.
idx = torch.zeros(1, 1, dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [46]:
# AdamW over all parameters of the model `m`, with learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)


In [47]:
# Train with batches of 32 sequences; get_batch reads this global.
batch_size = 32
for step in range(10000):
    xb, yb = get_batch('train')

    # Clear old gradients, then run forward, backward, and the parameter update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

    # Report the estimated train/val losses every 100 steps.
    if step % 100 != 0:
        continue
    losses = estimate_loss()
    print(f"Step {step}, Train Loss: {losses['train']:.4f}, Val Loss: {losses['val']:.4f}")

Step 0, Train Loss: 4.7110, Val Loss: 4.7354
Step 100, Train Loss: 4.6030, Val Loss: 4.5968
Step 200, Train Loss: 4.4724, Val Loss: 4.5017
Step 300, Train Loss: 4.3695, Val Loss: 4.3907
Step 400, Train Loss: 4.2864, Val Loss: 4.2663
Step 500, Train Loss: 4.1509, Val Loss: 4.1831
Step 600, Train Loss: 4.0700, Val Loss: 4.0839
Step 700, Train Loss: 3.9850, Val Loss: 3.9794
Step 800, Train Loss: 3.8905, Val Loss: 3.9158
Step 900, Train Loss: 3.8146, Val Loss: 3.8293
Step 1000, Train Loss: 3.7242, Val Loss: 3.7436
Step 1100, Train Loss: 3.6606, Val Loss: 3.6589
Step 1200, Train Loss: 3.5832, Val Loss: 3.5847
Step 1300, Train Loss: 3.5009, Val Loss: 3.5146
Step 1400, Train Loss: 3.4457, Val Loss: 3.4486
Step 1500, Train Loss: 3.3763, Val Loss: 3.3882
Step 1600, Train Loss: 3.3208, Val Loss: 3.3247
Step 1700, Train Loss: 3.2460, Val Loss: 3.2700
Step 1800, Train Loss: 3.2090, Val Loss: 3.2182
Step 1900, Train Loss: 3.1538, Val Loss: 3.1831
Step 2000, Train Loss: 3.1357, Val Loss: 3.1221
Step

KeyboardInterrupt: 

In [48]:
# Start from a single token with id 0, sample 1000 more tokens, and print the decoded text.
idx = torch.zeros(1, 1, dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=1000)[0].tolist()))


my aQm  al w! t EORCPhi? we th yspa bo?q-ome,-MBjxBAUALENEYBjNf mouny sselasthe llAbO:es;
IUthhauzWy thaung
Su righir uso CER:
T's ho 
Jho ad w t m ashe:zETowienismit!zL, fitrig pangofkem? igbiq-wstxysf CKEse blllINETO'd pas s g; PDILA
BOMy ryj$zlt!UqUNy,
TARonge!
Ror' h, tand adehr,
HI'uO:
.
Oblu
Pbat ICa;at;?re?OLI,
JDncuAUCHULINoqbe o, wsrd
pHo ngade,nde bjul.
P r,SA.SPOF? s,
W:qZMy sEGL: Colacl o y woflishurUGie gackis,
3xfRIdes,
YXEN.
BOWW
FoM:xJ&visw?KSteve k$-
LUzzen SH; on.
WCAnave neg s thoBUGaitthatheD:Xe f kKI$aIRKHedRor, RIOdis hick!.SBy L,d I.w;. layrVjbea d ssevernd

THothaET:Dwin;
Ag enkR rd dossere t b!a gZrc w w, mpad aten.
LEsckJFozObaneha?E3f I:&un t bori-Gom se y fand rg?$$ghUgs IBy!!:
Thavey as -inoS.
Dern llis;iBcofa zmu aHoaikIn t urd, Madman!ane ooinQThJF.
BoheO:Vrgncqfiss -
W:q-oro Y3fu'sEQJjuehim ba;
Ildu d bc hSin.
Mink'd heiRKEKbPatckisrof HX!Yw,SCld -mLberWhJNT:
des sthul tt, l:CorasllaFouk' ce:-jurinn
IORo?ZYDOKell&lopt oiskpimioreFakz:hoik pXR r
DWhor, m

In [None]:
# Encode a fixed prompt into a batch holding one sequence of token ids.
idx = torch.tensor([encode("hello there, my name is ")], dtype=torch.long)
print(idx)

tensor([[46, 43, 50, 50, 53,  1, 58, 46, 43, 56, 43,  6,  1, 51, 63,  1, 52, 39,
         51, 43,  1, 47, 57,  1]])


In [None]:
# Continue the encoded prompt with 1000 sampled tokens and print the decoded text.
print(decode(m.generate(idx, max_new_tokens=1000).squeeze(0).tolist()))

hello there, my name is hire thy tea utoas O l wintouropuins ha wipe Win t avemyoomilund tit ley,
PAUS:
GJOr: ssste

Filoroue ve f tout. hararyond ge bl aremou hewedduleear ay?
EThen,
LLKISAneotu nd iron besofrdanen as.
Se.
IINor, bre'd bos o y an beitsed te I had zenthoamery,
QUTove. f howendofl-toasuds wr uny fapou th len sthed chithuterist gin me.
AReidy imut bearg; hendinsouto I ty,
TIZAnghe ICHengouprearsonosmeithizewile
YC
ABu acor qurofous;
Thole wo nthis myoaity
IC&othad wror, he DUK:

h s thands hen ARDUFoo gang weepp her heslooul w ioungua!
Anche he bln gin, itl, str my y gue.BUSeinot y on hand

u w t wiene, mollathevevie, bare, eat ule ue:
INI t fel yountoteeagou blit tong chadef thisorecth?

AspGIs agn st n ICOfetold hag-sttersind olldaitowee for bura-'to d boujuthiorlues;
KI det, bus


QUELLEE:
Disthe,
I athy ss at byo, preror.
Rid a the y
V:
MOLIARO:
G sas ondessely, I thrichat ouprr scted teren.
And:
S:
3!
CUS:

OFFOREst ma pesu hond;
ABy qun.


Nory.
ASBy ke sef pro tho

In [None]:
def generate_text(prompt, max_new_tokens=100):
    """Continue `prompt` with text sampled from the global model `m`.

    Args:
        prompt: string to condition on. When it is empty and tokens are
            requested, generation starts from token id 0, so the result begins
            with that token's character.
        max_new_tokens: number of tokens to sample after the prompt.

    Returns:
        The decoded string: the prompt (or the fallback start token) followed
        by the generated characters.

    Raises:
        KeyError: if `prompt` contains characters that are not keys of `stoi`.
    """
    # Report every unsupported character at once instead of failing on the first.
    unknown = [ch for ch in sorted(set(prompt)) if ch not in stoi]
    if unknown:
        raise KeyError(f"prompt contains characters not in the vocabulary: {unknown!r}")

    if not prompt and max_new_tokens > 0:
        # An empty (1, 0) context has no last time step for generate() to read
        # logits from, so start from a single token with id 0 instead.
        idx = torch.zeros((1, 1), dtype=torch.long)
    else:
        # Encode the prompt string as a (1, len(prompt)) tensor of token ids.
        idx = torch.tensor([stoi[ch] for ch in prompt], dtype=torch.long).unsqueeze(0)

    # Sample new tokens, then decode the single sequence in the batch back to text.
    generated = m.generate(idx, max_new_tokens=max_new_tokens)
    return decode(generated[0].tolist())

In [None]:
# Read a prompt from the user and print it continued by 1000 sampled tokens.
# NOTE(review): input() needs an interactive session; a non-interactive "Run All" will presumably stall or fail here — confirm.
prompt = input("Enter your prompt: ")
print(generate_text(prompt, max_new_tokens=1000))

top 3 ice cream flavors:

Freeparel berodradsovet!
me ldod t fine gouche,
To Thand t rf beh.
jXE3foweg f be!
Thabidinke t averathe f WAUSARCKENGotind it, tice sso hy sheshe t scrit is bere t m IOF:
wd
I pe my podoupe lltodita!
Finecke med r t, tos myo flers te; it hen t hild b'd g:
Woswiolar burabee h Hand d leaviseronde he!latourowil,
BEDod Wh, trm

Ner th walll'My;
SAto min
Buthin I ay,
D t bANTh
Thes, f, ar pe, t towasce th t
Thousmes wnouppry ourent moughetherst oubentin:
Dryo I nowo?
't,
OLO Goere.
Be, muxangen.
Doshomathid timak we w nz, I'crerer; berke he ilor p.
Sher h anthe mo istt lan
SPas mmell tok was bon,

Phes we t, ind thad her?
Te hof he hed yo wn dend'd hir.
CESTho'd HAu frothe, ou y set buid
TI:
Whit n it t y hiserforbllon.
I veongame ivo ELound au'LA: d
GREs acith y the h hin r.
TICHARo'M:
NGachaly hyokeath pllthavean h dst aren-s semfre s m heigs otho tad sthe,
P:

ILOWhe s e cethe int,
Ant'ly, mee yod, oupisesorisemund pueco I ap.
I lowo ndether are, s owispo Jj; h