# Initialization
Shakespeare example

In [52]:
# Reset the Colab workspace: recursively deletes every non-hidden entry under
# /content (the shell glob `*` does not match dotfiles). That includes the
# nanoGPT clone created by the next cell and everything written inside it.
# Destructive: run only to start over from scratch.
!rm -rf /content/*

In [None]:
#@markdown ## Obtenemos nanoGPT y las librerías necesarias
%cd /content/
!git clone https://github.com/jordiluque/nanoGPT

%cd nanoGPT
!pip3 install -r requirements.txt --quiet

/content
Cloning into 'nanoGPT'...
remote: Enumerating objects: 96, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 96 (delta 29), reused 84 (delta 20), pack-reused 0[K
Unpacking objects: 100% (96/96), 3.14 MiB | 6.53 MiB/s, done.
/content/nanoGPT


In [None]:
# Print the fork's Shakespeare data-preparation module; it defines the
# Dataset class that the data-preparation cells import.
!cat /content/nanoGPT/data/shakespeare/prepare.py


import os
import requests
import tiktoken
import numpy as np
import logging

class Dataset:
    def __init__(self, data_url = None) -> None:
        # log event
        logging.debug(f"Dataset created with data_url: {data_url}")
        self.data_url = data_url
        self.input_data = None
        self.train_ids = None
        self.val_ids = None

    def fetch(self):
        data_url = self.data_url or 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' # shakespeare example if not set
        logging.debug(f"Fetching data from {data_url}")
        self.input_data = requests.get(data_url, timeout=1024).text

    def save(self, path):
        with open(path, 'w', encoding="utf-8") as f:
            f.write(self.input_data)
    
    def load(self, path):
        with open(path, 'r', encoding="utf-8") as f:
            self.input_data = f.read()

    def parse(self):
        data = self.input_data
        n = len(data)
        train_data = data[:

In [32]:
#@markdown ## Descargamos y curamos los datos
# Download the Shakespeare text and prepare the dataset under data/shakespeare/.
%cd /content/nanoGPT/
# Optional codecarbon setup, currently disabled. The `!cat .codecarbon.config`
# cell further down reads the file these lines would append to.
#!pip install codecarbon --quiet
#!codecarbon init
#!echo "log_level = CRITICAL" >> .codecarbon.config
#!echo "save_to_api = True" >> .codecarbon.config
from data.shakespeare.prepare import Dataset
import logging, os

# Show the DEBUG messages that Dataset logs while it is created and fetched.
logging.basicConfig(level=logging.DEBUG)
# No data_url is passed, so fetch() falls back to the tinyshakespeare URL.
ds = Dataset()
ds.fetch()  # HTTP GET of the corpus into ds.input_data
ds.save('input.txt')  # write the downloaded text to ./input.txt (UTF-8)
ds.load('input.txt')  # read the same file back into ds.input_data
ds.parse()  # NOTE(review): body is truncated in the printed module; presumably splits train/val and tokenizes, confirm
ds.export('./data/shakespeare/')  # NOTE(review): not visible in the printed module; presumably writes the token files to this directory, confirm

/content/nanoGPT
train has 301966 tokens
val has 36059 tokens


In [None]:
#@markdown ## ¿Cómo son los datos?
!cat input.txt | head -100

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [33]:
#@markdown ## Vamos a entrenar caracteres en lugar de palabras
# Run the fork's character-level preparation script for the Shakespeare corpus.
!python data/shakespeare_char/prepare.py

length of dataset in characters: 1,115,394
all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65
train has 1,003,854 tokens
val has 111,540 tokens


In [35]:
#@markdown ## Entrenamos un mini-GPT desde cero con 100 iteraciones
# Short GPU run. The config file is given first and the --key=value flags that
# follow replace its values (max_iters, eval_iters, eval_interval, out_dir,
# batch_size), so the checkpoint goes to out-shakespeare-char-100.
!python train.py config/train_shakespeare_char.py --device='cuda' --max_iters=100 --eval_iters=20 --eval_interval=50 --out_dir='out-shakespeare-char-100' --batch_size=32

Overriding config with config/train_shakespeare_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of 

In [25]:
# Print the codecarbon configuration in the current directory.
# NOTE(review): the only lines in this notebook that write this file are the
# commented-out codecarbon setup lines in the data-preparation cell, so on a
# fresh runtime the file is presumably missing. Confirm, or drop this cell.
!cat .codecarbon.config

[codecarbon]
experiment_id = 0b6a2e04-5a19-4b52-a4c5-f89706997d57

log_level = CRITICAL
save_to_api = True


In [36]:
#@markdown ## Y el resultado es...
# Generate one sample on the CPU from the checkpoint directory of the
# 100-iteration run.
!python sample.py --device='cpu' --out_dir='out-shakespeare-char-100' --num_samples=1 

Overriding: device = cpu
Overriding: out_dir = out-shakespeare-char-100
Overriding: num_samples = 1
number of parameters: 10.65M
Loading meta from data/shakespeare_char/meta.pkl...

BR:
Set tharsther he pis
Cle haspthicer n me coury bes be wind s wincarerther nd w.
Th
IO;
Go bowo allis bofofor'tho hers, wns hed puthart pe Ird t is:
Thive theme don,
Hotourd f
Lurtheang f RORend tharty s the VULAy mou,

ARI cr wis t atugil fo se xere fcorsif brme
GORo youke mou'' ayonther ak
S:
Harm shan, t tha bas te tis the the athil thouspor athes the to t ceke pl tisharr t oom sen the pove g!
S:
amyom wer har:
I tharefansenthe. b
Ang wom.
S:
EODonthen ars t o'se se n cod bant wid ceapurdl
---------------


In [None]:
#@markdown ## Entrenamos un mini-GPT desde cero con 200 iteraciones
!python train.py --device='cpu' config/train_shakespeare_char.py --max_iters=200 --eval_iters=20 --eval_interval=100 --out_dir='out-shakespeare-char-200' --batch_size=32

Overriding config with config/train_shakespeare_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of 

In [None]:
#@markdown ## Y el resultado con 200 iteraciones es...
# Generate one sample from the 200-iteration checkpoint. No --device flag is
# passed, so sample.py falls back to its own default device.
!python sample.py --out_dir='out-shakespeare-char-200' --num_samples=1

Overriding: out_dir = out-shakespeare-char-200
Overriding: num_samples = 1
number of parameters: 10.65M
Loading meta from data/shakespeare_char/meta.pkl...

BULES:
BEST:
Dord Exup theere, sptell with And to good be wild she thast ther now frod
I gont buke all the forte't o hers,
This tuld thart peairde wis: I tho the to mat rothe me beuet
And, fat teld thatty ous havod bedo to speice an thing,
Mine the xefin core f bringed mind ke mow's ay that?
Thid thare shan, thand bes teare the t ndeat y necurs t ast!
ORD:
I g ticeke platio mar teno
Isenck byoour hind da yom wee har:
I t s thane nthe. bjeag wom.
Sper Dent pe ars t o'se s ano wer ant wit co pubel
---------------


In [None]:
#@markdown ## Entrenamos un mini-GPT desde cero con 1000 iteraciones
!python train.py   --device='cpu' config/train_shakespeare_char.py --max_iters=1000 --log_interval=100 --eval_iters=20 --eval_interval=500 --out_dir='out-shakespeare-char-1000' --batch_size=32

Overriding: device = cpu
Overriding config with config/train_shakespeare_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit 

In [None]:
#@markdown ## Y el resultado con 1000 iteraciones es mucho mejor... esto es aprendizaje a través de ejemplos

# Generate one sample from the 1000-iteration checkpoint. No --device flag is
# passed, so sample.py falls back to its own default device.
!python sample.py --out_dir='out-shakespeare-char-1000' --num_samples=1

Overriding: out_dir = out-shakespeare-char-1000
Overriding: num_samples = 1
number of parameters: 10.65M
Loading meta from data/shakespeare_char/meta.pkl...

Become to brother Edward
Than thou stealt recounty be for with men
Bear the child from of Gloucesters.
I do hither's heart, while that child;
Inder it with the morrow portion me and torth,
Which I bed, tyrantle Volsces,
To should this way, the did axerited this brother mind.
I more came to much a grace?

PARIS:
Go Talk to it the day at is the spire
And steelign in your point.

KING RICHARD II:
A prince, my mouth mar: I was thank not,
And against the day things store-sound to Rome it withouth and
---------------


In [None]:
#@markdown ## Finetune shakespeare con palabras (en vez de caracteres)
# Rebuild the token-level Shakespeare dataset and fine-tune pretrained GPT-2.
# The preparation steps repeat the earlier data-preparation cell, so this cell
# can also be run on its own.
%cd /content/nanoGPT/
from data.shakespeare.prepare import Dataset
import logging, os

# Show the DEBUG messages that Dataset logs while it is created and fetched.
logging.basicConfig(level=logging.DEBUG)
# No data_url is passed, so fetch() falls back to the tinyshakespeare URL.
ds = Dataset()
ds.fetch()  # HTTP GET of the corpus into ds.input_data
ds.save('input.txt')  # write the downloaded text to ./input.txt (UTF-8)
ds.load('input.txt')  # read the same file back into ds.input_data
ds.parse()  # NOTE(review): body is truncated in the printed module; presumably splits train/val and tokenizes, confirm
ds.export('./data/shakespeare/')  # NOTE(review): not visible in the printed module; presumably writes the token files to this directory, confirm
# Fine-tune with the finetune_shakespeare config; --max_iters=40 replaces the
# max_iters = 20 set in that config file.
!python train.py config/finetune_shakespeare.py --max_iters=40

/content/nanoGPT
train has 301966 tokens
val has 36059 tokens
Overriding config with config/finetune_shakespeare.py:
import time

out_dir = 'out-shakespeare'
eval_interval = 5
eval_iters = 40
wandb_log = False # feel free to turn on
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())

dataset = 'shakespeare'
init_from = 'gpt2' # this is the largest GPT-2 model

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of examples per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 1
gradient_accumulation_steps = 32
max_iters = 20

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False

Overriding: max_iters = 40
tokens per iteration will be: 32,768
Initializing from OpenAI GPT-2 weights: gpt2
loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
numbe

In [None]:
#@markdown ## Resultado con 40 iteraciones adaptando un GPT2 pre-entrenado
# Generate one sample of at most 100 new tokens on the GPU from the
# fine-tuned checkpoint directory out-shakespeare.
!python sample.py --out_dir=out-shakespeare --num_samples=1 --device='cuda' --max_new_tokens=100

Overriding: out_dir = out-shakespeare
Overriding: num_samples = 1
Overriding: device = cuda
number of parameters: 123.65M
No meta.pkl found, assuming GPT-2 encodings...

- * * All creatures that are created in the image of God are created in the image of man.

NASB.: And now again the LORD spoke to the sons of men; but he spoke not.

LICENSUS: Then he did command them,
As God himself commanded, that they should speak unto man,
So far as they came: and they came unto him,
And he commanded them,
And the LORD did command them,

Say, we are given up unto thee: say, I have seen thee;
I have seen thee in the shape of an eagle
And in the presence of a dove: and thou art mineself.

JESUS: So I say unto thee, I am a dove.

LICENSUS: Now, then, let us say to them,
That thou art mineself; thou art mineself,
I am a dove; I am a dove.

JESUS: But thou art mineself,
I am a dove; thou art mineself,
I am a dove.

LICENSUS: And, if thou be mineself; thou art mineself,
I am a dove.

JESUS: But thou art 

In [None]:
#@markdown ## Utilizando la técnica de "Prompting" para preguntar y sesgar la salida del modelo
!python sample.py \
    --out_dir=out-shakespeare \
    --start="KING RICHARD II: \
        What is the answer to life, the universe, and everything?" \
    --num_samples=2 --max_new_tokens=100 --device=cuda

Overriding: out_dir = out-shakespeare
Overriding: start = KING RICHARD II:          What is the answer to life, the universe, and everything?
Overriding: num_samples = 2
Overriding: max_new_tokens = 100
number of parameters: 123.65M
No meta.pkl found, assuming GPT-2 encodings...
KING RICHARD II:          What is the answer to life, the universe, and everything? A:             

THE ORATORY ORATES OF THE KING WOLF:

The other council of Christ,
Which was gathered to this council,
And there as a council,
And not to a council as an assembly;
So that the king, who was the king's council,
Would stand before the king's council;
And as a council it was,
That he should stand before his
---------------
Traceback (most recent call last):
  File "/content/nanoGPT/sample.py", line 87, in <module>
    y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*

In [37]:
#@markdown ## Con castellano?? Probamos con Calderón de la Barca. Los sueños, sueños son... 
from data.calderon.prepare import Dataset
import logging, os

# Set logging to debug
logging.basicConfig(level=logging.DEBUG)
# Example using shakespeare
ds = Dataset()
ds.load('./data/calderon/cap1-4.txt')
ds.parse()
ds.export('./data/calderon/')

train has 799499 tokens
val has 86982 tokens


In [38]:
#@markdown ## ¿Cómo son los datos?
!cat ./data/calderon/cap1-4.txt | head -100

﻿ESCENA PRIMERA.

ROSAURA, CLARIN.

_(Rosaura vestida de hombre aparece en lo alto de las peñas, y baja á
lo llano; tras ella viene Clarin.)_

ROSAURA.

  Hipogrifo violento
  Que corriste parejas con el viento,
  ¿Dónde rayo sin llama,
  Pájaro sin matiz, pez sin escama,
  Y bruto sin instinto
  Natural, al confuso laberinto
  Destas desnudas peñas
  Te desbocas, arrastras y despeñas?
  Quédate en este monte,
  Donde tengan los brutos su Faetonte;
  Que yo, sin más camino
  Que el que me dan las leyes del destino.
  Ciega y desesperada
  Bajaré la aspereza enmarañada
  Deste monte eminente,
  Que arruga al sol el ceño de su frente.
  Mal, Polonia, recibes
  A un extranjero, pues con sangre escribes
  Su entrada en tus arenas,
  Y apénas llega, cuando llega á penas.
  Bien mi suerte lo dice;
  ¿Mas dónde halló piedad un infelice?

CLARIN.

  Dí dos, y no me dejes
  En la posada á mí cuando te quejes;
  Que si dos hemos sido
  Los que de nuestra patr

In [50]:
#@markdown ## Finetune de Calderón con 40 iteraciones adaptando un GPT2 pre-entrenado
# Remove any previous out-calderon directory before training.
!rm -rf ./out-calderon
# Fine-tune on the GPU with the finetune_calderon config. The flags after the
# config file replace its values: init_from becomes gpt2-medium (the config
# sets gpt2), max_iters 40, eval_interval 20, eval_iters 40.
!python train.py config/finetune_calderon.py --init_from='gpt2-medium' --max_iters=40 --eval_interval=20 --eval_iters=40 --device='cuda'

Overriding config with config/finetune_calderon.py:
import time

out_dir = 'out-calderon'
eval_interval = 100
eval_iters = 500
wandb_log = False # feel free to turn on
wandb_project = 'calderon'
wandb_run_name = 'ft-' + str(time.time())

dataset = 'calderon'
init_from = 'gpt2' # this is the largest GPT-2 model

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of examples per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 2
gradient_accumulation_steps = 32
max_iters = 200

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False

Overriding: init_from = gpt2-medium
Overriding: max_iters = 40
Overriding: eval_interval = 20
Overriding: eval_iters = 40
Overriding: device = cuda
tokens per iteration will be: 65,536
Initializing from OpenAI GPT-2 weights: gpt2-medium
loading weights from pretrained gpt: gpt2-medium
forcing vocab_size=50257

In [51]:
#@markdown ## Resultado con 40 iteraciones adaptando un GPT2 pre-entrenado
# Generate one sample on the CPU from the checkpoint directory out-calderon.
# NOTE(review): the saved output of this cell ends in a traceback raised inside
# model.generate; the listing is cut off before the exception type, so the
# cause is not visible. Confirm that CPU sampling of this checkpoint completes.
!python sample.py --out_dir=out-calderon --num_samples=1 --device='cpu'

Overriding: out_dir = out-calderon
Overriding: num_samples = 1
Overriding: device = cpu
number of parameters: 353.77M
No meta.pkl found, assuming GPT-2 encodings...
Traceback (most recent call last):
  File "/content/nanoGPT/sample.py", line 87, in <module>
    y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/content/nanoGPT/model.py", line 325, in generate
    logits, _ = self(idx_cond)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/nanoGPT/model.py", line 190, in forward
    x = block(x)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/nanoGPT/model.py", line 113, in forward
    x = x + self

In [45]:
#@markdown ## Entrenamos con 40 iteraciones más
# Resume training from the checkpoint in out-calderon (init_from='resume') and
# raise max_iters to 80.
# NOTE(review): the saved output looks stale. It reports eval_interval = 15 and
# 123.65M parameters, while the command passes 20 and the gpt2-medium run above
# reports 353.77M. The execution counts suggest the cells were run out of
# order; re-run top to bottom to confirm.
!python train.py config/finetune_calderon.py --init_from='resume' --device='cuda' --max_iters=80 --eval_interval=20 --eval_iters=40 

Overriding config with config/finetune_calderon.py:
import time

out_dir = 'out-calderon'
eval_interval = 100
eval_iters = 500
wandb_log = False # feel free to turn on
wandb_project = 'calderon'
wandb_run_name = 'ft-' + str(time.time())

dataset = 'calderon'
init_from = 'gpt2' # this is the largest GPT-2 model

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of examples per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 2
gradient_accumulation_steps = 32
max_iters = 200

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False

Overriding: init_from = resume
Overriding: device = cuda
Overriding: max_iters = 80
Overriding: eval_interval = 15
Overriding: eval_iters = 40
tokens per iteration will be: 65,536
Resuming training from out-calderon
number of parameters: 123.65M
num decayed parameter tensors: 50, with 124,318,464 parameters
n

In [46]:
#@markdown ## Resultado con 80 iteraciones adaptando un GPT2 pre-entrenado
# Generate one sample on the CPU from the resumed checkpoint in out-calderon.
# NOTE(review): the saved output of this cell ends in a traceback raised inside
# model.generate; the listing is cut off before the exception type, so the
# cause is not visible. Confirm that CPU sampling of this checkpoint completes.
!python sample.py --out_dir=out-calderon --num_samples=1 --device='cpu'

Overriding: out_dir = out-calderon
Overriding: num_samples = 1
Overriding: device = cpu
number of parameters: 123.65M
No meta.pkl found, assuming GPT-2 encodings...
Traceback (most recent call last):
  File "/content/nanoGPT/sample.py", line 87, in <module>
    y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/content/nanoGPT/model.py", line 325, in generate
    logits, _ = self(idx_cond)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/nanoGPT/model.py", line 190, in forward
    x = block(x)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/nanoGPT/model.py", line 112, in forward
    x = x + self

In [None]:
#@markdown ## y ya me he cansado por hoy...