In [5]:
# Machine-specific absolute locations; edit both before running on another machine.
# MINGPT_PATH is put on the module search path below, ahead of the `mingpt` imports.
MINGPT_PATH = 'C:\\Users\\KAHWLEE\\Desktop\\genai\\minGPT'
# DATA_SRC_PATH is the directory that a later cell scans for the .txt source files.
DATA_SRC_PATH = 'C:\\Users\\KAHWLEE\\Desktop\\genai\\minGPT_history_books\\data_source'

import sys

# Prepend both directories to the module search path. After these two inserts
# DATA_SRC_PATH is at index 0 and MINGPT_PATH at index 1.
sys.path.insert(0, MINGPT_PATH)
# NOTE(review): the visible cells only read .txt files from DATA_SRC_PATH;
# presumably it does not need to be importable -- confirm before removing.
sys.path.insert(0, DATA_SRC_PATH)

import re
import os
from torch.utils.data import Dataset
import os

import torch
from torch.utils.data import Dataset

from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN

# Initial call to the imported set_seed helper. A later cell calls it again with
# config.system.seed (3407), so that later call supersedes this one.
set_seed(42)

def open_n_preprocess_text(path):
    """Read a UTF-8 text file and return a lightly cleaned copy of its contents.

    Cleaning steps, applied in this order:
      1. each run of digits that has a whitespace character on both sides is
         replaced, together with those two whitespace characters, by one newline;
      2. runs of three or more newlines are collapsed to exactly two;
      3. leading and trailing whitespace is stripped.

    Args:
        path: location of the text file to read.

    Returns:
        The cleaned text as a single ``str``.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file content is not valid UTF-8.
    """
    # A context manager closes the handle deterministically; the bare
    # open(...).read() form left that to the garbage collector.
    with open(path, encoding="utf8") as source_file:
        input_text = source_file.read()

    # Remove substrings with numbers between spaces
    processed_text = re.sub(r'\s\d+\s', '\n', input_text)

    # Reduce consecutive newlines to a maximum of two
    processed_text = re.sub(r'\n{3,}', '\n\n', processed_text)

    # Remove leading and trailing whitespaces
    processed_text = processed_text.strip()

    return processed_text

def count_words(input_string):
    """Print how many whitespace-separated tokens ``input_string`` contains.

    The count is written to standard output as ``number of words: <n>``;
    the function itself returns nothing.
    """
    word_total = len(input_string.split())
    print(f"number of words: {word_total}")

In [2]:
# NOTE(review): CharDataset is defined again in a later cell of this notebook;
# once that cell runs, the later definition replaces this one.
class CharDataset(Dataset):
    """
    Character-level dataset over one long piece of text.

    Item ``i`` is a pair ``(x, y)`` of integer tensors of character ids:
    ``x`` encodes ``data[i : i + block_size]`` and ``y`` encodes the same
    window shifted one position to the right.
    """

    @staticmethod
    def get_default_config():
        """Return a new config node whose ``block_size`` is 128."""
        defaults = CN()
        defaults.block_size = 128
        return defaults

    def __init__(self, config, data):
        """Build the character vocabulary from ``data`` and keep both around.

        Args:
            config: object exposing ``block_size`` (the window length).
            data: the text to draw windows from.
        """
        self.config = config

        # Every distinct character, in sorted order, so the id assignment is
        # deterministic for a given text.
        alphabet = sorted(set(data))
        print('data has %d characters, %d unique.' % (len(data), len(alphabet)))

        self.stoi = dict(zip(alphabet, range(len(alphabet))))
        self.itos = dict(enumerate(alphabet))
        self.vocab_size = len(alphabet)
        self.data = data

    def get_vocab_size(self):
        """Number of distinct characters found in the text."""
        return self.vocab_size

    def get_block_size(self):
        """Window length taken from the config."""
        return self.config.block_size

    def __len__(self):
        """Text length minus ``block_size``."""
        window_len = self.config.block_size
        return len(self.data) - window_len

    def __getitem__(self, idx):
        """Return the ``(x, y)`` id tensors for the window starting at ``idx``."""
        # Take block_size + 1 characters: the extra one supplies the final target.
        stop = idx + self.config.block_size + 1
        window = self.data[idx:stop]
        # Map each character to its integer id.
        encoded = list(map(self.stoi.__getitem__, window))
        # Inputs drop the last id, targets drop the first.
        inputs = torch.tensor(encoded[:-1], dtype=torch.long)
        targets = torch.tensor(encoded[1:], dtype=torch.long)
        return inputs, targets

In [3]:
# List all files in the directory.
# os.listdir returns entries in arbitrary order, so sort them: the order in which
# the files are concatenated into `data` is then the same on every run and machine.
files = sorted(os.listdir(DATA_SRC_PATH))
files_read = []
for file_name in files:
    # Only .txt files are read; anything else in the folder is skipped.
    if file_name.endswith(".txt"):
        file_path = os.path.join(DATA_SRC_PATH, file_name)
        text1 = open_n_preprocess_text(file_path)
        count_words(text1)  # per-file word count
        files_read.append(text1)
# Join the cleaned files into one string (separator kept exactly as it was,
# including its trailing space).
data = '\n\n '.join(files_read)
count_words(data)  # word count of the combined text

number of words: 53270
number of words: 117283
number of words: 33401
number of words: 128890
number of words: 75539
number of words: 49475
number of words: 79785
number of words: 4628
number of words: 18976
number of words: 60574
number of words: 621821


In [6]:
def get_config():
    """Assemble the configuration tree used by this notebook.

    Returns:
        A config node with four sub-nodes attached in this order: ``system``
        (seed 3407, work_dir './out/chargpt'), ``data`` (CharDataset defaults),
        ``model`` (GPT defaults with model_type 'gpt-mini') and ``trainer``
        (Trainer defaults with learning_rate 4e-4 and num_workers 0).
    """
    root = CN()

    # system
    system_cfg = CN()
    system_cfg.seed = 3407
    system_cfg.work_dir = './out/chargpt'

    # data
    data_cfg = CharDataset.get_default_config()

    # model
    model_cfg = GPT.get_default_config()
    model_cfg.model_type = 'gpt-mini'

    # trainer
    trainer_cfg = Trainer.get_default_config()
    # the model we're using is so small that we can go a bit faster
    trainer_cfg.learning_rate = 4e-4
    # 0 workers: a non-zero value raised an error when run on the local machine
    trainer_cfg.num_workers = 0

    # Attach in the original order so the printed config lists sections identically.
    root.system = system_cfg
    root.data = data_cfg
    root.model = model_cfg
    root.trainer = trainer_cfg
    return root

In [7]:
# NOTE(review): CharDataset is also defined in an earlier cell of this notebook;
# running this cell rebinds the name to the definition below.
class CharDataset(Dataset):
    """
    Character-level dataset over one long piece of text.

    Item ``i`` is a pair ``(x, y)`` of integer tensors of character ids:
    ``x`` encodes ``data[i : i + block_size]`` and ``y`` encodes the same
    window shifted one position to the right.
    """

    @staticmethod
    def get_default_config():
        """Return a new config node whose ``block_size`` is 128."""
        defaults = CN()
        defaults.block_size = 128
        return defaults

    def __init__(self, config, data):
        """Build the character vocabulary from ``data`` and keep both around.

        Args:
            config: object exposing ``block_size`` (the window length).
            data: the text to draw windows from.
        """
        self.config = config

        # Every distinct character, in sorted order, so the id assignment is
        # deterministic for a given text.
        alphabet = sorted(set(data))
        print('data has %d characters, %d unique.' % (len(data), len(alphabet)))

        self.stoi = dict(zip(alphabet, range(len(alphabet))))
        self.itos = dict(enumerate(alphabet))
        self.vocab_size = len(alphabet)
        self.data = data

    def get_vocab_size(self):
        """Number of distinct characters found in the text."""
        return self.vocab_size

    def get_block_size(self):
        """Window length taken from the config."""
        return self.config.block_size

    def __len__(self):
        """Text length minus ``block_size``."""
        window_len = self.config.block_size
        return len(self.data) - window_len

    def __getitem__(self, idx):
        """Return the ``(x, y)`` id tensors for the window starting at ``idx``."""
        # Take block_size + 1 characters: the extra one supplies the final target.
        stop = idx + self.config.block_size + 1
        window = self.data[idx:stop]
        # Map each character to its integer id.
        encoded = list(map(self.stoi.__getitem__, window))
        # Inputs drop the last id, targets drop the first.
        inputs = torch.tensor(encoded[:-1], dtype=torch.long)
        targets = torch.tensor(encoded[1:], dtype=torch.long)
        return inputs, targets

In [8]:
# Build the configuration with get_config(); no command-line overrides are applied in this cell.
config = get_config()
print(config)
# NOTE(review): setup_logging is imported from mingpt.utils and is not visible here;
# presumably it records the run settings under config.system.work_dir -- confirm.
setup_logging(config)
# Re-seed with the configured seed (3407); this supersedes the earlier set_seed(42).
set_seed(config.system.seed)

# construct the training dataset from the combined text built in the loading cell
train_dataset = CharDataset(config.data, data)

system:
    seed: 3407
    work_dir: ./out/chargpt
data:
    block_size: 128
model:
    model_type: gpt-mini
    n_layer: None
    n_head: None
    n_embd: None
    vocab_size: None
    block_size: None
    embd_pdrop: 0.1
    resid_pdrop: 0.1
    attn_pdrop: 0.1
trainer:
    device: auto
    num_workers: 0
    max_iters: None
    batch_size: 64
    learning_rate: 0.0004
    betas: (0.9, 0.95)
    weight_decay: 0.1
    grad_norm_clip: 1.0

data has 4548434 characters, 115 unique.


In [9]:
# construct the model
# Vocabulary size and block size are taken from the dataset so the model and the
# dataset agree on the number of character ids and on the window length.
config.model.vocab_size = train_dataset.get_vocab_size()
config.model.block_size = train_dataset.get_block_size()
model = GPT(config.model)

number of parameters: 2.72M


In [14]:
ckpt_path = "C:\\Users\\KAHWLEE\\Desktop\\genai\\minGPT_history_books\\out\\chargpt\\model.pt"  # Provide the correct path to your saved model
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; load_state_dict then copies the values into the model's
# own parameters, so the model's device is unaffected.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load(ckpt_path, map_location="cpu")

In [15]:
# Copy the checkpoint weights into the freshly constructed model. As the last
# expression of the cell, the result (the key-matching report) is displayed.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [16]:
# No training call appears in the visible cells; this Trainer instance is what
# gpt_prompt reads `trainer.device` from.
trainer = Trainer(config.trainer, model, train_dataset)

running on device cpu


In [17]:
# Put the module in evaluation mode (turns the Dropout layers off) before sampling.
# As the last expression of the cell, the returned module is also displayed.
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(115, 192)
    (wpe): Embedding(128, 192)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=192, out_features=576, bias=True)
          (c_proj): Linear(in_features=192, out_features=192, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=192, out_features=768, bias=True)
          (c_proj): Linear(in_features=768, out_features=192, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_featu

In [18]:
def gpt_prompt(context = "kerajaan melaka", max_new_tokens = 500, temperature = 1.0, do_sample = True, top_k = 10):
    """Generate a continuation of ``context`` with the loaded model and print it.

    Relies on the notebook globals ``train_dataset`` (character <-> id tables),
    ``model`` and ``trainer`` (for the target device).

    Args:
        context: prompt text; every character must be a key of
            ``train_dataset.stoi``.
        max_new_tokens: passed as the second positional argument of
            ``model.generate`` (500 by default, as before); presumably the
            number of characters to generate -- confirm against minGPT.
        temperature: forwarded unchanged to ``model.generate``.
        do_sample: forwarded unchanged to ``model.generate``.
        top_k: forwarded unchanged to ``model.generate``.

    Returns:
        None. The decoded sequence returned by ``model.generate`` is printed.

    Raises:
        KeyError: if ``context`` contains characters that are not in the
            dataset vocabulary; the message lists all of them.
    """
    print("generating ...")
    stoi = train_dataset.stoi
    # Report every out-of-vocabulary character at once instead of failing on
    # the first one with a bare single-character KeyError.
    unknown = sorted({ch for ch in context if ch not in stoi})
    if unknown:
        raise KeyError(f"context contains characters not in the training vocabulary: {unknown!r}")
    with torch.no_grad():
        # [None, ...] adds a leading dimension of size 1 in front of the id sequence.
        x = torch.tensor([stoi[ch] for ch in context], dtype=torch.long)[None, ...].to(trainer.device)
        y = model.generate(x, max_new_tokens, temperature=temperature, do_sample=do_sample, top_k=top_k)[0]
        completion = ''.join(train_dataset.itos[int(i)] for i in y)
        print(completion)

In [19]:

# Sample a continuation for a prompt other than the function's default.
# NOTE(review): the recorded output below starts with the prompt but is otherwise
# not readable text -- worth confirming that the checkpoint was trained with the
# same character vocabulary as train_dataset.
gpt_prompt(context = "kerajaan johor")

generating ...
kerajaan johorh dkekOakOaéLe ae Oa areagO 0ak a Okaa VaO k0 daaa Vg0khOé0hn eb ann0 r0e  Ianah dnr♦O a OOaOOI a 0 OaOa kOuee aLOe 0 VV aO ahnOn kea  kV neagL IaV kOana  aG’eegekre Oanraag V n  00IaaGagraG0 egOga d Ike Ir0Onean0  eVr   a OgagaV0aeganOO dI0VgLaakake nVO a 0IVrO  kaOVnrVr a nanr V ann0ga eaGV0rn IOna 0aa   deag0rr e eg Vg0anegILngLaOaea kOgaa nOaGahaeeahOa e Va ereVnOVran eenb0nn IVOkaera  ah kan d Ia ah VkOaaan0 aOeaer0ak krVanakr0 kag0 0aGea anearOnaeg0Vna O OVg Oa nVgrOnnrV OaG kVkanaka a  k 
