# Neural Network Generator using Pytorch




In [1]:
#Install Torch
!pip install torch



In [2]:
#Install PDF processing library
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [16]:
# Load the CV PDF and extract text
import pdfplumber

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return ''.join(pages)

pdf_path = "about.pdf"
cv_text = extract_text_from_pdf(pdf_path)[:125]
print (cv_text)

9/24/24, 9:23 AM About Me - Kaushik Roy
About Me
I am Kaushik Roy, a Ph.D. candidate at the Artificial Intelligence
Institute


In [17]:
#data loading code
chars = list(set(cv_text))

class Tokenizer(object):

  def __init__(self,
               tokens = None):

    self.tokens = tokens
    self.n_tokens = len(tokens)

  def encode(self,
             text):

    text_chars = list(text)
    return ([self.tokens.index(c) for c in text_chars])

  def decode(self,
             text_encoding):

    return ''.join([self.tokens[encoding] for encoding in text_encoding])

import torch
from random import sample, shuffle

class Dataloader(object):

  def __init__(self,
               tokenizer = None,
               text = None):

    self.context_size = len(list(text))
    X, Y = [], []
    for t in range(self.context_size-1):

      x, y = tokenizer.encode(text[:t+1]), tokenizer.encode(text[t+1])
      X += [x]; Y += [y[0]]

    self.data = list([list(item) for item in zip(X,Y)])

  def get_batch(self,
                n = None):

    if n is None:

      shuffle(self.data)
      return self.data

    else:

      batch = sample(self.data,n)
      return batch

In [31]:
#neural network code
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

class generator(nn.Module):

  def __init__(self,
               n_tokens = None,
               emb_size = None,
               context_size = None,
               n_layers = 2,
               h_size = 100):

    super().__init__()

    self.n_tokens = n_tokens
    self.emb_size = emb_size
    self.context_size = context_size
    self.n_layers = n_layers
    self.h_size = h_size

    self.embeddings = nn.Embedding(self.n_tokens, self.emb_size)
    self.pos_embeddings = nn.Embedding(self.context_size, self.emb_size)

    self.fc1 = nn.Linear(self.emb_size,self.h_size,bias=False)
    self.fc2 = nn.Linear(self.h_size,self.h_size,bias=False)
    self.head = nn.Linear(self.h_size,self.n_tokens)

  def forward(self,
              token_encodings):

    n_tokens = len(token_encodings)
    token_encodings = torch.tensor(token_encodings)
    token_encodings.to(device)
    token_embeddings = self.embeddings(token_encodings)
    pos_embeddings = self.pos_embeddings(torch.arange(n_tokens))
    token_embeddings += pos_embeddings

    reps = token_embeddings
    reps = F.leaky_relu(self.fc1(reps))
    reps = F.leaky_relu(self.fc2(reps))
    reps = self.head(reps)

    logits = reps[-1]
    return logits

  def generate(self,
               x):

    for i in range(100):

      x = x[:self.context_size]
      logits = self(x)
      logits = F.softmax(logits,dim=-1)
      next_id = torch.multinomial(logits,num_samples = 1)
      x = x + next_id.tolist()

    return tokenizer.decode(x)


  def train(self,
            data_loader):

    optimizer = torch.optim.AdamW(self.parameters())

    for i in tqdm(range(20)):

      batch = data_loader.get_batch()
      n_batch = len(batch)
      loss = F.cross_entropy

      batch_loss = 0.0
      for item in batch:

        x, y = item[0], item[1]
        logits = self(x)
        targets = [0.0]*self.n_tokens; targets[y] = 1.0
        targets = torch.tensor(targets)
        batch_loss += loss(logits, targets)

      batch_loss /= n_batch
      print (batch_loss.item())
      batch_loss.backward()
      optimizer.step()
      optimizer.zero_grad()

In [32]:
#set up trainer
tokenizer = Tokenizer(tokens=chars)
data_loader = Dataloader(tokenizer, cv_text)
n_tokens = tokenizer.n_tokens
emb_size = len(data_loader.get_batch())
context_size = len(data_loader.get_batch())

text_generator1 = generator(n_tokens = n_tokens,
                            emb_size = emb_size,
                            context_size = context_size)

In [33]:
#train the network
import torch
device = ('cuda' if torch.cuda.is_available() else 'cpu')

text_generator1.train(data_loader)

 10%|█         | 2/20 [00:00<00:02,  7.79it/s]

3.625986099243164
3.569924831390381


 20%|██        | 4/20 [00:00<00:01,  8.09it/s]

3.5167958736419678
3.4650580883026123


 25%|██▌       | 5/20 [00:00<00:01,  8.00it/s]

3.413957118988037
3.3625783920288086


 40%|████      | 8/20 [00:01<00:01,  7.97it/s]

3.310307025909424
3.2565419673919678


 50%|█████     | 10/20 [00:01<00:01,  8.09it/s]

3.200972318649292
3.143232583999634


 60%|██████    | 12/20 [00:01<00:00,  8.20it/s]

3.083127021789551
3.020684003829956


 70%|███████   | 14/20 [00:01<00:00,  7.93it/s]

2.9557526111602783
2.888597249984741


 80%|████████  | 16/20 [00:02<00:00,  8.16it/s]

2.81943416595459
2.7485013008117676


 90%|█████████ | 18/20 [00:02<00:00,  8.03it/s]

2.6762807369232178
2.603827953338623


100%|██████████| 20/20 [00:02<00:00,  7.99it/s]

2.5314013957977295
2.4595391750335693





In [35]:
#inference code
import time
import sys
for o in range(1):

  prompt = cv_text[:torch.randint(1,len(list(cv_text)),(1,))]
  print ('prompt: ', prompt)
  generated_tokens = text_generator1.generate(tokenizer.encode(prompt))
  print ('Generating the prompt and the auto-completed text below ... ')
  for token in generated_tokens:

    sys.stdout.write(token)
    sys.stdout.flush()
    time.sleep(0.03)

prompt:  9/24/24, 9:23 AM About Me - Kaushik Roy
About Me
I am Kaushik Roy, a Ph.D. 
Generating the prompt and the auto-completed text below ... 
9/24/24, 9:23 AM About Me - Kaushik Roy
About Me
I am Kaushik Roy, a Ph.D. h.2lKilt s io
KyedAyIiiti otin4tes khu,:4:, l24ank

# Polynomial Generator using Pytorch

In [43]:
#polynomial generator code

import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

class moment_generator(nn.Module):

  def __init__(self,
               n_tokens = None,
               emb_size = None,
               context_size = None,
               moment_order = None):

    super().__init__()

    self.n_tokens = n_tokens
    self.emb_size = emb_size
    self.context_size = context_size
    self.moment_order = moment_order

    self.embeddings = nn.Embedding(self.n_tokens, self.emb_size)
    self.pos_embeddings = nn.Embedding(self.context_size, self.emb_size)
    self.head = nn.Linear(self.emb_size,self.n_tokens)

  def forward(self,
              token_encodings):

    n_tokens = len(token_encodings)
    token_encodings = torch.tensor(token_encodings)
    token_encodings.to(device)
    token_embeddings = self.embeddings(token_encodings)
    pos_embeddings = self.pos_embeddings(torch.arange(n_tokens))
    token_embeddings += pos_embeddings

    moments = torch.row_stack([torch.mean(torch.pow(token_embeddings,k),dim=0) for k in range(self.moment_order)])
    logits = self.head(moments)[-1]
    return logits

  def generate(self,
               x):

    for i in range(100):

      x = x[:self.context_size]
      logits = self(x)
      logits = F.softmax(logits,dim=-1)
      next_id = torch.multinomial(logits,num_samples = 1)
      x = x + next_id.tolist()

    return tokenizer.decode(x)

  def train(self,
            data_loader):

    optimizer = torch.optim.AdamW(self.parameters())

    acc_loss = None
    for i in range(20):

      batch = data_loader.get_batch()
      n_batch = len(batch)
      loss = F.cross_entropy

      batch_loss = 0.0
      for item in batch:

        x, y = item[0], item[1]
        logits = self(x)
        targets = [0.0]*self.n_tokens; targets[y] = 1.0
        targets = torch.tensor(targets)
        batch_loss += loss(logits, targets)

      batch_loss /= n_batch
      acc_loss = batch_loss
      batch_loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    print (acc_loss.item())

In [44]:
#polynomial function trainer setup
tokenizer = Tokenizer(tokens=chars)
data_loader = Dataloader(tokenizer, cv_text)
n_tokens = tokenizer.n_tokens
emb_size = len(data_loader.get_batch())
context_size = len(data_loader.get_batch())

text_generator2 = moment_generator(n_tokens = n_tokens,
                                   emb_size = emb_size,
                                   context_size = context_size,
                                   moment_order = 3)

In [45]:
#polynomial function training
from tqdm import tqdm
for k in tqdm(range(20)):
  text_generator2.train(data_loader)

  5%|▌         | 1/20 [00:02<00:40,  2.15s/it]

3.1688625812530518


 10%|█         | 2/20 [00:04<00:39,  2.17s/it]

2.867598533630371


 15%|█▌        | 3/20 [00:06<00:36,  2.16s/it]

2.6762983798980713


 20%|██        | 4/20 [00:08<00:34,  2.15s/it]

2.5302839279174805


 25%|██▌       | 5/20 [00:11<00:37,  2.51s/it]

2.4144108295440674


 30%|███       | 6/20 [00:13<00:33,  2.40s/it]

2.3230040073394775


 35%|███▌      | 7/20 [00:16<00:30,  2.32s/it]

2.2442479133605957


 40%|████      | 8/20 [00:18<00:27,  2.27s/it]

2.1711087226867676


 45%|████▌     | 9/20 [00:20<00:24,  2.23s/it]

2.104647159576416


 50%|█████     | 10/20 [00:22<00:23,  2.32s/it]

2.041882038116455


 55%|█████▌    | 11/20 [00:25<00:22,  2.48s/it]

1.981392502784729


 60%|██████    | 12/20 [00:27<00:19,  2.38s/it]

1.925990343093872


 65%|██████▌   | 13/20 [00:30<00:16,  2.32s/it]

1.874157428741455


 70%|███████   | 14/20 [00:32<00:13,  2.27s/it]

1.8272738456726074


 75%|███████▌  | 15/20 [00:34<00:11,  2.23s/it]

1.7830777168273926


 80%|████████  | 16/20 [00:37<00:09,  2.39s/it]

1.74057936668396


 85%|████████▌ | 17/20 [00:39<00:07,  2.42s/it]

1.6986557245254517


 90%|█████████ | 18/20 [00:41<00:04,  2.35s/it]

1.6586095094680786


 95%|█████████▌| 19/20 [00:43<00:02,  2.30s/it]

1.6233717203140259


100%|██████████| 20/20 [00:46<00:00,  2.31s/it]

1.5911303758621216





In [48]:
#inference code
import time
import sys
for o in range(1):

  prompt = cv_text[:torch.randint(1,len(list(cv_text)),(1,))]
  print ('prompt: ', prompt)
  generated_tokens = text_generator2.generate(tokenizer.encode(prompt))
  print ('Generating the prompt and the auto-completed text below ... ')
  for token in generated_tokens:

    sys.stdout.write(token)
    sys.stdout.flush()
    time.sleep(0.03)

prompt:  9/24/24, 9:23 AM About Me - Kaushik Roy
About Me
I am Kaushik 
Generating the prompt and the auto-completed text below ... 
9/24/24, 9:23 AM About Me - Kaushik Roy
About Me
I am Kaushik k  aoPh h.RRh,o h eahRia ct l  fatsacIihtini eecIiseiieh netao
