<a href="https://colab.research.google.com/github/kpseth78/Language-Modeling/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from torch import nn, optim
import torch
from torch.utils.data import DataLoader, Dataset
import nltk
from collections import Counter, defaultdict
from torchsummary import summary

import numpy as np
from datetime import datetime

# Fetch the NLTK resources used further down: the 'punkt' tokenizer data
# (presumably what nltk.word_tokenize needs -- confirm) and the English
# stop-word list read by the dataset class.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Show the attached GPU, driver/CUDA versions and current memory use.
!nvidia-smi

Wed Feb 17 04:33:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Record the installed PyTorch build so results can be tied to a version.
print(torch.__version__)

1.7.0+cu101


In [None]:
# Report the CUDA compiler (nvcc) release available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
# Mount Google Drive and make the project folder the working directory.
from google.colab import drive
import os

# Absolute mount point: the chdir below is built from it, so this cell can be
# re-run safely even after the working directory has already been changed
# (a relative "drive/My Drive/..." path only resolves from the parent of the mount).
DRIVE_MOUNT = '/content/drive'
drive.mount(DRIVE_MOUNT, force_remount=True)
os.chdir(os.path.join(DRIVE_MOUNT, 'My Drive', 'NLP_Project2'))


Mounted at /content/drive


In [None]:
ls

NLP-Project2.ipynb  test50000.txt  train50000.txt  valid50000.txt


In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Feb 17 04:36:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
class LSTModel(nn.Module):
  """Word-level language model: embedding -> stacked LSTM -> linear layer over the vocabulary.

  `parameters` is a dict that must provide 'n_vocab', 'embedding_size',
  'lstm_size', 'num_layers', 'lstm_drop' and 'seq_len'.

  NOTE(review): the LSTM is built with PyTorch's default batch_first=False, so
  it treats dim 0 of its input as the time axis and dim 1 as the batch axis.
  init_state() sizes dim 1 of the state with params['seq_len'], so forward()
  only accepts X whose second dimension equals seq_len. If X is laid out as
  (batch, seq_len) -- which is what a DataLoader over fixed-length windows
  would yield -- the recurrence runs across the batch axis instead of across
  tokens. Confirm whether batch_first=True (with a batch-sized state) was intended.
  """

  def __init__(self, parameters):
    super(LSTModel, self).__init__()

    # Keep the hyper-parameter dict; init_state() reads from it later.
    self.params = parameters

    self.embedding = nn.Embedding(
        num_embeddings=self.params['n_vocab'],
        embedding_dim=self.params['embedding_size'],
    )

    # Inter-layer dropout has no effect on a single-layer LSTM and only makes
    # PyTorch emit a warning, so it is disabled in that case.
    lstm_dropout = self.params['lstm_drop'] if self.params['num_layers'] > 1 else 0.0

    # The LSTM consumes embedding vectors, so its input width is the embedding
    # size (it used to be lstm_size, which failed whenever the two differed).
    self.lstm = nn.LSTM(input_size=self.params['embedding_size'],
                        hidden_size=self.params['lstm_size'],
                        num_layers=self.params['num_layers'],
                        dropout=lstm_dropout)

    self.out = nn.Linear(self.params['lstm_size'], self.params['n_vocab'])


  def forward(self, X, prev_state):
    """Run one step of the model.

    Args:
      X: integer tensor of token ids; its second dimension must equal
        params['seq_len'] to match the state built by init_state().
      prev_state: (hidden, cell) tuple as returned by init_state() or by a
        previous call to forward().

    Returns:
      (logits, state): logits have X's shape plus a trailing 'n_vocab'
      dimension; state is the LSTM's new (hidden, cell) tuple.
    """
    embed = self.embedding(X)
    lstm_out, state_h = self.lstm(embed, prev_state)
    logits = self.out(lstm_out)
    return logits, state_h

  def init_state(self):
    """Return a zeroed (hidden, cell) pair of shape (num_layers, seq_len, lstm_size).

    The tensors are created on the device the model's weights live on, so they
    can be fed to forward() directly; moving them again with .to(device) is a no-op.
    """
    shape = (self.params['num_layers'], self.params['seq_len'], self.params['lstm_size'])
    device = next(self.parameters()).device
    return (torch.zeros(*shape, device=device),
            torch.zeros(*shape, device=device))

In [None]:
class dataset(Dataset):
  """Sliding-window next-token dataset built from a text file.

  The file is tokenized line by line with nltk.word_tokenize, English stop
  words are dropped, and lines containing '<start_doc>' or '</start_doc>' are
  skipped. Every token is then mapped to an integer id.

  Args:
    filepath: path of the text file to read.
    params: dict providing 'seq_len' and, when a vocabulary is built, 'threshold'.
    training: optional dict with keys 'intRepr' and 'tokenRepr' taken from an
      already-built dataset. When given, that vocabulary is reused instead of
      building a new one (and `self.vocab` is not set).

  Attributes:
    corpus: list of token ids for the whole file.
    total_words: number of tokens kept from the file.
    vocab: Counter of the kept tokens (only when the vocabulary was built here).
      '<unk>' is always a member, so len(vocab) is the number of distinct ids.
    intRepr / tokenRepr: token -> id and id -> token mappings. Id 0 is
      reserved for '<unk>'; real tokens start at 1.
  """

  # Placeholder for out-of-vocabulary words; it always owns id 0.
  _UNK = '<unk>'

  def __init__(self, filepath, params, training:dict=None):
    self.params = params

    tokens = []
    for line_tokens in self._read_tokens(filepath):
      tokens.extend(line_tokens)
    self.total_words = len(tokens)

    if not training:
      # Seeding the counter with '<unk>' makes it the first key, hence id 0.
      counts = Counter({self._UNK: 1})
      counts.update(tokens)
      self.vocab, self.intRepr, self.tokenRepr = self._index_vocab(
          counts, self.params['threshold'])
    else:
      self.intRepr = training['intRepr']
      self.tokenRepr = training['tokenRepr']

    # .get() rather than [] so that looking up an unknown token does not
    # insert it into the (defaultdict) mapping and silently grow it.
    lookup = self.intRepr.get
    self.corpus = [lookup(token, 0) for token in tokens]

  @staticmethod
  def _read_tokens(filepath):
    """Yield the stop-word-filtered token list of each usable line of `filepath`."""
    stopwords = set(nltk.corpus.stopwords.words('english'))
    with open(filepath, 'r') as file:
      for i, line in enumerate(file):
        if (i + 1) % 5000 == 0:
          print(f'Processed {i + 1:,} lines')
        if '<start_doc>' in line or '</start_doc>' in line:
          continue
        yield [token for token in nltk.word_tokenize(line) if token not in stopwords]

  @staticmethod
  def _index_vocab(counts, threshold):
    """Drop tokens seen fewer than `threshold` times and number the rest.

    '<unk>' is kept regardless of its count and always receives id 0, so the
    fallback id for unknown words never collides with a real token.

    Returns:
      (vocab, intRepr, tokenRepr): the filtered Counter, a defaultdict mapping
      token -> id (unknown tokens default to 0) and the inverse dict id -> token.
    """
    vocab = Counter({'<unk>': counts.get('<unk>', 0)})
    for tk, num in counts.items():
      if tk != '<unk>' and num >= threshold:
        vocab[tk] = num
    intRepr = defaultdict(int, ((tk, idx) for idx, tk in enumerate(vocab)))
    tokenRepr = {idx: tk for tk, idx in intRepr.items()}
    return vocab, intRepr, tokenRepr

  def getRepr(self):
    """Return the (token -> id, id -> token) mappings, e.g. to build a validation set."""
    return self.intRepr, self.tokenRepr

  def __len__(self):
    # One sample per window start; clamp so a file shorter than seq_len
    # yields an empty dataset instead of a negative length.
    return max(0, self.total_words - self.params['seq_len'])

  def __getitem__(self, index):
    """Return (input, target): seq_len token ids and the same window shifted by one."""
    if not 0 <= index < len(self):
      raise IndexError(f'index {index} out of range for dataset of length {len(self)}')
    seq_len = self.params['seq_len']
    return (
            torch.tensor(self.corpus[index:index+seq_len]),
            torch.tensor(self.corpus[index+1:index+seq_len+1]),
        )

In [None]:
# test = dataset('test50000.txt', parameters, {'intRepr': intrepr, 'tokenRepr': tokenrepr})

In [None]:
# Hyper-parameters shared by the dataset, the model and the training loop.
parameters = {
    # corpus parameters
    'threshold': 2,
    'max_tokens': 500000000,

    # model parameters
    'lstm_size': 256,
    'embedding_size': 256,
    'num_layers': 2,
    'lstm_drop': 0.5,
    'lstm_init': False,
    'seq_len': 32,
    
    # fit parameters
    'knn': -1,
    'epochs': 20,
    'grad_clip': 2.0,
    'lr': 0.005,
    'lr_decay': 0,
    'regularization': 0,
    'batch_size': 32,
}

# The validation set reuses the vocabulary built from the training file.
train = dataset('train50000.txt', parameters)
intrepr, tokenrepr = train.getRepr()
valid = dataset('valid50000.txt', parameters, {'intRepr': intrepr, 'tokenRepr': tokenrepr})
# test = dataset('test50000.txt', parameters, {'intRepr': intrepr, 'tokenRepr': tokenrepr})

trainloader = DataLoader(train, batch_size=parameters['batch_size'], shuffle=True)
# Validation is not shuffled: the evaluation loop carries the recurrent state
# from one batch to the next, so a random batch order would make the reported
# validation loss differ from run to run.
validloader = DataLoader(valid, batch_size=parameters['batch_size'], shuffle=False)
# testloader = DataLoader(test, batch_size=parameters['batch_size'], shuffle=True)
print('Text loaded')

# The vocabulary size is only known once the training file has been read.
parameters['n_vocab'] = len(train.vocab)
model = LSTModel(parameters)
model.train()
model.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=parameters['lr'])
# A decay factor of 0 means "no learning-rate schedule".
if parameters['lr_decay'] != 0:
  scheduler = optim.lr_scheduler.ExponentialLR(optimizer, parameters['lr_decay'])
else:
  scheduler = None

Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 5,000 lines
Processed 

In [None]:
# Parameter-count check for a hypothetical 20,000-word vocabulary.
#
# The settings are derived from the shared `parameters` dict instead of a
# second hand-copied literal (the copy had an empty 'batch_size' entry, which
# is a syntax error). Separate names are used so this cell no longer replaces
# the global `parameters` / `model`: the optimizer is bound to the original
# model's weights, so overwriting `model` here would break the training cell.
sizing_params = {**parameters, 'n_vocab': 20000}
sizing_model = LSTModel(sizing_params)
total_params = sum(p.numel() for p in sizing_model.parameters() if p.requires_grad)
print(total_params)

11312672


In [None]:
# Training / validation loop.
#
# The run is deliberately capped for quick experiments; use
# parameters['epochs'] for N_EPOCHS to do a full run.
N_EPOCHS = 3
MAX_TRAIN_BATCHES = 5000   # stop each training epoch after this many batches
LOG_EVERY = 1000           # record/print the training loss every N batches

times = []       # wall-clock duration of each epoch (timedelta)
trainLoss = []   # training loss sampled every LOG_EVERY batches
validLoss = []   # mean validation loss, one entry per epoch

for i in range(N_EPOCHS):
  epoch_start = datetime.now()

  # Re-enable dropout: the validation pass below switches the model to eval mode.
  model.train()
  state_h, state_c = model.init_state()
  state_h, state_c = state_h.to(device), state_c.to(device)

  for batch, (x, y) in enumerate(trainloader):
    if batch == MAX_TRAIN_BATCHES:
      break
    x, y = x.to(device), y.to(device)
    optimizer.zero_grad()
    output, (state_h, state_c) = model(x, (state_h, state_c))
    # CrossEntropyLoss expects the class dimension at index 1.
    loss = loss_function(output.transpose(1, 2), y)

    loss.backward()

    # Cut the graph so the next batch does not backpropagate into this one.
    state_h = state_h.detach()
    state_c = state_c.detach()

    if parameters['grad_clip']:
      torch.nn.utils.clip_grad_norm_(model.parameters(), parameters['grad_clip'])
    optimizer.step()

    if batch % LOG_EVERY == 0:
      trainLoss.append(loss.item())
      print({ 'epoch': i, 'batch': batch, 'loss': loss.item()})

  # ExponentialLR is documented as decaying once per epoch; stepping it after
  # every batch would shrink the learning rate thousands of times per epoch.
  if scheduler:
    scheduler.step()

  # Validation: dropout off and autograd disabled. Without no_grad the carried
  # state would keep every batch's graph alive for the whole pass.
  model.eval()
  epochLoss = []
  state_h, state_c = model.init_state()
  state_h, state_c = state_h.to(device), state_c.to(device)
  with torch.no_grad():
    for batch, (x, y) in enumerate(validloader):
      x, y = x.to(device), y.to(device)
      output, (state_h, state_c) = model(x , (state_h, state_c))
      loss = loss_function(output.transpose(1, 2), y)
      epochLoss.append(loss.item())
  # Append (not assign) so the per-epoch history list is preserved.
  validLoss.append(np.average(epochLoss))
  print({'epoch': i, 'valid loss': validLoss[-1]})

  epoch_end = datetime.now()
  delta = epoch_end - epoch_start
  times.append(delta)
  print({'epoch time': delta, })

In [None]:
# Fetch the vocabulary mappings from the training dataset. The dataset object
# is bound to `train`; there is no `data` variable, so the old call raised NameError.
intrepr, tokenrepr = train.getRepr()

NameError: ignored

In [None]:
# Total training time in minutes. total_seconds() is used because
# timedelta.seconds drops the microseconds (and any whole days) of each epoch.
sum(elapsed.total_seconds() for elapsed in times) / 60

12.366666666666667

In [None]:
# Sanity-check the batch layout using the existing training loader.
# One batch is enough: printing the shape of every batch floods the output,
# and rebuilding `trainloader` from the undefined name `data` raised NameError.
x, y = next(iter(trainloader))
print(x.shape)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size([64, 10])
torch.Size(

In [None]:
# Peek at the first 100 token ids of the training corpus. The dataset's
# __getitem__ only accepts integer indices, so the underlying list is sliced.
train.corpus[:100]

TypeError: ignored