<a href="https://colab.research.google.com/github/jdasam/aat3020-2023/blob/main/notebooks/2_Named_entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition
- For a given word and its context window, estimate whether the given word is part of an organization name (label `B-ORG` / `I-ORG`) or not

# 1. Download dataset
- CoNLL2003 

In [1]:
!wget https://data.deepai.org/conll2003.zip # Download dataset
!unzip conll2003.zip # Unzip dataset zip

--2023-03-30 05:59:46--  https://data.deepai.org/conll2003.zip
Resolving data.deepai.org (data.deepai.org)... 185.93.1.244, 2400:52e0:1a00::1069:1
Connecting to data.deepai.org (data.deepai.org)|185.93.1.244|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 982975 (960K) [application/zip]
Saving to: ‘conll2003.zip’


2023-03-30 05:59:47 (4.37 MB/s) - ‘conll2003.zip’ saved [982975/982975]

Archive:  conll2003.zip
  inflating: metadata                
  inflating: test.txt                
  inflating: train.txt               
  inflating: valid.txt               


## 2. Preprocess Dataset

In [2]:
# Read the whole training file at once, then split it into one string per line
with open("train.txt") as f:
  string = f.read()
dataset = string.split('\n')

dataset[:70]

['-DOCSTART- -X- -X- O',
 '',
 'EU NNP B-NP B-ORG',
 'rejects VBZ B-VP O',
 'German JJ B-NP B-MISC',
 'call NN I-NP O',
 'to TO B-VP O',
 'boycott VB I-VP O',
 'British JJ B-NP B-MISC',
 'lamb NN I-NP O',
 '. . O O',
 '',
 'Peter NNP B-NP B-PER',
 'Blackburn NNP I-NP I-PER',
 '',
 'BRUSSELS NNP B-NP B-LOC',
 '1996-08-22 CD I-NP O',
 '',
 'The DT B-NP O',
 'European NNP I-NP B-ORG',
 'Commission NNP I-NP I-ORG',
 'said VBD B-VP O',
 'on IN B-PP O',
 'Thursday NNP B-NP O',
 'it PRP B-NP O',
 'disagreed VBD B-VP O',
 'with IN B-PP O',
 'German JJ B-NP B-MISC',
 'advice NN I-NP O',
 'to TO B-PP O',
 'consumers NNS B-NP O',
 'to TO B-VP O',
 'shun VB I-VP O',
 'British JJ B-NP B-MISC',
 'lamb NN I-NP O',
 'until IN B-SBAR O',
 'scientists NNS B-NP O',
 'determine VBP B-VP O',
 'whether IN B-SBAR O',
 'mad JJ B-NP O',
 'cow NN I-NP O',
 'disease NN I-NP O',
 'can MD B-VP O',
 'be VB I-VP O',
 'transmitted VBN I-VP O',
 'to TO B-PP O',
 'sheep NN B-NP O',
 '. . O O',
 '',
 'Germany NNP B-NP B

In [3]:
from itertools import groupby

# Empty lines separate sentences: collect each run of consecutive non-empty lines as one sentence
dataset_in_sentence = []
for is_blank, lines in groupby(dataset, key=lambda line: line == ""):
  if not is_blank:
    dataset_in_sentence.append(list(lines))
dataset_in_sentence[:5]

[['-DOCSTART- -X- -X- O'],
 ['EU NNP B-NP B-ORG',
  'rejects VBZ B-VP O',
  'German JJ B-NP B-MISC',
  'call NN I-NP O',
  'to TO B-VP O',
  'boycott VB I-VP O',
  'British JJ B-NP B-MISC',
  'lamb NN I-NP O',
  '. . O O'],
 ['Peter NNP B-NP B-PER', 'Blackburn NNP I-NP I-PER'],
 ['BRUSSELS NNP B-NP B-LOC', '1996-08-22 CD I-NP O'],
 ['The DT B-NP O',
  'European NNP I-NP B-ORG',
  'Commission NNP I-NP I-ORG',
  'said VBD B-VP O',
  'on IN B-PP O',
  'Thursday NNP B-NP O',
  'it PRP B-NP O',
  'disagreed VBD B-VP O',
  'with IN B-PP O',
  'German JJ B-NP B-MISC',
  'advice NN I-NP O',
  'to TO B-PP O',
  'consumers NNS B-NP O',
  'to TO B-VP O',
  'shun VB I-VP O',
  'British JJ B-NP B-MISC',
  'lamb NN I-NP O',
  'until IN B-SBAR O',
  'scientists NNS B-NP O',
  'determine VBP B-VP O',
  'whether IN B-SBAR O',
  'mad JJ B-NP O',
  'cow NN I-NP O',
  'disease NN I-NP O',
  'can MD B-VP O',
  'be VB I-VP O',
  'transmitted VBN I-VP O',
  'to TO B-PP O',
  'sheep NN B-NP O',
  '. . O O']]

In [4]:
# Keep only the sentences that contain more than 5 tokens
filtered_dataset = []
for sentence in dataset_in_sentence:
  if len(sentence) > 5:
    filtered_dataset.append(sentence)
len(filtered_dataset)

10625

In [5]:
window_len = 2
sentence = filtered_dataset[0]

for i, word in enumerate(sentence):
  # Each line is "<token> <POS> <chunk> <NER>": the first field is the token, the last is the NER tag
  fields = word.split(' ')
  center_word, label = fields[0], fields[-1]
  print(center_word, label)
  is_organization = label in ('B-ORG', 'I-ORG')

  # Take up to `window_len` tokens on each side of the center word (fewer near the sentence edges)
  prev_words = [line.split(' ')[0] for line in sentence[max(i - window_len, 0):i]]
  next_words = [line.split(' ')[0] for line in sentence[i + 1:i + window_len + 1]]

  # Pad both sides up to `window_len`; multiplying the pad list by 0 adds nothing when a side is already full
  prev_words = ['<pad>'] * (window_len - len(prev_words)) + prev_words
  next_words = next_words + ['<pad>'] * (window_len - len(next_words))

  concatenated_words = prev_words + [center_word] + next_words
  print(concatenated_words)


EU B-ORG
['<pad>', '<pad>', 'EU', 'rejects', 'German']
rejects O
['<pad>', 'EU', 'rejects', 'German', 'call']
German B-MISC
['EU', 'rejects', 'German', 'call', 'to']
call O
['rejects', 'German', 'call', 'to', 'boycott']
to O
['German', 'call', 'to', 'boycott', 'British']
boycott O
['call', 'to', 'boycott', 'British', 'lamb']
British B-MISC
['to', 'boycott', 'British', 'lamb', '.']
lamb O
['boycott', 'British', 'lamb', '.', '<pad>']
. O
['British', 'lamb', '.', '<pad>', '<pad>']


In [7]:
import gensim.downloader

# Load the pretrained word vectors published under the name "glove-wiki-gigaword-300"
# through gensim's downloader. The cells below report a vocabulary of 400,000 entries
# and a vector size of 300.
wrd2vec = gensim.downloader.load("glove-wiki-gigaword-300")



In [8]:
len(wrd2vec)

400000

In [9]:
import torch
import numpy as np
vec_dim = 300

# Look up each window token (lower-cased) in the embedding table; tokens that are not in
# the vocabulary, such as <pad>, are represented by an all-zero vector
corresp_vectors = []
for word in concatenated_words:
  key = word.lower()
  vec = wrd2vec[key] if key in wrd2vec else np.zeros(vec_dim)
  corresp_vectors.append(vec)

# Join the per-word vectors end to end into a single flat float tensor
cat_vector = torch.tensor(np.concatenate(corresp_vectors), dtype=torch.float)
cat_vector.shape

# torch.tensor(np.concatenate(corresp_vectors), dtype=torch.float).dtype

torch.Size([1500])

In [10]:
cat_vector

tensor([ 0.4436, -0.2418,  0.2366,  ...,  0.0000,  0.0000,  0.0000])

In [11]:
# pair of data sample (input) and the label (desired output)
cat_vector, is_organization

(tensor([ 0.4436, -0.2418,  0.2366,  ...,  0.0000,  0.0000,  0.0000]), False)

# Design Model

In [70]:
import torch.nn as nn

class OrgClassifier(nn.Module):
  """Two-layer feed-forward binary classifier that outputs a probability in (0, 1).

  Args:
    input_dim: size of the flattened input vector (default 1500).
    hidden_size: number of units in the hidden layer.
    use_relu: when True, apply ReLU between the two linear layers. The default
      (False) keeps the original behavior of stacking the two linear layers
      directly; without a non-linearity in between, the stack can only
      represent a single linear map followed by the sigmoid.
  """
  def __init__(self, input_dim=1500, hidden_size=32, use_relu=False):
    super().__init__()
    self.layer1 = nn.Linear(in_features=input_dim, out_features=hidden_size)
    self.layer2 = nn.Linear(in_features=hidden_size, out_features=1)
    self.use_relu = use_relu

  def forward(self, x):
    """Map inputs of shape (..., input_dim) to probabilities of shape (..., 1)."""
    hidden = self.layer1(x)
    if self.use_relu:
      hidden = hidden.relu()
    out = self.layer2(hidden)
    return out.sigmoid()

# Smoke test: run a freshly initialized (untrained) model on the single example vector
# and check the input and output shapes
model = OrgClassifier()
out = model(cat_vector)
print(cat_vector.shape, out.shape, out)

torch.Size([1500]) torch.Size([1]) tensor([0.4901], grad_fn=<SigmoidBackward0>)


In [13]:
# Step through the forward pass by hand to inspect the intermediate values
hidden = model.layer1(cat_vector)
print(hidden)
print(hidden.shape)
hidden = torch.relu(hidden) # You have to put a non-linear operation between layers; relu replaces negative entries with 0
print(hidden)
out = model.layer2(hidden)
print(out)
print(out.shape)

tensor([-0.2491, -0.0228, -0.0574,  0.1212, -0.2563,  0.0663,  0.0213, -0.0580,
         0.0635, -0.0760,  0.1940, -0.1163,  0.2503, -0.0029, -0.0909, -0.0877,
        -0.0345,  0.0385, -0.3809,  0.2882,  0.3045,  0.2356,  0.1071,  0.2397,
         0.0620, -0.0417, -0.2147, -0.1061, -0.0287,  0.3428, -0.0850,  0.0490],
       grad_fn=<AddBackward0>)
torch.Size([32])
tensor([0.0000, 0.0000, 0.0000, 0.1212, 0.0000, 0.0663, 0.0213, 0.0000, 0.0635,
        0.0000, 0.1940, 0.0000, 0.2503, 0.0000, 0.0000, 0.0000, 0.0000, 0.0385,
        0.0000, 0.2882, 0.3045, 0.2356, 0.1071, 0.2397, 0.0620, 0.0000, 0.0000,
        0.0000, 0.0000, 0.3428, 0.0000, 0.0490], grad_fn=<ReluBackward0>)
tensor([0.0210], grad_fn=<AddBackward0>)
torch.Size([1])


In [14]:
# How nn.Linear works
out = model.layer2(hidden)

# Let's get the same value by matrix multiplication
# (the parameters involved are model.layer2.weight and model.layer2.bias)
for param in model.layer2.named_parameters(): # yields (name, parameter) pairs of the module;
# a_module.parameters() yields the parameters without their names
  print(param)

print(hidden.shape, model.layer2.weight.shape)
# Add a leading dimension so that hidden becomes a 1 x 32 matrix
hidden_mat = hidden.unsqueeze(0)
print(hidden_mat, hidden_mat.shape)
weighted_sum = torch.mm(hidden_mat, model.layer2.weight.T ) # torch.mm is stricter than torch.matmul: it only multiplies two 2-D matrices

# The same weighted sum, computed element by element in plain Python
weighted_sum_forloop = 0
for x, w in zip(hidden, model.layer2.weight[0]):
  # print(x.item(), w.item())
  weighted_input = x.item() * w.item()
  weighted_sum_forloop += weighted_input

print(weighted_sum_forloop, weighted_sum)

# Adding the bias to the weighted sum reproduces the layer's own output
print(model.layer2.bias)
final_output = weighted_sum + model.layer2.bias
print(final_output, out)

('weight', Parameter containing:
tensor([[ 0.0328, -0.1276,  0.1043,  0.0959, -0.0650,  0.0703, -0.0350, -0.0998,
          0.0711,  0.1579,  0.0287, -0.1123, -0.1141,  0.1038, -0.0670, -0.0191,
          0.0773, -0.0746, -0.1071,  0.0388, -0.1030,  0.1434,  0.1665,  0.1760,
         -0.0822,  0.1651,  0.1281,  0.0240,  0.1304,  0.0334,  0.1076,  0.0342]],
       requires_grad=True))
('bias', Parameter containing:
tensor([-0.0548], requires_grad=True))
torch.Size([32]) torch.Size([1, 32])
tensor([[0.0000, 0.0000, 0.0000, 0.1212, 0.0000, 0.0663, 0.0213, 0.0000, 0.0635,
         0.0000, 0.1940, 0.0000, 0.2503, 0.0000, 0.0000, 0.0000, 0.0000, 0.0385,
         0.0000, 0.2882, 0.3045, 0.2356, 0.1071, 0.2397, 0.0620, 0.0000, 0.0000,
         0.0000, 0.0000, 0.3428, 0.0000, 0.0490]],
       grad_fn=<UnsqueezeBackward0>) torch.Size([1, 32])
0.07585863541566704 tensor([[0.0759]], grad_fn=<MmBackward0>)
Parameter containing:
tensor([-0.0548], requires_grad=True)
tensor([[0.0210]], grad_fn=<AddBa

In [15]:
# Many torch operations exist both as a function and as a tensor method;
# the two lines below compute the same thing
relu_hidden = torch.relu(hidden) 
relu_hidden = hidden.relu()

# Same for sigmoid: the elementwise comparison shows both forms agree
torch.sigmoid(out) == out.sigmoid()

tensor([True])

# Make Dataset Class

In [16]:
class Dataset:
  """Windowed-word dataset for binary "is this token part of an organization?" tagging.

  Each sample is a center word together with `window_len` words on either side
  (padded with '<pad>' at the sentence boundaries). Indexing returns the
  concatenated word vectors of that window and a boolean label.

  Args:
    txt_fn: path of a CoNLL-style text file; one "<token> ... <NER tag>" line per
      token, with empty lines separating sentences.
    wrd2vec: word-vector lookup supporting `word in wrd2vec`, `wrd2vec[word]`
      and a `vector_size` attribute.
    window_len: number of context words taken on each side of the center word.

  Raises:
    ValueError: if `window_len` is negative.
  """
  def __init__(self, txt_fn, wrd2vec, window_len=2):
    if window_len < 0:
      raise ValueError(f"window_len must be non-negative, got {window_len}")
    self.window_len = window_len

    dataset = self.read_text_data(txt_fn)
    dataset_in_sentence = self.group_by_sentence(dataset)
    # Only sentences with more than 5 tokens are kept
    filtered_dataset = [sentence for sentence in dataset_in_sentence if len(sentence) > 5]
    self.data_in_sentence = filtered_dataset

    # for every sentence, make windowed_words pairs:
    total_windowed_words = []
    for sentence in self.data_in_sentence:
      total_windowed_words.extend(self.get_windowed_words_from_sentence(sentence))
    self.data = total_windowed_words
    self.wrd2vec = wrd2vec
    self.vec_size = wrd2vec.vector_size

  def read_text_data(self, txt_fn):
    """Return the lines of the file at `txt_fn` as a list of strings."""
    # Open the file that was asked for rather than a hard-coded "train.txt"
    with open(txt_fn) as f:
      string = f.read()
    dataset = string.split('\n')
    return dataset

  def group_by_sentence(self, dataset):
    """Group consecutive non-empty lines into sentences; empty lines are the separators."""
    dataset_in_sentence = [list(group) for k, group in groupby(dataset, lambda x: x == "") if not k]
    return dataset_in_sentence

  def get_windowed_words_from_sentence(self, sentence):
    """Return one (window_words, is_organization) pair per token of `sentence`.

    `window_words` has 2 * window_len + 1 entries: the center word surrounded by
    its neighbors, with '<pad>' filling positions that fall outside the sentence.
    `is_organization` is True when the token's last field is 'B-ORG' or 'I-ORG'.
    """
    window_len = self.window_len
    result = []
    for i, word in enumerate(sentence):
      splitted_word = word.split(' ')
      center_word = splitted_word[0]
      label = splitted_word[-1]
      is_organization = label in ['B-ORG', 'I-ORG']

      prev_index = max(i - window_len, 0)
      prev_words = [word_str.split(' ')[0] for word_str in sentence[prev_index:i]]

      next_index = i + window_len + 1
      next_words = [word_str.split(' ')[0] for word_str in sentence[i+1:next_index]]

      # We have to add padding, if number of prev words or next words are shorter than expected
      if len(prev_words) != window_len:
        prev_words = ['<pad>'] * (window_len - len(prev_words)) + prev_words

      if len(next_words) != window_len:
        next_words = next_words + ['<pad>'] * (window_len - len(next_words))

      concatenated_words = prev_words + [center_word] + next_words
      result.append( (concatenated_words, is_organization))
    return result

  def __len__(self): # number of independent data samples
    return len(self.data)

  def __getitem__(self, idx):
    """Return (window vector, label) for sample `idx`; called by dataset[idx]."""
    cat_words, label = self.data[idx]
    return self.convert_windowed_words_to_vector(cat_words), label

  def convert_windowed_words_to_vector(self, cat_words):
    """Concatenate the word vectors of `cat_words` into one flat float tensor.

    cat_words: list of strings, e.g. ['<pad>', '<pad>', 'EU', 'rejects', 'German'].
    Words are looked up lower-cased; words without a vector (such as '<pad>')
    contribute a zero vector of length `self.vec_size`.
    """
    corresp_vectors = []
    for word in cat_words:
      if word.lower() in self.wrd2vec: # if the word exists in wrd2vec vocab
        vec = self.wrd2vec[word.lower()] # call corresponding vector
      else: # there is no matching word in wrd2vec vocab, such as <pad>
        vec = np.zeros(self.vec_size) # use zero vectors for that token (word)
      corresp_vectors.append(vec)
    return torch.tensor(np.concatenate(corresp_vectors), dtype=torch.float)

dataset = Dataset("train.txt", wrd2vec)

In [17]:
len(dataset.data), dataset.data[0]

(192587, (['<pad>', '<pad>', 'EU', 'rejects', 'German'], True))

In [18]:
len(dataset), dataset[0]

(192587,
 (tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.1950,  0.2041,  0.3530]), True))

In [19]:
dataset[100]

(tensor([-0.2295,  0.3255, -0.0927,  ..., -0.3422, -0.0224,  0.1368]), False)

In [20]:
wrd2vec.vector_size

300

In [21]:
# Our method builds the training samples (windowed words, label) for any sentence in data_in_sentence
dataset.get_windowed_words_from_sentence(dataset.data_in_sentence[100])

[(['<pad>', '<pad>', 'Israel', "'s", 'Channel'], False),
 (['<pad>', 'Israel', "'s", 'Channel', 'Two'], False),
 (['Israel', "'s", 'Channel', 'Two', 'television'], True),
 (["'s", 'Channel', 'Two', 'television', 'said'], True),
 (['Channel', 'Two', 'television', 'said', 'Damascus'], False),
 (['Two', 'television', 'said', 'Damascus', 'had'], False),
 (['television', 'said', 'Damascus', 'had', 'sent'], False),
 (['said', 'Damascus', 'had', 'sent', 'a'], False),
 (['Damascus', 'had', 'sent', 'a', '"'], False),
 (['had', 'sent', 'a', '"', 'calming'], False),
 (['sent', 'a', '"', 'calming', 'signal'], False),
 (['a', '"', 'calming', 'signal', '"'], False),
 (['"', 'calming', 'signal', '"', 'to'], False),
 (['calming', 'signal', '"', 'to', 'Israel'], False),
 (['signal', '"', 'to', 'Israel', '.'], False),
 (['"', 'to', 'Israel', '.', '<pad>'], False),
 (['to', 'Israel', '.', '<pad>', '<pad>'], False)]

In [22]:
# use data loader

from torch.utils.data import DataLoader

# Serve the dataset in shuffled batches of 16 samples; drop_last=True discards a final
# batch that has fewer than 16 samples
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, drop_last=True)

# Peek at the first batch only: a pair of (stacked input vectors, stacked labels)
for batch in dataloader:
  print(batch)
  break

[tensor([[-0.0440,  0.4220, -0.0930,  ..., -0.5592, -0.4222, -0.1830],
        [-0.6842,  0.5017, -0.1678,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.6490,  0.6189,  0.0786,  ..., -0.8461,  1.1838, -0.2312],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ..., -0.3872, -0.6003,  0.1511],
        [ 0.6187,  1.1583, -0.3718,  ..., -0.4354,  0.0656, -0.6171],
        [-0.0824, -0.2031, -0.3797,  ...,  0.0000,  0.0000,  0.0000]]), tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False])]


In [23]:
input_tensors = batch[0]
labels = batch[1]

input_tensors.shape, labels.shape

(torch.Size([16, 1500]), torch.Size([16]))

In [24]:
# Check that our model can compute the input batch
pred = model(input_tensors)
pred.shape, pred

(torch.Size([16, 1]),
 tensor([[0.4930],
         [0.5074],
         [0.5197],
         [0.5250],
         [0.5112],
         [0.4849],
         [0.5063],
         [0.4905],
         [0.5217],
         [0.5167],
         [0.5183],
         [0.5045],
         [0.5121],
         [0.5083],
         [0.5047],
         [0.5007]], grad_fn=<SigmoidBackward0>))

In [25]:
labels.dtype, labels, labels.float()

(torch.bool,
 tensor([False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [26]:
print(pred)
print(pred.squeeze())

tensor([[0.4930],
        [0.5074],
        [0.5197],
        [0.5250],
        [0.5112],
        [0.4849],
        [0.5063],
        [0.4905],
        [0.5217],
        [0.5167],
        [0.5183],
        [0.5045],
        [0.5121],
        [0.5083],
        [0.5047],
        [0.5007]], grad_fn=<SigmoidBackward0>)
tensor([0.4930, 0.5074, 0.5197, 0.5250, 0.5112, 0.4849, 0.5063, 0.4905, 0.5217,
        0.5167, 0.5183, 0.5045, 0.5121, 0.5083, 0.5047, 0.5007],
       grad_fn=<SqueezeBackward0>)


In [63]:
# Calculate loss
print(pred.shape, labels.shape)

def get_binary_cross_entropy_loss(pred, label, eps=1e-8):
  """Elementwise binary cross-entropy between probabilities `pred` and 0/1 targets `label`.

  `eps` is added inside each logarithm so that a prediction of exactly 0 or 1
  does not produce log(0). No reduction is applied; the result has the
  broadcast shape of the inputs.
  """
  positive_term = -torch.log(pred+eps)
  negative_term = -torch.log(1-pred+eps)
  return label * positive_term + (1-label) * negative_term

# pred is (16, 1) while labels is (16,): squeeze pred so the two line up element by element,
# and cast the boolean labels to float so they can be used in arithmetic
loss = get_binary_cross_entropy_loss(pred.squeeze(), labels.float())
loss = loss.mean() # take the mean over the batch to get a single scalar loss
loss

torch.Size([16, 1]) torch.Size([16])


tensor(nan, device='cuda:0', grad_fn=<MeanBackward0>)

In [28]:
# Check how gradient looks like before the backpropagation
print(model.layer1.weight.grad) # print None

None


In [29]:
# backpropagate the loss 
loss.backward()

In [30]:
# Check how gradient looks like after the backpropagation
print(model.layer1.weight.grad)

tensor([[ 6.8121e-04,  1.9289e-03, -4.5691e-04,  ..., -1.3278e-03,
         -2.5170e-03,  3.4391e-04],
        [ 4.3482e-03, -7.6321e-03,  6.3110e-03,  ...,  7.8793e-03,
         -3.2008e-04, -3.6359e-06],
        [-3.9074e-03,  2.2323e-03, -1.2517e-03,  ...,  1.8737e-04,
         -1.1051e-03,  6.6041e-05],
        ...,
        [ 1.2649e-03,  1.0731e-03, -1.4647e-03,  ..., -2.0625e-03,
         -8.6817e-04,  1.1593e-03],
        [-1.1284e-03,  4.8103e-03, -6.4542e-03,  ..., -1.2788e-02,
         -3.9533e-03,  4.1493e-03],
        [-8.1297e-04,  9.8327e-05, -5.1130e-04,  ..., -9.5175e-04,
         -1.0588e-03,  3.4343e-04]])


In [31]:
model.layer1.weight.grad.shape, model.layer1.weight.shape
# Each parameter in the layer has its own gradient

(torch.Size([32, 1500]), torch.Size([32, 1500]))

In [36]:
model.layer1.weight.grad[0, :10]
# The gradient is the ratio between the change of the loss and the change of the parameter.
# If the gradient is 0.01, increasing the parameter by a small amount d
# increases the loss by approximately d * 0.01
# (a first-order approximation, so it is only accurate for small steps).

tensor([ 6.8121e-04,  1.9289e-03, -4.5691e-04,  5.8656e-04,  2.2097e-04,
        -7.3869e-04, -1.0776e-03, -1.3207e-03, -9.4426e-05, -2.8355e-03])

In [32]:
# manually update weight parameters using the gradient:
# move each weight against its gradient, scaled by a learning rate of 0.001
model.layer1.weight.data -= model.layer1.weight.grad * 0.001

In [33]:
# use torch.optim.optimizers

In [38]:
pred = model(input_tensors)
loss = get_binary_cross_entropy_loss(pred.squeeze(), labels.float())
loss = loss.mean()

print(loss.item())

0.70819091796875


In [39]:
# Reset the stored gradient of layer2.weight (by setting it to None) before backpropagating again.
# Note that only this one parameter is reset here; this cell does not touch any other parameter's .grad.
model.layer2.weight.grad = None
loss.backward()

In [40]:
model.layer2.weight.grad

tensor([[0.0185, 0.0596, 0.0494, 0.1009, 0.0147, 0.0248, 0.0316, 0.0277, 0.0260,
         0.0244, 0.0551, 0.0471, 0.0318, 0.0411, 0.0241, 0.0245, 0.0363, 0.0416,
         0.0089, 0.0313, 0.0538, 0.0156, 0.0314, 0.0422, 0.0253, 0.0629, 0.0259,
         0.0615, 0.0304, 0.0386, 0.0871, 0.0265]])

In [47]:
# Update one single weight by hand and print its value before and after the step
print(model.layer2.weight.data[0,-5])
model.layer2.weight.data[0,-5] -= model.layer2.weight.grad[0,-5] * 100 # gradient times learning rate (100 here)
print(model.layer2.weight.data[0,-5])


tensor(-0.6519)
tensor(-6.7970)


In [48]:
labels

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False])

In [49]:


pred = model(input_tensors)
loss = get_binary_cross_entropy_loss(pred.squeeze(), labels.float())
loss = loss.mean()

print(loss.item())

0.4724261462688446


In [51]:
# Training the model == updating the model's parameters

# 1. make a prediction
# 2. calculate the loss (to see how good or bad the current parameters are)
# 3. calculate the gradient of each parameter using backpropagation
# 4. update the parameters using the gradient (we call this optimization)
# Several optimizers can be used to update the parameters

# optimizers: SGD (stochastic gradient descent, the vanilla one),
#    Adam (most famous), Adadelta, Adamp

# Define the optimizer:
# select the optimizer class and hand it the parameters it should update
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# now we can update the model's parameters using
optimizer.step()

# You can also reset the gradient using the optimizer
optimizer.zero_grad()

In [71]:
from tqdm.auto import tqdm

# Use the GPU (CUDA is NVIDIA's GPU library) when one is available, otherwise fall back
# to the CPU so the cell also runs on machines/runtimes without a GPU
DEV = 'cuda' if torch.cuda.is_available() else 'cpu'
model = OrgClassifier()
model.to(DEV) # moves the model's parameters to the selected device
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_record_wo_relu = []

dataloader = DataLoader(dataset, batch_size=128, shuffle=True, drop_last=True)

# One pass over the training data, recording the loss of every batch
for batch in tqdm(dataloader):
  input_tensors, labels = batch
  # The inputs and labels have to live on the same device as the model
  input_tensors = input_tensors.to(DEV)
  labels = labels.to(DEV)
  pred = model(input_tensors)
  # squeeze(-1) removes only the trailing size-1 dimension, so the batch dimension
  # survives even for a batch of a single sample
  loss = get_binary_cross_entropy_loss(pred.squeeze(-1), labels.float())
  loss = loss.mean()
  loss.backward() # do backpropagation. This will calculate the gradient of each parameter
  optimizer.step() # This will update the parameters using the gradient
  optimizer.zero_grad() # clear the gradients so they do not accumulate into the next batch
  loss_record_wo_relu.append(loss.item())

  0%|          | 0/1504 [00:00<?, ?it/s]

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
# NOTE(review): no visible cell of this notebook defines `loss_record` (presumably the
# run with ReLU enabled), so plotting it unconditionally raises NameError on a fresh
# kernel. Plot it only when it exists.
if 'loss_record' in globals():
  ax.plot(loss_record, label='loss_record')
ax.plot(loss_record_wo_relu, label='loss_record_wo_relu')
ax.set_xlabel('training step (batch)')
ax.set_ylabel('loss')
ax.legend();

In [67]:
len(input_tensors)

16

In [62]:
torch.log(torch.tensor([1e-100]))

tensor([-inf])

In [60]:
# NaN propagates: a NaN loss produces NaN gradients during backpropagation,
# and updating with those gradients turns the weights themselves into NaN
model.layer1.weight.data

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], device='cuda:0')