<a href="https://colab.research.google.com/github/kushagragpt99/NLP_practice/blob/master/nlp_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

<torch._C.Generator at 0x7f2e1f018750>

In [0]:
lin = nn.Linear(5, 3) # maps from R^5 to R^3, parameters A, b
data = autograd.Variable( torch.randn(2, 5) ) # data is 2x5.  A maps from 5 to 3... can we map "data" under A?
print(lin(data))

tensor([[-1.0811, -0.3364, -0.1754],
        [-0.4088,  0.6282,  0.7178]], grad_fn=<AddmmBackward>)


In [0]:
data = [ ("me gusta comer en la cafeteria".split(), "SPANISH"),
         ("Give it to me".split(), "ENGLISH"),
         ("No creo que sea una buena idea".split(), "SPANISH"),
         ("No it is not a good idea to get lost at sea".split(), "ENGLISH") ]

test_data = [ ("Yo creo que si".split(), "SPANISH"),
              ("it is lost on me".split(), "ENGLISH")]



In [22]:
word_ix={}
for sentence,_ in data+test_data:
  for word in sentence:
    if(word not in word_ix):
      word_ix[word]=len(word_ix)
VOCAB_SIZE=len(word_ix)
NO_LABEL=2
print(word_ix)
label_ix={}
label_ix['SPANISH']=0
label_ix['ENGLISH']=1

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


In [0]:
class BOWclassifier(nn.Module):
    def __init__(self, vocab_size, no_labels):
      super(BOWclassifier, self).__init__()
      self.linear=nn.Linear(vocab_size, no_labels)
      
    def forward(self, bow_vec):
      return F.log_softmax(self.linear(bow_vec))


In [0]:
def make_bowvec(sent, word_ix):
  bow_vec=torch.zeros(VOCAB_SIZE)
  for word in sent:
    bow_vec[word_ix[word]]+=1
  return bow_vec.view(1,-1)
  
def make_target(word, label_ix):
  return torch.LongTensor([label_ix[word]])

In [25]:
print(label_ix)
for x,y in data:
    bow_vec=autograd.Variable(make_bowvec(x,word_ix))
    label=autograd.Variable(make_target(y, label_ix))
    print(label)

{'SPANISH': 0, 'ENGLISH': 1}
tensor([0])
tensor([1])
tensor([0])
tensor([1])


In [27]:
loss_fun=nn.NLLLoss()
model=BOWclassifier(VOCAB_SIZE, NO_LABEL)
optimizer=optim.SGD(model.parameters(), lr=0.1)

for epoch in range(10):
  for x,y in data:
    bow_vec=autograd.Variable(make_bowvec(x,word_ix))
    label=autograd.Variable(make_target(y, label_ix))
    #bow_vec=make_bowvec(x,word_ix)
    #label=make_target(y,label_ix)
    y_hat=model(bow_vec)
    #print(label)
    loss=loss_fun(y_hat, label)
    #print(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  import sys


In [28]:
for x,y in test_data:
    bow_vec=autograd.Variable(make_bowvec(x,word_ix))
    p=model(bow_vec)
    print(p)

tensor([[-0.3284, -1.2731]], grad_fn=<LogSoftmaxBackward>)
tensor([[-1.2836, -0.3244]], grad_fn=<LogSoftmaxBackward>)


  import sys


In [29]:
print(next(model.parameters())[:,word_ix["creo"]])

tensor([ 0.1262, -0.1731], grad_fn=<SelectBackward>)


In [0]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [ ([test_sentence[i], test_sentence[i+1]], test_sentence[i+2]) for i in range(len(test_sentence) - 2) ]


In [0]:
vocab=set(test_sentence)
word_to_ix={word:i for i,word in enumerate(vocab)}

In [0]:
class NGramLanguageModeler(nn.Module):
  def __init__(self, vocab_size, n_embed, context_size):
    super(NGramLanguageModeler, self).__init__()
    self.embedding=nn.Embedding(vocab_size, n_embed)
    self.linear1=nn.Linear(context_size*n_embed, 128)
    self.linear2=nn.Linear(128,vocab_size)
    
  def forward(self, words):
    embed=self.embedding(words).view(1,-1)
    out=F.relu(self.linear1(embed))
    out=self.linear2(out)
    return F.log_softmax(out)

In [35]:
losses=[]
model=NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
loss_fun=nn.NLLLoss()
optimizer=optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
  epoch_loss=torch.Tensor([0])
  for context, target in trigrams:
    context_idxs=map(lambda w: word_to_ix[w], context)
    context_var=autograd.Variable(torch.LongTensor(list(context_idxs)))
    #print(y_hat.size(), autograd.Variable(torch.LongTensor([word_to_ix[target]])))
    y_hat=model(context_var)
    loss=loss_fun(y_hat, autograd.Variable(torch.LongTensor([word_to_ix[target]])))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    epoch_loss+=loss
  losses.append(epoch_loss)

  if sys.path[0] == '':


In [36]:
CONTEXT_SIZE = 2 # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process. Computational processes are abstract
beings that inhabit computers. As they evolve, processes manipulate other abstract
things called data. The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()
word_to_ix = { word: i for i, word in enumerate(set(raw_text)) }
data = []
for i in range(2, len(raw_text) - 2):
    context = [ raw_text[i-2], raw_text[i-1], raw_text[i+1], raw_text[i+2] ]
    target = raw_text[i]
    data.append( (context, target) )
print(data[:5])
vocab_size=len(word_to_ix)

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [0]:
class CBOW(nn.Module):
  def __init__(self, vocab_size, n_embed):
    super(CBOW, self).__init__()
    self.embedding=nn.Embedding(vocab_size, n_embed)
    self.linear1=nn.Linear(n_embed, 128)
    self.linear2=nn.Linear(128,vocab_size)
    
  def forward(self, input):
    embed=torch.sum(self.embedding(input), dim=0)
    out=F.relu(self.linear1(embed))
    out=self.linear2(out)
    return F.log_softmax(out)

In [38]:
def make_context_vector(context, word_to_ix):
    idxs = map(lambda w: word_to_ix[w], context)
    tensor = torch.LongTensor(list(idxs))
    return autograd.Variable(tensor)

make_context_vector(data[0][0], word_to_ix)

tensor([32,  8,  7, 24])

In [39]:
for p in model.parameters():
  print(p.size())

torch.Size([97, 10])
torch.Size([128, 20])
torch.Size([128])
torch.Size([97, 128])
torch.Size([97])


In [40]:
model=CBOW(vocab_size, 10)
loss_fun=nn.NLLLoss()
optimizer=optim.SGD(model.parameters(), lr=0.01)

for epoch in range(10):
  for context, target in data:
    con_vec=make_context_vector(context, word_to_ix)
    y_hat=model(con_vec).view(1,-1)
    #print(y_hat.size(), autograd.Variable(torch.LongTensor([word_to_ix[target]])))
    loss=loss_fun(y_hat, autograd.Variable(torch.LongTensor([word_to_ix[target]])))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  if sys.path[0] == '':


In [41]:
a=torch.ones(3,4)
em=nn.Embedding(3,4)
b=autograd.Variable(torch.LongTensor([1,2]))
print(b.size())
c=torch.sum(em(b), dim=0)
print(c)
for p in em.parameters():
  print(p.size())

torch.Size([2])
tensor([ 0.5851,  0.6259,  2.0680, -1.3082], grad_fn=<SumBackward2>)
torch.Size([3, 4])


In [0]:
lstm = nn.LSTM(3, 3) # Input dim is 3, output dim is 3
inputs = [ autograd.Variable(torch.randn((1,3))) for _ in range(5) ] # make a sequence of length 5

# initialize the hidden state.  
hidden = (autograd.Variable(torch.randn(1,1,3)), autograd.Variable(torch.randn((1,1,3))))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1,1,-1), hidden)

In [0]:
inputs = torch.cat(inputs).view(len(inputs), 1, -1) # Add the extra 2nd dimension
hidden = (autograd.Variable(torch.randn(1,1,3)), autograd.Variable(torch.randn((1,1,3)))) # clean out hidden state
out, hidden = lstm(inputs, hidden)

In [44]:
out[4]==hidden[0]

tensor([[[1, 1, 1]]], dtype=torch.uint8)

In [45]:
out.view(5,-1).size()

torch.Size([5, 3])

In [0]:
def make_sequence(seq, word_to_ix):
  idxs=map(lambda w: word_to_ix[w], seq)
  tensor=torch.LongTensor(list(idxs))
  return autograd.Variable(tensor)

In [0]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
ix_to_word={}
word_to_ix={}
for sent,_ in training_data:
  for word in sent:
    if word not in word_to_ix:
      word_to_ix[word]=len(word_to_ix)
      ix_to_word[word_to_ix[word]]=word

In [48]:
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
EMBEDDING_DIM = 6
HIDDEN_DIM = 5

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [0]:
class LSTMTagger(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
    super(LSTMTagger, self).__init__()
    
    self.hidden_dim = hidden_dim
    self.embedding=nn.Embedding(vocab_size, embedding_dim)
    self.lstm=nn.LSTM(embedding_dim, hidden_dim)
    self.hidden_2_tag=nn.Linear(hidden_dim, target_size)
    self.hidden=self.init_hidden()

  def init_hidden(self):
    return (autograd.Variable(torch.zeros(1,1,self.hidden_dim)), 
            autograd.Variable(torch.zeros(1,1,self.hidden_dim)))
  

    
  def forward(self, sentence):
    embed=self.embedding(sentence)
    lstm_out, self.hidden=self.lstm(embed.view(len(sentence), 1, -1), self.hidden)
    #print('lstm_out size ', lstm_out.size())
    tag_space=self.hidden_2_tag(lstm_out.view(len(sentence), -1))
    tag_score=F.log_softmax(tag_space)
    return tag_score
      

In [54]:
embed=nn.Embedding(len(word_to_ix), EMBEDDING_DIM)
sent=training_data[0][0]
sent=make_sequence(sent, word_to_ix)
out=embed(sent)
out.size()

torch.Size([5, 6])

In [0]:
model=LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [56]:
for epoch in range(300):
  for sent, target in training_data:
    
    seq=make_sequence(sent, word_to_ix)
    y=make_sequence(target, tag_to_ix)
    model.hidden=model.init_hidden()
    y_hat=model(seq)
    loss=loss_function(y_hat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()



In [57]:
inputs = make_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j for word i.
# The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print (tag_scores)

tensor([[-0.1030, -2.5107, -4.0929],
        [-4.4713, -0.0430, -3.4834],
        [-4.0574, -3.9167, -0.0379],
        [-0.0436, -3.5922, -4.1914],
        [-3.5066, -0.0385, -4.8591]], grad_fn=<LogSoftmaxBackward>)




In [58]:
list(training_data[0][0][0])

['T', 'h', 'e']

In [0]:
class CharLSTM(nn.Module):
  def __init__(self, char_embed_size, word_embed_size, vocab_size, hidden_dim_char, hidden_dim_word, target_size_word):
    super(CharLSTM, self).__init__()
  
    self.hidden_dim_char=hidden_dim_char
    self.hidden_dim_word=hidden_dim_word
  
    self.embed_char=nn.Embedding(26*2, char_embed_size)
    self.embed_word=nn.Embedding(vocab_size, word_embed_size)
    self.lstm_char=nn.LSTM(char_embed_size, hidden_dim_char)
    self.lstm_word=nn.LSTM(word_embed_size+hidden_dim_char, hidden_dim_word)
    self.hidden2tag=nn.Linear(hidden_dim_word, target_size_word)
    self.hidden_char=self.init_hidden_char()
    self.hidden_word=self.init_hidden_word()
  
  def init_hidden_char(self):
    return (autograd.Variable(torch.zeros(1,1,self.hidden_dim_char)),
            autograd.Variable(torch.zeros(1,1,self.hidden_dim_char)))
  
  def init_hidden_word(self):
    return (autograd.Variable(torch.zeros(1,1,self.hidden_dim_word)),
            autograd.Variable(torch.zeros(1,1,self.hidden_dim_word)))
  
  def forward(self, seq, sentenc):
#     char_list=list(map(lambda w: self.embed_char(make_char_seq(lambda q: split(q) for q in w, char_to_ix)), sentenc))
#     char_tensor=torch.Tensor()
#     for i in char_list:
#       char_tensor=torch.cat((char_tensor,i),0)
#     print(char_tensor.size())
    a=torch.Tensor()
    for val in seq:
      word=ix_to_word[val.item()]
      embed_char=self.embed_char(make_char_seq(word, char_to_ix))
      lstm_out_char, self.hidden_char=self.lstm_char(embed_char.view(len(word),1,-1), self.hidden_char)
      a=torch.cat((a,lstm_out_char[-1]))
      
    embed_word=self.embed_word(seq)
    embed=torch.cat((embed_word, autograd.Variable(a)), 1).view(len(embed_word), 1, -1)
    lstm_out, self.hidden_word=self.lstm_word(embed.view(len(seq), 1, -1), self.hidden_word)
    #print(lstm_out.size())
    tag_space=self.hidden2tag(lstm_out.view(len(seq), -1))
    tag_score=F.log_softmax(tag_space)
    return tag_score
  

In [0]:
char_to_ix={}
for i in range(26):
  char_to_ix[chr(65+i)]=i
  
for i in range(26):
  char_to_ix[chr(97+i)]=i+26

In [64]:
embed_char=nn.Embedding(len(char_to_ix), EMBEDDING_DIM)
l1=nn.Linear(EMBEDDING_DIM,3)
l1(embed_char(make_char_seq('dog', char_to_ix))).size()

torch.Size([3, 3])

In [0]:
def make_char_seq(word, char_to_ix):
  idxs=map(lambda w: char_to_ix[w], word)
  return autograd.Variable(torch.LongTensor(list(idxs)))

In [0]:
CHAR_REP_SIZE=3
model=CharLSTM(EMBEDDING_DIM, EMBEDDING_DIM, len(word_to_ix), CHAR_REP_SIZE, HIDDEN_DIM, len(tag_to_ix))
loss_fun=nn.NLLLoss()
optimizer=optim.SGD(model.parameters(), lr=0.1)

In [65]:
for epoch in range(1):
  for sent, target in training_data:
    seq=make_sequence(sent, word_to_ix)
    y=make_sequence(target, tag_to_ix)
    model.hidden_char=model.init_hidden_char()
    model.hidden_word=model.init_hidden_word()
    
    y_hat=model(seq, sent)
    loss=loss_fun(y_hat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()



In [67]:
inputs = make_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs, training_data[0][0])
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j for word i.
# The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print (tag_scores)

tensor([[-1.1920, -0.9908, -1.1236],
        [-1.2085, -0.9848, -1.1152],
        [-1.2323, -0.9698, -1.1111],
        [-1.1970, -0.9678, -1.1457],
        [-1.2016, -0.9416, -1.1736]], grad_fn=<LogSoftmaxBackward>)


