In [2]:
import torch
import torch.nn as nn
import torchvision
import torch.optim as optim
import torchvision.transforms as transforms
import torch.nn.functional as F
# import matplotlib.pyplot as plt

In [1]:
class MyParameters(object):

  mode = 2
  time_name = 'example'
  
  if mode == 2:
    dim = 200
    threshold = 3
    knn = -1
    regularizer = 0
    keep_prob = 0.50
    init_scale = 0.05
    max_epochs = 20
    alpha_decay = 0.80
    alpha_start = 6
    interval = 100000
    bCorpus = 2             # 1 = PTB, 2 = Wiki2, 3 = Wiki103
    num_layers = 2
    max_tokens = 500000000
    batch_size = 20
    num_steps = 30
    max_grad = 2.0       

  alpha_initial = 1.0
  alpha_mode = 0
  alpha_min = 0.0
  bValid = True;
  bTest = True;
  save_net = True
  recall_net = False
  bWET = True
  bBias = True
  bSaveEmbed = False
  
  vocab_size = 0
  epoch = 0
  gpu_mem = 0.90  

  # this is our own values we initalized. anything above is not ours 
  input_size = 500
  hidden_size = 300
  num_classes = 1
  num_epochs = 20
  batch_size = 20
  learning_rate = 0.001

In [3]:
def ReadCorpus(file_name,words,vocab,params,src):
    if src == 0:
        temp = dict()
        last = 0
        total_tokens = 0
        with open(file_name,"r") as f:
            for line in f:
#                line = line.replace(" "+chr(8211)+" "," - ")
                tokens = line.replace("\n", " </s> ").split()
                total_tokens = total_tokens + len(tokens)

                if (total_tokens - last) > 10000000:
                    print(total_tokens)
                    last = total_tokens

                for t in tokens:
                    if t == '"':
                        t = '<quote>'
                    try:
                        elem = temp[t]
                    except:
                        elem = [0,0]
                    elem[1] = elem[1] + 1
                    temp[t] = elem
                    
        wNextID = 0
        words = dict()
        words['<unk>'] = [wNextID,0]
        wNextID = wNextID + 1
                
        for t in temp:
            elem = temp[t]
            if elem[1] >= params.threshold:
                words[t] = [wNextID,elem[1]]
                wNextID = wNextID + 1
                
        vocab = list()
        vocab.append(' ')
        for w in words:
            vocab.append(' ')
        for w in words:
            elem = words[w]
            vocab[elem[0]] = w

    corpus = list()
    garbage = dict()
        
    last = 0
    total_tokens = 0
    with open(file_name,"r") as f:
        for line in f:
#            line = line.replace(" "+chr(8211)+" "," - ")
            tokens = line.replace("\n", " </s> ").split()
            total_tokens = total_tokens + len(tokens)

            if (total_tokens - last) > 10000000:
                print(total_tokens)
                last = total_tokens

            for t in tokens:
                if t == '"':
                    t = '<quote>'
                try:
                    elem = words[t]
                except:
                    try:
                        g = garbage[t]
                    except:
                        g = 0
                    g = g + 1
                    garbage[t] = g
                    elem = words['<unk>']
#                elem[1] = elem[1] + 1
#                words[t] = elem
                corpus.append(elem[0])
       
    return corpus, words, vocab, garbage

In [4]:
class NGramLanguageModeler(nn.Module):
  def __init__(self, vocab_size, embedding_dim, context_size):
    super(NGramLanguageModeler, self).__init__()
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.linear1 = nn.Linear(context_size * embedding_dim, 128)
    self.linear2 = nn.Linear(128, vocab_size)

  def forward(self, inputs):
    embeds = self.embeddings(inputs).view((1, -1))
    out = F.relu(self.linear1(embeds))
    out = self.linear2(out)
    log_probs = F.log_softmax(out, dim=1)
    return log_probs

In [6]:
params = MyParameters()  
train,words,vocab,train_g=ReadCorpus("wiki.train.txt",None,None,params,0)
valid,words,vocab,valid_g=ReadCorpus("wiki.valid.txt",words,vocab,params,1)
test,words,vocab,test_g=ReadCorpus("wiki.test.txt",words,vocab,params,2) 
params.vocab_size = len(vocab)    

In [7]:
EMBEDDING_DIM = 100
CONTEXT_SIZE = 5

losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(params.vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=params.learning_rate)

In [8]:
train_ngrams = [
    (
        [train[i - j - 1] for j in range(CONTEXT_SIZE)],
        train[i]
    )
    for i in range(CONTEXT_SIZE, len(train))
]

In [None]:
print(train_ngrams[:10])
print(train[:10])
print(vocab[:20])

In [23]:
with open('wiki.train.txt', 'r') as f:
  train1 = f.read().split()
  vocab1 = list(set(train1))

In [24]:
print(train1[:10])
print(vocab1[:10])

['=', 'Valkyria', 'Chronicles', 'III', '=', 'Senjō', 'no', 'Valkyria', '3', ':']
['Buddhism', 'elastic', 'willows', 'prevents', 'deuterium', 'Seine', 'Sections', 'Roger', 'reset', 'items']


In [14]:
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [16]:
print(word_to_ix)



In [None]:
for epoch in range(10):
  total_loss = 0
  for context, target in train_ngrams:

    # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
    # into integer indices and wrap them in tensors)
    context_idxs = torch.tensor([word_to_ix[vocab[w-1]] for w in context], dtype=torch.long)

    # Step 2. Recall that torch *accumulates* gradients. Before passing in a
    # new instance, you need to zero out the gradients from the old
    # instance
    model.zero_grad()

    # Step 3. Run the forward pass, getting log probabilities over next
    # words
    log_probs = model(context_idxs)

    # Step 4. Compute your loss function. (Again, Torch wants the target
    # word wrapped in a tensor)
    loss = loss_function(log_probs, torch.tensor([word_to_ix[vocab[target-1]]], dtype=torch.long))

    # Step 5. Do the backward pass and update the gradient
    loss.backward()
    optimizer.step()

    # Get the Python number from a 1-element Tensor by calling tensor.item()
    total_loss += loss.item()
  losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
# print(model.embeddings.weight[word_to_ix["beauty"]])