In [None]:
import sys  
!{sys.executable} -m pip install contractions

In [12]:
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
from nltk import word_tokenize
nltk.download('punkt')
import re
import contractions
from collections import Counter
from torch.utils.data import Dataset, DataLoader

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
torch.manual_seed(1)

<torch._C.Generator at 0x7f33b0242750>

# Load and Prepare Data

In [8]:
# Import txt file

file = open('/content/raw_gatsby.txt', 'r')
raw_gatsby = file.read()
file.close()
print(len(raw_gatsby))

268151


In [9]:
# Cleaning Data

# Replace quotation marks, commas, colons, and semicolons with blank space; line breaks, double spaces, and hyphens with a space
clean_gatsby_start = raw_gatsby.replace('“', '').replace('”', '').replace(';', '').replace(':', '').replace('\n', ' ').replace(',', ''). replace('  ', ' '). replace('-', ' ').replace('—', ' ')
#clean_gatsby_mid = clean_gatsby_start.replace('.', '')

# Remove contractions and return whole words
clean_gatsby = contractions.fix(clean_gatsby_start)

# Remove leftover punctuaction 
clean_gatsby = clean_gatsby.replace('’', '')



print(len(clean_gatsby))
print(clean_gatsby[:1000]) # Indices are characters

261535
In my younger and more vulnerable years my father gave me some advice that I have been turning over in my mind ever since. Whenever you feel like criticizing anyone he told me just remember that all the people in this world have not had the advantages that you have had. He did not say any more but we have always been unusually communicative in a reserved way and I understood that he meant a great deal more than that. In consequence I am inclined to reserve all judgements a habit that has opened up many curious natures to me and also made me the victim of not a few veteran bores. The abnormal mind is quick to detect and attach itself to this quality when it appears in a normal person and so it came about that in college I was unjustly accused of being a politician because I was privy to the secret griefs of wild unknown men. Most of the confidences were unsought frequently I have feigned sleep preoccupation or a hostile levity when I realized by some unmistakable sign that an int

In [10]:
# Remove Puncuation

g_no_punc = clean_gatsby.translate(str.maketrans('', '', string.punctuation))
print(len(g_no_punc))
print(g_no_punc[:1000])

257952
In my younger and more vulnerable years my father gave me some advice that I have been turning over in my mind ever since Whenever you feel like criticizing anyone he told me just remember that all the people in this world have not had the advantages that you have had He did not say any more but we have always been unusually communicative in a reserved way and I understood that he meant a great deal more than that In consequence I am inclined to reserve all judgements a habit that has opened up many curious natures to me and also made me the victim of not a few veteran bores The abnormal mind is quick to detect and attach itself to this quality when it appears in a normal person and so it came about that in college I was unjustly accused of being a politician because I was privy to the secret griefs of wild unknown men Most of the confidences were unsought frequently I have feigned sleep preoccupation or a hostile levity when I realized by some unmistakable sign that an intimate

In [19]:
no_punc_token = word_tokenize(g_no_punc)

vocab = set(no_punc_token)
print(len(vocab))

6251


# Model

In [14]:
# Set Hyperparameters

CONTEXT_SIZE = 2
embed_dims = 10
vocab_size = len(vocab)
BATCH_SIZE = 128


# Create a word to index dict

word_to_ix = {word: i for i, word in enumerate(vocab)}
ngrams = [
    (
        [no_punc_token[i - j - 1] for j in range(CONTEXT_SIZE)],
        no_punc_token[i]
    )
    for i in range(CONTEXT_SIZE, len(no_punc_token))
]


# Create a list of format ([c1, c2], t) for DataLoader

word_idx_list = []
for i, (c, t) in enumerate(ngrams):
    context = [word_to_ix[w] for w in c]
    target = word_to_ix[t]
    word_idx_list.insert(i, (context, target))

    
print(len(ngrams))
print(ngrams[0])
print(word_idx_list[0:3])

49748
(['my', 'In'], 'younger')
[([1589, 1598], 5456), ([5456, 1589], 6022), ([6022, 5456], 4938)]


In [15]:
# Custom Dataset and DataLoader

class GatsDataset(Dataset):
    
    def __init__(self, data):
        self.data = data
        print('Data: ', self.data[0])
        self.len = len(data)
        
    def __getitem__(self, index):
        return self.data[index]
    
    def __len__(self):
        return self.len
    
    
# Custom Collate Function to correctly shape data with batch size

def custom_collate(data):
    context = [idx[0] for idx in data]
    target = [idx[1] for idx in data]
    return torch.tensor(context), torch.tensor(target)
    


In [16]:
# Call Dataset and DataLoader

dataset = GatsDataset(word_idx_list)
train_loader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = False, collate_fn = custom_collate, drop_last = True)


for data in train_loader:
    print(data)
    break


Data:  ([1589, 1598], 5456)
(tensor([[1589, 1598],
        [5456, 1589],
        [6022, 5456],
        [4938, 6022],
        [1792, 4938],
        [5038, 1792],
        [1589, 5038],
        [4819, 1589],
        [4719, 4819],
        [1077, 4719],
        [ 302, 1077],
        [1183,  302],
        [5274, 1183],
        [5751, 5274],
        [4097, 5751],
        [3863, 4097],
        [1103, 3863],
        [ 850, 1103],
        [ 265,  850],
        [1589,  265],
        [1347, 1589],
        [4959, 1347],
        [ 907, 4959],
        [5780,  907],
        [5168, 5780],
        [ 256, 5168],
        [ 542,  256],
        [2769,  542],
        [1942, 2769],
        [5298, 1942],
        [3546, 5298],
        [1077, 3546],
        [5956, 1077],
        [5391, 5956],
        [5274, 5391],
        [2135, 5274],
        [5431, 2135],
        [ 519, 5431],
        [ 265,  519],
        [   3,  265],
        [1114,    3],
        [4097, 1114],
        [4310, 4097],
        [3428, 4310],
   

In [18]:
%%time

# Class Creation

class word2vec(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size, batch_size):
        super(word2vec, self).__init__()
        self.batch_size = batch_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        
        
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(self.batch_size, -1)
        out_1 = F.relu(self.linear1(embeds))
        out_2 = self.linear2(out_1)
        out_f = F.log_softmax(out_2, dim = 1)
        return out_f
    
    def f_predict(self, inputs):
        embeds = self.embeddings(inputs).view(1, -1)
        out_1 = F.relu(self.linear1(embeds))
        out_2 = self.linear2(out_1)
        out_f = F.log_softmax(out_2, dim = 1)
        return out_f
        
    def predict(self, contexts):
        inputs = torch.tensor([word_to_ix[w] for w in contexts], dtype = torch.long)
        y_pred = self.f_predict(inputs)
        idx = torch.argmax(y_pred)
        target_pred = [k for k, v in word_to_ix.items() if v == idx]
        return target_pred                    
    
    

# Call word2vec with hyperparameters, declare loss function and optimizer with learning rate

losses = []
loss_function = nn.NLLLoss()
gats_model = word2vec(vocab_size, embed_dims, CONTEXT_SIZE, BATCH_SIZE)
optimizer = optim.SGD(gats_model.parameters(), lr = 0.1)

CPU times: user 11.5 ms, sys: 1.82 ms, total: 13.4 ms
Wall time: 16.8 ms


In [None]:
%%time

# First Method - For Loop time trial
# Params: context_size = 2, lr = 0.001, NLLLoss function, log_softmax
# Epochs: 5 - 9min 24s
# End Error: 322202.3 (-15.14%)

for epoch in range(5):
    total_loss = 0.0
    
    for context, target in ngrams:
        context_ids = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        gats_model.zero_grad() # PyTorch accumulates gradients so has to be zeroed from previous loop        
        log_probs = gats_model(context_ids)
        target = torch.tensor([word_to_ix[target]], dtype=torch.long)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
    print(total_loss)

In [None]:
%%time

# Second Method - Dataloader Batches time trial
# Params: context_size = 2, lr = 0.001, NLLLoss function, log_softmax, 5 epochs
# Batch: 128 - 38.4s, 64 - 56.8s, 32 - 1min 14s
# End Error: 128 - 3607.4 (-1.78%), 64 - 7113.2 (-3.77%), 32 - 13480.6 (-7.15%)

for epoch in range(400):
    total_loss = 0.0
    for context, target in train_loader:
        gats_model.zero_grad()
        log_probs = gats_model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    if epoch % 50 == 0:
        print(total_loss)


2953.564935684204
1750.248473405838
1434.6276547908783
1292.9717783927917
1208.4133656024933
1148.7557430267334
1102.2296936511993
1063.4776307344437
CPU times: user 2h 36min 31s, sys: 2h 4min 57s, total: 4h 41min 28s
Wall time: 48min 23s


In [None]:
# Predict next word given 2 context words

gats_model.predict(['Gatsbys', 'house'])

['largest']