In [14]:
import torch
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

In [369]:
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [16]:
# Download NLTK model data (you need to do this once)
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/kenny/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_t

True

In [440]:
with open('data/reddit-comments-2015-08.csv', 'r', newline='', encoding='utf-8') as f:
    # Initalize a reader object
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the header row
    next(reader)  
    # Split full comments into sentences  - [nltk.sent_tokenize(x[0].lower()) for x in reader] - for the paragraph x[0] from the csv file, make it lowercase and tokenize all sentence
    # For all pararaphs in the csv file. * operator unpacks the list into individual sentences, and creates a single iterable
    # sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    sentences = itertools.chain(*[nltk.sent_tokenize(str(x).lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    # Replace all sentence x in sentences with the start token, sentence body, and text token"
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print (f"Parsed {len(sentences)} sentences.")

Parsed 68646 sentences.


In [441]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [461]:
# List of lists
print(tokenized_sentences[3])
len(tokenized_sentences)

['SENTENCE_START', 'my', 'question', 'is', ',', 'is', 'it', 'UNKNOWN_TOKEN', 'clear', 'that', 'UNKNOWN_TOKEN', 'has', 'the', 'highest', 'potential', 'for', 'points', '?', 'SENTENCE_END']


68646

In [443]:
b = {}

for i in range (len(tokenized_sentences)):
    for word1, word2, in zip(tokenized_sentences[i], tokenized_sentences[i][1:]):
        # Create a tuple
        bigram = (word1, word2)
        # Index into the dictionary, update it by one
        b[bigram] = b.get(bigram, 0) + 1


In [462]:
# Taking the first 20,000 entries as our vocabulary
sorted(b.items(), key = lambda kv: -kv[1])

[(('.', 'SENTENCE_END'), 47119),
 (('SENTENCE_START', '['), 15208),
 ((']', 'SENTENCE_END'), 15011),
 (('[', '``'), 9079),
 (("''", ']'), 6963),
 (('SENTENCE_START', 'i'), 6439),
 (("'", ']'), 5952),
 ((',', 'and'), 5703),
 ((',', 'but'), 4939),
 (('.', "''"), 4927),
 (('*', '*'), 4650),
 ((']', '('), 4323),
 (('of', 'the'), 4314),
 (('in', 'the'), 4082),
 (('?', 'SENTENCE_END'), 3839),
 (('.', "'"), 3392),
 (('it', "'s"), 3313),
 ((',', 'i'), 3169),
 (('if', 'you'), 2899),
 (('do', "n't"), 2748),
 (('&', 'gt'), 2573),
 (('gt', ';'), 2573),
 (('to', 'be'), 2485),
 (('http', ':'), 2354),
 (('i', "'m"), 2336),
 (('SENTENCE_START', 'it'), 2263),
 (('SENTENCE_START', 'the'), 2231),
 (('``', ']'), 2123),
 (('to', 'the'), 2106),
 (('(', 'http'), 2101),
 (('on', 'the'), 2059),
 (('``', 'i'), 1993),
 (('[', "'"), 1848),
 (('!', 'SENTENCE_END'), 1760),
 (('is', 'a'), 1692),
 (('https', ':'), 1629),
 (('SENTENCE_START', 'if'), 1627),
 ((',', 'the'), 1593),
 (('and', 'i'), 1564),
 (('you', 'can')

In [445]:
word_set = set(word for bigram, count in sorted(b.items(), key=lambda kv: -kv[1])[:20000] for word in bigram)

In [446]:
dim = len(word_set)
dim

3502

In [447]:
N = torch.zeros((dim, dim), dtype = torch.int32)

In [464]:
stoi.items()



In [465]:
stoi = {s:i for i,s in enumerate(word_set)} # string to index
itos = {i:s for s, i in stoi.items()}

In [466]:
for i, sentence in enumerate(tokenized_sentences):
    for j, word in enumerate(sentence):
        if word not in word_set:
            tokenized_sentences[i][j] = 'UNKNOWN_TOKEN'

In [467]:
print(tokenized_sentences[1:5])

[['SENTENCE_START', 'it', "'s", 'a', 'slight', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END'], ['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', ',', 'UNKNOWN_TOKEN', 'points', 'per', 'UNKNOWN_TOKEN', ',', '6', 'points', 'per', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', ',', 'and', 'some', 'UNKNOWN_TOKEN', 'for', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END'], ['SENTENCE_START', 'my', 'question', 'is', ',', 'is', 'it', 'UNKNOWN_TOKEN', 'clear', 'that', 'UNKNOWN_TOKEN', 'has', 'the', 'highest', 'potential', 'for', 'points', '?', 'SENTENCE_END'], ['SENTENCE_START', 'i', 'put', 'in', 'the', 'rules', 'at', 'a', 'UNKNOWN_TOKEN', 'site', 'and', 'noticed', 'that', 'top', 'UNKNOWN_TOKEN', 'had', 'UNKNOWN_TOKEN', 'points', 'more', 'than', 'the', 'top', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END']]


In [470]:
for sentence in tokenized_sentences:
    for word1, word2, in zip(sentence, sentence[1:]):
        if (word1 == 'UNKNOWN_TOKEN' or word2 == 'UNKNOWN_TOKEN'):
            continue
        ix1 = stoi[word1]
        ix2 = stoi[word2]
        N[ix1, ix2] += 1

In [471]:
# Model smoothing
P = (N+1).float()
P /= P.sum(1, keepdim = True)
print(P[0].sum())

N.shape

tensor(1.)


torch.Size([3502, 3502])

In [473]:

ix = torch.multinomial(p, num_samples = 1, replacement = True, generator = g).item()
itos[ix]

'mark'

In [474]:
stoi['SENTENCE_START']

1773

In [475]:
stoi['SENTENCE_END']

1375

In [477]:
ix = stoi['SENTENCE_START']

for i in range(5):
    out = []
    counter = 0
    while True and counter <= 20:
        # First, we sub in ix = 0, indicating that we start the word with a .
        # Repeating in a loop, create another probabilty distribution, with the previous ix as the first letter in the bigram
        
        p = P[ix] # We are indexing into a row here. We need the row to be normalized
        
        # Sample a letter. Print the letter. Update index to the new letter
        ix = torch.multinomial(p, num_samples = 1, replacement = True, generator = g).item()
        out.append(itos[ix])
        counter +=1
        if ix == stoi['SENTENCE_END']:
            break
    print(' '.join(out))
print(out)

it was accepted second 99 friends had not , creates 'yes bitch subject=error |\n establish nsfwfilter=off affected criminal build asshole posts
quality helicopter to= where 'we episode attitude leads voltage spectrum attempting guessing help hype hmm sucks interested reference isn\'t replaced suitable
believed vote test + uk generate \n\nthe quote has a fan scientific asphalt //www.fanfiction.net/\n to=tweetposter //www.mtgprice.com/search spirit i could trust total
illness 1st video possibly consent happen is n't leave t capitalism numbers 20http cut catfish faqs known ex episode confused admins
reach wan off-topic 50 lot of the back fleshed probably would still choose e.g gf themselves plenty waking method type ill
['reach', 'wan', 'off-topic', '50', 'lot', 'of', 'the', 'back', 'fleshed', 'probably', 'would', 'still', 'choose', 'e.g', 'gf', 'themselves', 'plenty', 'waking', 'method', 'type', 'ill']


In [486]:
log_likelihood = 0.0
n = 0
example_sentence = ['script', 'pain', 'dealt', 'hi', 'slavery']
for w in example_sentence:
    for word1, word2, in zip(example_sentence, example_sentence[1:]):
        if (word1 == 'UNKNOWN_TOKEN' or word2 == 'UNKNOWN_TOKEN'):
            continue
        ix1 = stoi[word1] 
        ix2 = stoi[word2]
        prob = P[ix1, ix2] # If probability is above 1/27%, this indicates that the model learned something
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        print(f"{word1}{word2}: prob: {prob:.4f} logprob: {logprob:.4f}")

# When all probabilities are 1, the log_likelihood will go to zero
# We want a loss function - minimize the loss mental model

print(f"{log_likelihood = }")

# The lower it gets, the better. The higher, the more error
nll = -log_likelihood
print(f"{nll=}")

print(f'Quality of the model: {nll / n}')

scriptpain: prob: 0.0003 logprob: -8.2145
paindealt: prob: 0.0003 logprob: -8.2895
dealthi: prob: 0.0003 logprob: -8.1826
hislavery: prob: 0.0003 logprob: -8.2177
scriptpain: prob: 0.0003 logprob: -8.2145
paindealt: prob: 0.0003 logprob: -8.2895
dealthi: prob: 0.0003 logprob: -8.1826
hislavery: prob: 0.0003 logprob: -8.2177
scriptpain: prob: 0.0003 logprob: -8.2145
paindealt: prob: 0.0003 logprob: -8.2895
dealthi: prob: 0.0003 logprob: -8.1826
hislavery: prob: 0.0003 logprob: -8.2177
scriptpain: prob: 0.0003 logprob: -8.2145
paindealt: prob: 0.0003 logprob: -8.2895
dealthi: prob: 0.0003 logprob: -8.1826
hislavery: prob: 0.0003 logprob: -8.2177
scriptpain: prob: 0.0003 logprob: -8.2145
paindealt: prob: 0.0003 logprob: -8.2895
dealthi: prob: 0.0003 logprob: -8.1826
hislavery: prob: 0.0003 logprob: -8.2177
log_likelihood = tensor(-164.5213)
nll=tensor(164.5213)
Quality of the model: 8.226067543029785


In [487]:
# Create the training set of all bigrams for the neural network
# Inputs and labels
xs, ys = [], []

for sentence in tokenized_sentences[:1]:
    for word1, word2, in zip(sentence, sentence[1:]):
        if (word1 == 'UNKNOWN_TOKEN' or word2 == 'UNKNOWN_TOKEN'):
            continue
        ix1 = stoi[word1] 
        ix2 = stoi[word2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [488]:
xs.shape

torch.Size([18])

In [489]:
ys.shape

torch.Size([18])

In [490]:
# One hot encoding
import torch.nn.functional as F
# Change dtype to float through typecasting
xenc = F.one_hot(xs, num_classes = dim).float()
xenc.shape

torch.Size([18, 3502])

In [491]:
xs, ys = [], []

for sentence in tokenized_sentences[:len(tokenized_sentences)//1000]:
    for word1, word2, in zip(sentence, sentence[1:]):
        if (word1 == 'UNKNOWN_TOKEN' or word2 == 'UNKNOWN_TOKEN'):
            continue
        ix1 = stoi[word1] 
        ix2 = stoi[word2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

num = xs.nelement()
print('number of examples:', num)

W = torch.randn((dim, dim), requires_grad = True)

number of examples: 1182


In [496]:
# All together, this is what our

for k in range (1000):
    # Forward pass
    xenc = F.one_hot(xs, num_classes = dim).float()
    logits = xenc @ W # log counts - logits is actually the proper row of W
    counts = logits.exp() #Equivalent to the N matrix
    probs = counts / counts.sum(1, keepdims = True)
    loss = -probs[torch.arange(num), ys].log().mean()  + 0.01 * (W**2).mean() # Tensor of our labels, as a loss function
    
    if (k % 100 == 0):
        # Our loss should be around the same
        print(loss.item())

    # Backward pass
    W.grad = None # More efficient - set to zero
    loss.backward()

    # Update parameters
    W.data+= -50 * W.grad
    break

5.129765510559082


In [498]:
for i in range(5):
    ix = 1710
    out = []
    counter = 0
    while True and counter <= 20:
        xenc = F.one_hot(torch.tensor([ix]), num_classes = dim).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(1, keepdims = True)
        ix = torch.multinomial(p, num_samples = 1, replacement = True, generator = g).item()
        out.append(itos[ix])
        counter +=1
        if ix == 1320:
            break
    print(' '.join(out))

since becomes imaginable ^darnet pointing design rain amendment sexual dlc gt ; still manage concept seller stopped powerful cream disappointed -\n\n\n
led links feeling nation screen sort=relevance refers fraction tomorrow referring \n\nhere money give share ndp earthfans internal tpp agreed //www.reddit.com/r/pricezombie/wiki/index trade
far host n can\'t h gives she dota to=/r/friendsafari walk patent ve took south etc.\n\n violence pounds say review intention sidewalk
favour could late key getting og consider dedicated university equivalent night eu doing am oh 'maybe track written members victims share
\n\nme gear owning 50 hanging calls files dig genius intended ( quickly \n\n\ni truth racist differences truck else\ human processor save
