In [1]:
!pip install --upgrade  textblob gensim pytorch-nlp

Collecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0


In [2]:
import multiprocessing
import torch
import torch.nn as nn
import torch.optim as optim
import itertools
import sys
from textblob import TextBlob, Word
import numpy as np
import random
import os
import pandas as pd
import gensim
import warnings
import nltk

TRACE = False  # Setting to true is useful when debugging to know which device is being used
embedding_dim = 50
epochs=100
batch_size = 100
BATCH = True

def set_seeds_and_trace():
  os.environ['PYTHONHASHSEED'] = '0'
  np.random.seed(42)
  random.seed(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
set_seeds_and_trace()
warnings.filterwarnings('ignore')
nltk.download('punkt')
textblob_tokenizer = lambda x: TextBlob(x).words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget -O yelp.csv https://www.dropbox.com/s/xds4lua69b7okw8/yelp.csv?dl=0
fi

Writing get_data.sh


In [4]:
!bash get_data.sh

--2023-11-27 17:16:50--  https://www.dropbox.com/s/xds4lua69b7okw8/yelp.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/xds4lua69b7okw8/yelp.csv [following]
--2023-11-27 17:16:51--  https://www.dropbox.com/s/raw/xds4lua69b7okw8/yelp.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc6115ef95ba9873773a12939874.dl.dropboxusercontent.com/cd/0/inline/CIUtOK6uCYUaXV7QWzJPMq28_mX9CfngdnvDjxHYidJWFOvdO56rn7swzUnPjzSbBUJLP_FuOaIRoi60IsQhQSlQaCCDACW_nv5uVq6SwzX7mWIdrzfdRlA2Foto3lgShouC7RZAcY00CH4_Iy2o9UDt/file# [following]
--2023-11-27 17:16:51--  https://uc6115ef95ba9873773a12939874.dl.dropboxusercontent.com/cd/0/inline/CIUtOK6uCYUaXV7QWzJPMq28_mX9CfngdnvDjxHYidJWFOvdO56rn7swzUnPjzSbBUJLP_FuOaIRoi60IsQhQSlQaCCDACW_nv5uVq6Swz

In [5]:
path = './yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
X = yelp_best_worst.text
y = yelp_best_worst.stars.map({1:0, 5:1})

In [6]:
yelp

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,VY_tvNUCCXGXQeSvJl757Q,2012-07-28,Ubyfp2RSDYW0g7Mbr8N3iA,3,First visit...Had lunch here today - used my G...,review,_eqQoPtQ3e3UxLE4faT6ow,1,2,0
9996,EKzMHI1tip8rC1-ZAy64yg,2012-01-18,2XyIOQKbVFb6uXQdJ0RzlQ,4,Should be called house of deliciousness!\n\nI ...,review,ROru4uk5SaYc3rg8IU7SQw,0,0,0
9997,53YGfwmbW73JhFiemNeyzQ,2010-11-16,jyznYkIbpqVmlsZxSDSypA,4,I recently visited Olive and Ivy for business ...,review,gGbN1aKQHMgfQZkqlsuwzg,0,0,0
9998,9SKdOoDHcFoxK5ZtsgHJoA,2012-12-02,5UKq9WQE1qQbJ0DJbc-B6Q,2,My nephew just moved to Scottsdale recently so...,review,0lyVoNazXa20WzUyZPLaQQ,0,0,0


In [7]:
TextBlob(X.values[0]).correct()

TextBlob("By wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our witness was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

To yourself a favor and get their Bloody Mary.  It was phenomena and simply the best I've ever had.  I'm pretty sure they only use ingredient from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white ruffle scrambled eggs vegetable skilled and it was taste and delicious.  It came with 2 pieces of their grizzled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I've ever had.

Anyway, I can't wait to go back!")

In [8]:
# Create corpus of sentences such that the sentence has more than 3 words
corpus = [line for line in X.values if len(TextBlob(line).words)> 3]

In [9]:
corpus[:2]

['My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!',
 'I have no idea why some people give bad reviews about this place. It goes to show you, you can

At this point we have a list (any iterable will do) of queries that are longer than 3 words. This is normal to filter random queries. Now we must use the `LabelEncoder` object to `fit` on the corpus, in order to convert each wor to an ID, and later convert such corpus of list of words into their identifiers.


In [10]:


# tokenizer = Tokenizer()
# # Use the fit_on_texts method to fit the tokenizer
# tokenizer.fit_on_texts(corpus) # Fill

print(f'Before the tokenizer: {corpus[:1]}')

#Now use the same "trained" tokenizer to convert the corpus from words to IDs with the texts_to_sequences method
tokenized_corpus = [ids_from_words.batch_encode(sentence.split()) for sentence in corpus]

print(f'After the tokenizer: {tokenized_corpus[:1]}')

Before the tokenizer: ['My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!']
After the tokenizer: [tensor([  1,   2,   3,   4,   5,   6,   7,   8,   9,

In [11]:
vocab_size = LabelEncoder.vocab_size


In [12]:
print(f'First 5 corpus items are {tokenized_corpus[:5]}')
print(f'Length of corpus is {len(tokenized_corpus)}')



First 5 corpus items are [tensor([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  13,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  13,  30,  11,  31,  32,  33,  34,   6,  35,  36,  37,  38,
         39,  40,  41,  35,  42,  43,  44,  45,  34,  46,  35,  47,  48,  49,
          5,  35,  50,  51,  52,  53,  54,  11,  49,  23,  55,  56,  39,  13,
         57,  11,  58,  35,  59,  60,  61,  62,  63,  45,  64,  65,  66,  67,
         68,  69,  23,  70,  11,  71,  72,  73,  74,  48,  75,  76,  39,  13,
         77,  78,  79,   6,  35,  80,  81,  82,  83,  84,  35,  85,  86,  87,
         88,  89,  90,  11,  12,  13,  91,  11,  92,  39,  93,  94,  95,  96,
         97,  23,  98,  99,  94,  13, 100,  11,  12, 101,  19,  35, 102, 103,
         39,  13,  35,  59, 104,  60,  61,  62, 105,  83, 106, 107, 108, 109,
        110]), tensor([ 83, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,  39,
        123, 108, 124, 

In [13]:
type(tokenized_corpus)

list

In [14]:
def ids_from_text(text):
  return ids_from_words.batch_encode(text)

def text_from_ids(ids):
  return ids_from_words.batch_decode(ids)

In [15]:
def pad_sequence_of_tokens(x, maxlen, unk_token='UNK'):
  if len(x)<maxlen:
    x.extend([unk_token]*(maxlen-len(x)))
  return x

In [16]:
# This is the algorithmic part of batching the dataset and yielding the window of words and expected middle word for each bacth as a generator.
def create_context_target_pairs(texts, context_size):
    data = []
    for text in texts:
        tokens = text.split()
        for i, word in enumerate(tokens):
            start = max(0, i - context_size)
            end = min(len(tokens), i + context_size + 1)
            context = pad_sequence_of_tokens([tokens[j] for j in range(start, end) if j != i], maxlen=4)
            target = ids_from_words.token_to_index[word]
            context_indices = [ids_from_words.token_to_index[w] for w in context]
            context_indices.append(target)
            data.append(torch.Tensor(context_indices))
    return data

Notice now in a sample how we construct X and y to predict words

In [17]:
data = create_context_target_pairs(corpus[:500], 2)

In [18]:
data = torch.stack(data)

In [19]:
X = data[:, :4].to(torch.long)
y = data[:, 4].to(torch.long)

In [20]:
X

tensor([[    2,     3,     0,     0],
        [    1,     3,     4,     0],
        [    1,     2,     4,     5],
        ...,
        [   11,    35, 11658,   138],
        [   35,    32,   138,     0],
        [   32, 11658,     0,     0]])

In [21]:
print(X.shape)
print(y.shape)


torch.Size([61653, 4])
torch.Size([61653])


Now comes the core part, defining the model. Keras provides a convenient Sequential model class to just `add` layers of any type and they will just work. Let's add an `Embedding` layer (that will map the word ids into a vector of size 100), a `Lambda` to average the words out in a sentence, and a `Dense layer` to select the best word on the other end. This is classic CBOW.


In [22]:

class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        print((vocab_size, embedding_dim))
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Linear layer to act as the hidden layer
        self.linear1 = nn.Linear(embedding_dim, 128)
        # Linear layer to predict the center word
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        embeds = torch.mean(embeds, dim=1)
        out = torch.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = torch.log_softmax(out, dim=1)
        return log_probs

In [23]:
def train_cbow(X, y, model, loss_function, optimizer, epochs):
    for epoch in range(epochs):
        total_loss = 0

        # Step 1. Recall that torch *accumulates* gradients. Before passing in a new instance,
        # you need to zero out the gradients from the old instance
        optimizer.zero_grad()

        # Step 2. Run the forward pass, getting log probabilities over next words
        log_probs = model(X)

        # Step 3. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, y)

        # Step 4. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Print progress
        if (epoch + 1) % 10 == 0:
            print('Epoch: {}, Loss: {:.4f}'.format(epoch + 1, total_loss))
    return model

In [32]:
context_size=2
embedding_dim=50
vocab_size = len(ids_from_words.vocab)
model = CBOW(vocab_size, embedding_dim, context_size * 2)
model.to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
X = X.to(device)
y = y.to(device)

(45224, 50)


In [36]:
trained_model = train_cbow(X, y, model, loss_function, optimizer, epochs=100)

Epoch: 10, Loss: 10.7353
Epoch: 20, Loss: 10.7351
Epoch: 30, Loss: 10.7349
Epoch: 40, Loss: 10.7347
Epoch: 50, Loss: 10.7344
Epoch: 60, Loss: 10.7342
Epoch: 70, Loss: 10.7340
Epoch: 80, Loss: 10.7338
Epoch: 90, Loss: 10.7336
Epoch: 100, Loss: 10.7334


In [37]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

embeddings = trained_model.embeddings.weight.data.cpu().numpy()

# Now, we need to save these embeddings in a format that gensim can understand
# For that, we will use the KeyedVectors instance in gensim

# Instantiate the KeyedVectors with the correct size
kv = KeyedVectors(vector_size=embeddings.shape[1])






In [38]:
# Add the vectors and their corresponding words to the KeyedVectors instance
kv.add_vectors(ids_from_words.index_to_token, embeddings)

In [39]:
kv.most_similar(positive=['gasoline'])

[('dated', 0.5481334924697876),
 ('emerald', 0.5394065976142883),
 ('Betitos', 0.5250834822654724),
 ('wicked', 0.5036346316337585),
 ('agreed', 0.4864133894443512),
 ('else."', 0.4845808446407318),
 ('Chutney', 0.4819386899471283),
 ('side!', 0.47849413752555847),
 ('it),', 0.4752046763896942),
 ('Yard', 0.47150516510009766)]

In [40]:
kv.most_similar(negative=['apple'])

[('(though', 0.538329005241394),
 ('install!!', 0.5372428894042969),
 ('coolness', 0.5159246325492859),
 ('flavors!', 0.5054166913032532),
 ('FORTY-FIVE', 0.505184531211853),
 ('quiche,', 0.5023967623710632),
 ("Fry's", 0.4885081946849823),
 ('rotate', 0.4879502058029175),
 ('fresh!!', 0.48583441972732544),
 ('sinful', 0.4754049479961395)]