In [5]:
# imports needed and set up logging
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [6]:


from google.colab import drive
drive.mount('/content/drive')




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:


input_file = "/content/drive/My Drive/Colab Notebooks/reviews_data.txt.gz"
with gzip.open (input_file, 'rb') as f:
        for i,line in enumerate (f):
            print(line)
            break



b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [8]:


def read_input(input_file):
    """This method reads the input file which is in gzip format"""
    
    logging.info("reading file {0}...this may take a while".format(input_file))
    
    with gzip.open (input_file, 'rb') as f:
        for i, line in enumerate (f): 

            if (i%10000==0):
                logging.info ("read {0} reviews".format (i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess (line)

# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list (read_input (input_file))
logging.info ("Done reading data file")



2020-07-16 08:53:11,983 : INFO : reading file /content/drive/My Drive/Colab Notebooks/reviews_data.txt.gz...this may take a while
2020-07-16 08:53:11,989 : INFO : read 0 reviews
2020-07-16 08:53:22,521 : INFO : read 10000 reviews
2020-07-16 08:53:24,369 : INFO : read 20000 reviews
2020-07-16 08:53:26,500 : INFO : read 30000 reviews
2020-07-16 08:53:28,510 : INFO : read 40000 reviews
2020-07-16 08:53:30,698 : INFO : read 50000 reviews
2020-07-16 08:53:32,828 : INFO : read 60000 reviews
2020-07-16 08:53:34,980 : INFO : read 70000 reviews
2020-07-16 08:53:36,601 : INFO : read 80000 reviews
2020-07-16 08:53:38,338 : INFO : read 90000 reviews
2020-07-16 08:53:40,014 : INFO : read 100000 reviews
2020-07-16 08:53:41,675 : INFO : read 110000 reviews
2020-07-16 08:53:43,326 : INFO : read 120000 reviews
2020-07-16 08:53:45,020 : INFO : read 130000 reviews
2020-07-16 08:53:46,861 : INFO : read 140000 reviews
2020-07-16 08:53:48,522 : INFO : read 150000 reviews
2020-07-16 08:53:50,810 : INFO : rea

In [9]:
print(documents[:10])


[['oct', 'nice', 'trendy', 'hotel', 'location', 'not', 'too', 'bad', 'stayed', 'in', 'this', 'hotel', 'for', 'one', 'night', 'as', 'this', 'is', 'fairly', 'new', 'place', 'some', 'of', 'the', 'taxi', 'drivers', 'did', 'not', 'know', 'where', 'it', 'was', 'and', 'or', 'did', 'not', 'want', 'to', 'drive', 'there', 'once', 'have', 'eventually', 'arrived', 'at', 'the', 'hotel', 'was', 'very', 'pleasantly', 'surprised', 'with', 'the', 'decor', 'of', 'the', 'lobby', 'ground', 'floor', 'area', 'it', 'was', 'very', 'stylish', 'and', 'modern', 'found', 'the', 'reception', 'staff', 'geeting', 'me', 'with', 'aloha', 'bit', 'out', 'of', 'place', 'but', 'guess', 'they', 'are', 'briefed', 'to', 'say', 'that', 'to', 'keep', 'up', 'the', 'coroporate', 'image', 'as', 'have', 'starwood', 'preferred', 'guest', 'member', 'was', 'given', 'small', 'gift', 'upon', 'check', 'in', 'it', 'was', 'only', 'couple', 'of', 'fridge', 'magnets', 'in', 'gift', 'box', 'but', 'nevertheless', 'nice', 'gesture', 'my', 'roo

In [10]:
import tensorflow as tf
import numpy as np

In [11]:
vocab_size = 10000
embedding_dim = 100
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(documents)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(documents)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)
print(padded[1])


[ 238   29  495    9   48   93  117   14    1   16    2  142 1437  159
    4 1087    2    9    5   21  310    3   51    2   13    5  882    3
   96  380  662   72   28  312   28  965  173   98    6 1087    2   39
    5   21    1    3  172   49    8    2  166    4  421   31  179  216
  482  210    4  624  273    2  108    4 2245 1087 2974    1  464   10
   78    2  327   56    5  778  275    4 1383  365   60    2 1696  586
   24 2262   16 3451   10 2251    3   24   35  719    8 1299  995  482
 4747 5122 4602  249  601  201    2  659    5   68   24  719    8  529
 3295    3   44  646  859   25    5  161]


In [12]:
temp = []
for s in padded:
  l=[]
  for i in s:
    l.append(i)
  temp.append(l)
print(temp[0])
context_words=[]
target_words =[]

for w in temp:
  for i in range(3,len(w)-3):
    target_words.append(w[i])
    temp2 = []
    for j in range(i-3,i+4):
      temp2.append(w[j])
    temp2.remove(w[i])
    context_words.append(temp2)
print(target_words[0])
print(context_words[0])

[203, 46, 1261, 9, 41, 23, 121, 225, 48, 6, 22, 9, 10, 44, 53, 28, 22, 11, 672, 112, 71, 79, 8, 2, 400, 1754, 80, 23, 283, 169, 12, 5, 3, 59, 80, 23, 160, 4, 1053, 25, 382, 30, 1408, 204, 14, 2, 9, 5, 21, 925, 592, 19, 2, 450, 8, 2, 170, 966, 90, 97, 12, 5, 21, 1313, 3, 310, 141, 2, 220, 39, 1, 78, 19, 1, 139, 49, 8, 71, 15, 827, 34, 27, 1, 4, 206, 20, 4, 567, 63, 2, 1, 4651, 28, 30, 2011, 1681, 563, 836, 5, 319, 68, 1738, 487, 100, 6, 12, 5, 58, 334, 8, 562, 1, 6, 1738, 1465, 15, 3299, 46, 4135, 31]
9
[203, 46, 1261, 41, 23, 121]


In [13]:
target_words = np.array(target_words)
context_words = np.array(context_words)
print(np.shape(context_words))
print(np.shape(target_words))

(29116056, 6)
(29116056,)


In [14]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(context_words[2]))

trendy hotel location too bad stayed


In [15]:


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=6),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(3, activation='relu'),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),optimizer='adam',metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 100)            1000000   
_________________________________________________________________
flatten (Flatten)            (None, 600)               0         
_________________________________________________________________
dense (Dense)                (None, 3)                 1803      
_________________________________________________________________
dense_1 (Dense)              (None, 10000)             40000     
Total params: 1,041,803
Trainable params: 1,041,803
Non-trainable params: 0
_________________________________________________________________


In [16]:
num_epochs = 5
model.fit(context_words, target_words,batch_size = 10000, epochs=num_epochs, validation_data=None)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7f9029de48>

In [18]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()



(10000, 100)


In [19]:


try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>