In [2]:
# imports needed and set up logging
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
data_file="reviews_data.txt.gz"

with gzip.open ('reviews_data.txt.gz', 'rb') as f:
    for i,line in enumerate (f):
        print(line)
        break

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [4]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""
    
    logging.info("reading file {0}...this may take a while".format(input_file))
    
    with gzip.open (input_file, 'rb') as f:
        for i, line in enumerate (f): 

            if (i%10000==0):
                logging.info ("read {0} reviews".format (i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess (line)

# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list (read_input (data_file))
logging.info ("Done reading data file")

2021-03-22 17:46:37,937 : INFO : reading file reviews_data.txt.gz...this may take a while
2021-03-22 17:46:37,938 : INFO : read 0 reviews
2021-03-22 17:46:39,179 : INFO : read 10000 reviews
2021-03-22 17:46:40,427 : INFO : read 20000 reviews
2021-03-22 17:46:41,865 : INFO : read 30000 reviews
2021-03-22 17:46:43,208 : INFO : read 40000 reviews
2021-03-22 17:46:44,698 : INFO : read 50000 reviews
2021-03-22 17:46:46,123 : INFO : read 60000 reviews
2021-03-22 17:46:47,342 : INFO : read 70000 reviews
2021-03-22 17:46:48,444 : INFO : read 80000 reviews
2021-03-22 17:46:49,786 : INFO : read 90000 reviews
2021-03-22 17:46:50,915 : INFO : read 100000 reviews
2021-03-22 17:46:52,045 : INFO : read 110000 reviews
2021-03-22 17:46:53,173 : INFO : read 120000 reviews
2021-03-22 17:46:54,327 : INFO : read 130000 reviews
2021-03-22 17:46:55,558 : INFO : read 140000 reviews
2021-03-22 17:46:56,694 : INFO : read 150000 reviews
2021-03-22 17:46:57,871 : INFO : read 160000 reviews
2021-03-22 17:46:59,013

In [6]:
model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2021-03-22 17:51:41,007 : INFO : collecting all words and their counts
2021-03-22 17:51:41,008 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-22 17:51:41,158 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2021-03-22 17:51:41,306 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2021-03-22 17:51:41,483 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2021-03-22 17:51:41,646 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2021-03-22 17:51:41,825 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2021-03-22 17:51:41,999 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2021-03-22 17:51:42,146 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83194 word types
2021-03-22 17:51:42,281 : INFO : PROG

(303493915, 415193550)

In [9]:
model.wv.similarity(w1="app",w2="apps")

0.142167