In [1]:
import numpy as np

In [2]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
#nltk.download('all')

# Reading Data

In [3]:
train = pd.read_csv('./labeledTrainData.tsv/labeledTrainData.tsv', delimiter='\t', header=0, quoting=3)

In [4]:
test = pd.read_csv('./testData.tsv/testData.tsv', delimiter='\t', header=0, quoting=3)

In [5]:
unlabeled_train = pd.read_csv('./unlabeledTrainData.tsv/unlabeledTrainData.tsv', delimiter='\t', header=0, quoting=3)

# Reviewing Data

In [6]:
print(train['review'].size, test['review'].size, unlabeled_train['review'].size)

25000 25000 50000


In [7]:
from nltk.corpus import stopwords

In [8]:
def review_to_wordlist( review, remove_stopwords = False):
    
    #removing html
    review_text = BeautifulSoup(review).get_text()
    
    #lowering words
    words = review_text.lower().split()
    
    #removing stopwords ( False by default )
    if remove_stopwords:
        
        stop = set(stopwords.words('english'))
        words = [w for w in words if w not in stop]
    
    return words

In [9]:
import nltk.data
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [10]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [14]:
def review_to_sentences( review, tokenizer, remove_stopwords = False):
    
    # Splitting paragraphs into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    
    sentences = []
    
    #Filtering sentences
    for raw_sentence in raw_sentences:
        
        if len(raw_sentence) > 0:
            
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
        
    return sentences

In [15]:
sentences = []  # Initialize an empty list of sentences

print ("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print ("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)
    
#might get warning due to urls

Parsing sentences from training set




Parsing sentences from unlabeled set




In [16]:
print(len(sentences))

795538


# Training Model

In [20]:
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters                     
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, min_count = min_word_count, window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

2021-10-05 12:50:30,617 : INFO : collecting all words and their counts
2021-10-05 12:50:30,617 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-10-05 12:50:30,659 : INFO : PROGRESS: at sentence #10000, processed 219242 words, keeping 32665 word types
2021-10-05 12:50:30,716 : INFO : PROGRESS: at sentence #20000, processed 438623 words, keeping 51663 word types
2021-10-05 12:50:30,765 : INFO : PROGRESS: at sentence #30000, processed 651476 words, keeping 66881 word types
2021-10-05 12:50:30,813 : INFO : PROGRESS: at sentence #40000, processed 871114 words, keeping 80990 word types


Training model...


2021-10-05 12:50:30,863 : INFO : PROGRESS: at sentence #50000, processed 1083691 words, keeping 93535 word types
2021-10-05 12:50:30,924 : INFO : PROGRESS: at sentence #60000, processed 1298869 words, keeping 104807 word types
2021-10-05 12:50:30,980 : INFO : PROGRESS: at sentence #70000, processed 1515513 words, keeping 115640 word types
2021-10-05 12:50:31,034 : INFO : PROGRESS: at sentence #80000, processed 1728384 words, keeping 125785 word types
2021-10-05 12:50:31,078 : INFO : PROGRESS: at sentence #90000, processed 1945448 words, keeping 136196 word types
2021-10-05 12:50:31,135 : INFO : PROGRESS: at sentence #100000, processed 2160633 words, keeping 145760 word types
2021-10-05 12:50:31,182 : INFO : PROGRESS: at sentence #110000, processed 2373736 words, keeping 154951 word types
2021-10-05 12:50:31,235 : INFO : PROGRESS: at sentence #120000, processed 2589502 words, keeping 164045 word types
2021-10-05 12:50:31,295 : INFO : PROGRESS: at sentence #130000, processed 2808001 word

2021-10-05 12:50:35,889 : INFO : PROGRESS: at sentence #770000, processed 16701810 words, keeping 568626 word types
2021-10-05 12:50:35,962 : INFO : PROGRESS: at sentence #780000, processed 16924889 words, keeping 573568 word types
2021-10-05 12:50:36,027 : INFO : PROGRESS: at sentence #790000, processed 17144985 words, keeping 578535 word types
2021-10-05 12:50:36,064 : INFO : collected 581308 word types from a corpus of 17264346 raw words and 795538 sentences
2021-10-05 12:50:36,065 : INFO : Creating a fresh vocabulary
2021-10-05 12:50:36,355 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 20587 unique words (3.5414960743702135%% of original 581308, drops 560721)', 'datetime': '2021-10-05T12:50:36.355187', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
2021-10-05 12:50:36,355 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 

2021-10-05 12:51:11,040 : INFO : EPOCH 5 - PROGRESS: at 55.00% examples, 1599526 words/s, in_qsize 7, out_qsize 0
2021-10-05 12:51:12,043 : INFO : EPOCH 5 - PROGRESS: at 68.64% examples, 1599519 words/s, in_qsize 7, out_qsize 0
2021-10-05 12:51:13,045 : INFO : EPOCH 5 - PROGRESS: at 82.26% examples, 1598391 words/s, in_qsize 8, out_qsize 0
2021-10-05 12:51:14,056 : INFO : EPOCH 5 - PROGRESS: at 95.81% examples, 1595544 words/s, in_qsize 8, out_qsize 0
2021-10-05 12:51:14,347 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-10-05 12:51:14,362 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-10-05 12:51:14,366 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-10-05 12:51:14,369 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-10-05 12:51:14,369 : INFO : EPOCH - 5 : training on 17264346 raw words (11694439 effective words) took 7.3s, 1595809 effective words/s
2021-10-05 12:51:14,370 : INFO : Word2Ve

# Testing

In [22]:
model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [23]:
model.wv.most_similar("man")

[('man,', 0.8674203157424927),
 ('man.', 0.7748789191246033),
 ('woman', 0.7342656850814819),
 ('lad', 0.7275235056877136),
 ('soldier', 0.7173106670379639),
 ('boy', 0.7072140574455261),
 ('doctor', 0.6941220760345459),
 ('guy', 0.6815625429153442),
 ('person', 0.6745182275772095),
 ("man's", 0.6629161238670349)]

In [25]:
model.wv.most_similar("awful")

[('atrocious', 0.8013814091682434),
 ('horrible', 0.7545210123062134),
 ('terrible', 0.7498117089271545),
 ('dreadful', 0.737442135810852),
 ('horrendous', 0.7007668018341064),
 ('awful,', 0.6943765878677368),
 ('horrid', 0.6922485828399658),
 ('appalling', 0.6758695840835571),
 ('amateurish', 0.6719341278076172),
 ('abysmal', 0.6670199036598206)]