In [1]:
import os
import numpy as np
import pandas as pd
from wordEmbedders import Word2Vec

In [3]:
def suggestWordsForCluster(wordvectors, cluster):
    """Propose words that could be added to an existing word cluster.

    Every member of ``cluster`` is queried with ``wordvectors.most_similar``;
    each returned word that is not already a member becomes a candidate.
    Each candidate is then scored with the average of
    ``wordvectors.distances(candidate, members)``.

    Parameters
    ----------
    wordvectors
        Object exposing ``most_similar(word)`` (yielding ``(word, score)``
        pairs) and ``distances(word, other_words)``.
        NOTE(review): looks like a gensim-KeyedVectors-style interface --
        confirm against the ``wordEmbedders`` wrapper.
    cluster : iterable of str
        Words that already belong to the cluster; duplicates are ignored.

    Returns
    -------
    list of (str, float)
        ``(candidate, average distance)`` pairs sorted by ascending average
        distance. Empty when ``cluster`` is empty.
    """
    members = set(cluster)

    # Collect every neighbour of every member, skipping words already in the cluster.
    neighbours = {
        neighbour
        for member in members
        for neighbour, _score in wordvectors.most_similar(member)
        if neighbour not in members
    }

    # Score each candidate against the whole cluster rather than a single member.
    scored = [
        (candidate, np.average(wordvectors.distances(candidate, members)))
        for candidate in neighbours
    ]
    return sorted(scored, key=lambda pair: pair[1])

In [27]:
# Corpus name: selects the model directory below and is also interpolated into
# the ./data/{dataset}/ paths used when the word lists are written out.
dataset = 'IMDB'
# Path of the saved embedding model, relative to the notebook's working directory.
modelFile = f'./models/{dataset}/Word2Vec.model'
# Load the embedding through the project-local Word2Vec wrapper (wordEmbedders).
# `wv` is what gets passed to suggestWordsForCluster, so it must provide
# most_similar() and distances().
wv = Word2Vec.load(modelFile)

In [31]:
# Seed vocabulary of the positive cluster; this same list is written to
# positiveWords.txt further down.
positiveWords = [
    'good', 'nice', 'great', 'wonderful', 'terrific',
    'cool', 'fantastic', 'excellent', 'awesome', 'brilliant',
]

# Each suggestion is a (word, average distance) tuple, lowest average first.
# The list is for manual review only: the recorded output of this cell also
# contains 'awful' and 'bad'.
for suggestion in suggestWordsForCluster(wv, positiveWords):
    print(suggestion)

('top_notch', 0.5095391)
('superb', 0.51829046)
('outstanding', 0.5419428)
('pretty_good', 0.5439246)
('amazing', 0.54664433)
('fabulous', 0.55199707)
('phenomenal', 0.55372417)
('marvellous', 0.5545981)
('incredible', 0.5736557)
('first_rate', 0.5812179)
('exceptional', 0.5874356)
('marvelous', 0.5933228)
('pretty_cool', 0.60975015)
('magnificent', 0.6115242)
('splendid', 0.6163283)
('pretty_decent', 0.6173191)
('decent', 0.62340504)
('absolutely_amazing', 0.6287398)
('nicely_done', 0.63331896)
('alright', 0.6557807)
('passable', 0.66536707)
('below_average', 0.6697382)
('uniformly_excellent', 0.67031443)
('awful', 0.6732474)
('interesting', 0.6802691)
('bad', 0.6839771)
('cute', 0.69487095)
('neat', 0.69760376)
('cheesy', 0.7182783)
('funny', 0.7185991)
('pleasant', 0.73870176)
('hot', 0.77199167)
('watchable', 0.7773117)


In [35]:
# Seed vocabulary of the negative cluster; this same list is written to
# negativeWords.txt further down.
negativeWords = [
    'bad', 'awful', 'horrible', 'terrible', 'atrocious',
    'horrid', 'horrendous', 'dreadful', 'abysmal', 'lousy',
]

# Each suggestion is a (word, average distance) tuple, lowest average first.
# The list is for manual review only: the recorded output of this cell also
# contains 'good'.
for suggestion in suggestWordsForCluster(wv, negativeWords):
    print(suggestion)

('appalling', 0.50595665)
('god_awful', 0.52027214)
('unbelievably_bad', 0.5300555)
('extremely_poor', 0.5331172)
('sub_par', 0.53416383)
('absolutely_terrible', 0.5369204)
('pathetic', 0.5388743)
('truly_awful', 0.53970456)
('absolutely_atrocious', 0.54830515)
('laughably_bad', 0.54886687)
('poorly_done', 0.5707064)
('mediocre', 0.577185)
('lame', 0.58730936)
('amateurish', 0.58829844)
('crappy', 0.597208)
('good', 0.6034783)
('cheesy', 0.6068131)
('abominable', 0.61176044)
('dismal', 0.61886305)
('second_rate', 0.6357393)


In [36]:
# Persist the positive seed list as one comma-separated line (no trailing newline).
# The encoding is pinned so the file does not depend on the platform's default
# text encoding; for the current ASCII-only list the bytes written are unchanged.
with open(f'./data/{dataset}/positiveWords.txt', mode='w', encoding='utf-8') as f:
    f.write(','.join(positiveWords))

In [37]:
# Persist the negative seed list as one comma-separated line (no trailing newline).
# The encoding is pinned so the file does not depend on the platform's default
# text encoding; for the current ASCII-only list the bytes written are unchanged.
with open(f'./data/{dataset}/negativeWords.txt', mode='w', encoding='utf-8') as f:
    f.write(','.join(negativeWords))