In [1]:
import os
import numpy as np
import pandas as pd
from wordEmbedders import Word2Vec

In [2]:
def suggestWordsForCluster(wordvectors, cluster):
    """Propose words that could extend a cluster, closest to the cluster first.

    Every cluster word is looked up with ``wordvectors.most_similar``; any
    returned neighbour that is not already in the cluster becomes a candidate.
    Each candidate is then scored by the mean of
    ``wordvectors.distances(candidate, cluster)``.

    Args:
        wordvectors: Object exposing ``most_similar(word)``, iterated here as
            ``(word, score)`` pairs (the score is ignored), and
            ``distances(word, words)``, whose result is averaged.
            NOTE(review): presumably a gensim-style keyed-vectors wrapper --
            confirm against ``wordEmbedders.Word2Vec``.
        cluster: Iterable of seed words; duplicates are collapsed.

    Returns:
        List of ``(candidate, mean_distance)`` tuples sorted ascending by
        mean distance. Candidates with equal scores are ordered
        alphabetically so the result does not depend on set iteration order.
        An empty cluster yields an empty list.
    """
    cluster = set(cluster)

    # Union of all neighbours of the cluster words, minus the cluster itself.
    candidates = {
        neighbour
        for word in cluster
        for neighbour, _ in wordvectors.most_similar(word)
        if neighbour not in cluster
    }

    scored = [
        (candidate, np.average(wordvectors.distances(candidate, cluster)))
        for candidate in candidates
    ]

    # Secondary key on the word itself: `candidates` is a set, so without it
    # ties would come out in hash-seed-dependent order across kernel restarts.
    return sorted(scored, key=lambda pair: (pair[1], pair[0]))

In [3]:
# Load the Word2Vec model saved under the Sentiment140 models folder.
# `wv` is the object the suggestion cells below query.
modelFile = './models/Sentiment140/Word2Vec.model'  # plain literal: the f-prefix had no placeholders
wv = Word2Vec.load(modelFile)

In [23]:
# Load the Word2Vec model saved under the AirlineTweets models folder.
# NOTE: this rebinds `modelFile` and `wv`, replacing whatever model an earlier
# cell loaded; every cell that uses `wv` sees whichever load ran last.
modelFile = './models/AirlineTweets/Word2Vec.model'  # plain literal: the f-prefix had no placeholders
wv = Word2Vec.load(modelFile)

In [4]:
# Seed cluster of positive words used to query the loaded model.
positiveWords = [
    "good", "nice", "cool", "lovely", "wonderful", "great",
    "awesome", "fantastic", "amazing", "fun", "excellent",
]

# Each item is a (candidate, score) tuple; print one per line.
for suggestion in suggestWordsForCluster(wv, positiveWords):
    print(suggestion)

('beautiful', 0.5822675)
('fabulous', 0.6081853)
('fab', 0.6144527)
('gorgeous', 0.6382786)
('enjoy', 0.65614396)
('incredible', 0.67161566)
('brilliant', 0.69185996)
('loved', 0.70654774)
('love', 0.71615195)
('interesting', 0.72527385)
('blast', 0.73187906)
('amaaaazing', 0.7374783)
('joyous', 0.7410322)
('outstanding', 0.74791694)
('awsome', 0.7500341)
('weekend', 0.7596955)
('restful', 0.7669121)
('sunny', 0.76807153)
('goooooood', 0.76990324)
('lots', 0.76995164)
('phenomenal', 0.77025163)
('adventurous', 0.77101445)
('prosperous', 0.7746502)
('aberdeenshire', 0.7783002)
('gloriously', 0.7786415)
('remarkable', 0.801076)
('sunshine', 0.8036303)
('gooood', 0.8097563)
('cornhole', 0.8120153)
('amazinggg', 0.8130605)
('morning', 0.81409395)
('well', 0.815741)
('hot', 0.8180826)
('neat', 0.82280046)
('cute', 0.82512224)
('amazingggg', 0.82757944)
('party', 0.8353131)
('danmatthews', 0.8373432)
('funny', 0.85049474)
('tiring', 0.85395247)
('bad', 0.85535854)
('cooool', 0.8745476)
('fun

In [6]:
# Seed cluster of negative words used to query the loaded model.
negativeWords = [
    "bad", "horrible", "terrible", "awful", "worst",
    "shitty", "crappy", "sucks", "hate", "despise",
]

# Each item is a (candidate, score) tuple; print one per line.
for suggestion in suggestWordsForCluster(wv, negativeWords):
    print(suggestion)

('ugh', 0.631745)
('crap', 0.6605822)
('stupid', 0.66164094)
('worse', 0.6637148)
('horrid', 0.6654697)
('sick', 0.6738985)
('suck', 0.679361)
('dizziness', 0.68060195)
('baaad', 0.6968435)
('wretched', 0.7021251)
('crummy', 0.70287025)
('ibs', 0.71255636)
('prone', 0.72431284)
('dreadful', 0.7299045)
('pissy', 0.73042226)
('miserable', 0.7313241)
('horrific', 0.7407146)
('irritable', 0.7409125)
('really', 0.74902475)
('ineffective', 0.75003755)
('hates', 0.7612653)
('chronic', 0.7622753)
('rubbish', 0.7672777)
('good', 0.7703985)
('weirdest', 0.77220356)
('bleh', 0.77296484)
('stupidest', 0.77336454)
('sux', 0.77569866)
('royally', 0.777575)
('suckssss', 0.779333)
('stinks', 0.7818774)
('draining', 0.7841254)
('disgusting', 0.78511447)
('sympathise', 0.7932427)
('hating', 0.79670995)
('terrifying', 0.80061805)
('why', 0.80403864)
('blows', 0.8074517)
('whiney', 0.80920637)
('fractions', 0.8137314)
('sometimes', 0.8164588)
('irresponsible', 0.8178096)
('dislike', 0.8199623)
('unmotivat

In [7]:
# Persist the current `positiveWords` list as one comma-separated line.
os.makedirs('./wordlists', exist_ok=True)  # open() below raises if the folder is missing
# Explicit encoding so the file does not depend on the platform's locale default.
with open('./wordlists/positiveWords.txt', mode='w', encoding='utf-8') as f:
    f.write(','.join(positiveWords))

In [8]:
# Persist the current `negativeWords` list as one comma-separated line.
os.makedirs('./wordlists', exist_ok=True)  # open() below raises if the folder is missing
# Explicit encoding so the file does not depend on the platform's locale default.
with open('./wordlists/negativeWords.txt', mode='w', encoding='utf-8') as f:
    f.write(','.join(negativeWords))

In [9]:
# Expanded positive cluster. The original literal listed 'great' twice; the
# duplicate is dropped here. Misspellings such as 'beatiful' are kept as they
# were in the original list.
positiveWords = [
    'good', 'great', 'nice', 'well', 'lovely', 'fantastic', 'wonderful',
    'better', 'beautiful', 'gorgeous', 'fabulous', 'enjoy', 'perfect',
    'amazing', 'awesome', 'fab', 'enjoying', 'fun', 'loving', 'splendid',
    'relaxing', 'excellent', 'enjoyed', 'gloriously', 'best', 'cool',
    'stunning', 'beatiful', 'incredible', 'breathtaking', 'brilliant',
    'terrific', 'love', 'enjoyable', 'heavenly',
]

# Each item is a (candidate, score) tuple; print one per line.
for suggestion in suggestWordsForCluster(wv, positiveWords):
    print(suggestion)

('summery', 0.75045615)
('sunny', 0.7524106)
('evening', 0.7604343)
('joyous', 0.7611217)
('fantabulous', 0.7660079)
('cooperates', 0.7701685)
('aberdeenshire', 0.7704472)
('postalguarelas', 0.772976)
('loved', 0.77560383)
('sunshine', 0.7778457)
('brill', 0.7804532)
('basking', 0.7814349)
('energizing', 0.7820011)
('sunday', 0.7840473)
('atthe', 0.7872841)
('acropolis', 0.7872985)
('wondeful', 0.78964275)
('prosperous', 0.7899814)
('weekend', 0.790099)
('redcliffe', 0.7917507)
('restful', 0.79217947)
('sunnier', 0.79360014)
('goooooood', 0.794652)
('sorrento', 0.79581106)
('scorcher', 0.7960695)
('stonehenge', 0.80023324)
('amaaaazing', 0.8006086)
('phenomenal', 0.8021485)
('daaay', 0.8022248)
('scenery', 0.8029747)
('browntowers', 0.8047501)
('afternoon', 0.8060583)
('unbelievable', 0.8077711)
('celcius', 0.8081646)
('blast', 0.8102877)
('warm', 0.8170857)
('weather', 0.81846446)
('stacijshelton', 0.81903136)
('enjoyin', 0.819066)
('levywedding', 0.81960225)
('morning', 0.81997794)
(

In [41]:
# Expanded negative cluster used to query the loaded model.
negativeWords = [
    'bad', 'horrible', 'terrible', 'awful', 'worse', 'worst', 'horrid',
    'horrific', 'terrifying', 'ugh', 'disgusting', 'horrifying', 'nasty',
    'hate', 'shitty', 'crappy', 'sucks', 'baaad', 'suck', 'crap', 'badddd',
    'gross', 'despise', 'shit', 'dreadful', 'pissy', 'unpleasant', 'heinous',
    'wretched', 'stupid', 'nauseated', 'nauseous', 'nausious',
]

# Each item is a (candidate, score) tuple; print one per line.
for suggestion in suggestWordsForCluster(wv, negativeWords):
    print(suggestion)

('dizziness', 0.668331)
('queasy', 0.6848122)
('sick', 0.68968564)
('bunged', 0.69789445)
('sorethroat', 0.70960397)
('sneezes', 0.7110739)
('sandpaper', 0.71762073)
('dizzyness', 0.72349185)
('ibs', 0.7245305)
('hurl', 0.72550714)
('prone', 0.72634935)
('scratchy', 0.726472)
('lightheaded', 0.7284561)
('icky', 0.73016423)
('lingering', 0.7311427)
('ineffective', 0.73308516)
('coughy', 0.7378253)
('achoo', 0.73866314)
('achey', 0.7423365)
('hurtss', 0.7425254)
('hurtssss', 0.7433706)
('feverish', 0.7454103)
('alleviate', 0.7467811)
('sniffly', 0.7484043)
('crummy', 0.74846566)
('crabby', 0.7510733)
('stinging', 0.7522747)
('debilitating', 0.7531113)
('crampy', 0.75342226)
('unsettled', 0.7537643)
('owch', 0.75395507)
('feeeeel', 0.75447047)
('blahhh', 0.7555515)
('tired', 0.75746083)
('hurts', 0.7575512)
('indigestion', 0.7576462)
('nosed', 0.75789624)
('achy', 0.7592042)
('chronic', 0.7615639)
('unusually', 0.7619205)
('irritable', 0.76218563)
('spew', 0.7635607)
('better', 0.7635806)