In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [2]:
# Load the saved Word2Vec model from disk and keep only its `wv` attribute
# (the keyed word vectors); the rest of the loaded model is not retained.
word_vectors = (
    Word2Vec
    .load("word2vec.model")
    .wv
)

In [7]:
# Fit a two-cluster K-means model on the embedding matrix, cast to double
# precision first.
embeddings_f64 = word_vectors.vectors.astype('double')
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    # NOTE(review): random_state is given a boolean; presumably it behaves as
    # the integer seed 1 -- consider passing an explicit integer instead.
    random_state=True,
    n_init=50,
).fit(X=embeddings_f64)

In [10]:
# Show the ten vocabulary words most similar to the centre of cluster 1
# (no vocabulary restriction). Presumably inspected by hand to decide which
# cluster should be treated as the positive one -- confirm with the author.
word_vectors.similar_by_vector(
    model.cluster_centers_[1],
    topn=10,
    restrict_vocab=None,
)

[('help_gave', 0.3629717230796814),
 ('spread', 0.3598177134990692),
 ('travel', 0.3429921269416809),
 ('masks', 0.30952897667884827),
 ('less', 0.30892136693000793),
 ('according', 0.3031187057495117),
 ('ceremony', 0.2890661060810089),
 ('year', 0.28855928778648376),
 ('group', 0.2878999412059784),
 ('social_media', 0.27964162826538086)]

In [11]:
# Cluster 1 is designated as the positive cluster; with only two clusters
# (indices 0 and 1) the other index is obtained as 1 - positive index.
positive_cluster_index = 1
negative_cluster_index = 1 - positive_cluster_index

positive_cluster_center = model.cluster_centers_[positive_cluster_index]
negative_cluster_center = model.cluster_centers_[negative_cluster_index]

In [13]:
# Build a table with one row per vocabulary word holding the word itself,
# its embedding vector, and the K-means cluster it is assigned to.
words = pd.DataFrame({'words': word_vectors.index_to_key})

# Look every word up by key so each row carries its own embedding.
words['vectors'] = words.words.apply(lambda word: word_vectors[word])

# Predict the cluster of all words in one batched call instead of invoking
# the model once per row and unwrapping a one-element result each time.
words['cluster'] = model.predict(np.vstack(words.vectors.tolist()))

In [14]:
# Assign each word a sentiment score: +1 if it belongs to the positive
# cluster, -1 otherwise, weighted by how close the word lies to a cluster
# centre (a rough measure of how strongly positive/negative it is).
# model.transform returns each word's distance to every cluster centre; the
# smallest of those distances is inverted, so words nearer a centre receive a
# larger weight (equivalently, the +/-1 score is divided by that distance).

words['cluster_value'] = np.where(words.cluster == positive_cluster_index, 1, -1)

# A single batched transform replaces one model call per DataFrame row.
centroid_distances = model.transform(np.vstack(words.vectors.tolist()))
words['closeness_score'] = 1 / centroid_distances.min(axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [15]:
# Display the scored vocabulary table as this cell's output.
words

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,new,"[-0.014966709, 0.010970789, 0.09808832, 0.1662...",0,-1,1.009614,-1.009614
1,said,"[-0.1531129, 0.06944403, 0.09789845, 0.1087224...",1,1,1.014201,1.014201
2,u,"[-0.0062855785, 0.06274626, -0.10941577, -0.01...",0,-1,0.999861,-0.999861
3,one,"[-0.14592943, 0.16501668, 0.0027042925, -0.029...",0,-1,1.006403,-1.006403
4,first,"[-0.12830456, 0.030621363, -0.11716264, -0.033...",1,1,1.018103,1.018103
...,...,...,...,...,...,...
425,often,"[-0.024559451, 0.0804692, -0.14266136, 0.06234...",0,-1,1.011132,-1.011132
426,experts,"[-0.019062854, 0.04005569, -0.07883035, -0.013...",0,-1,1.020028,-1.020028
427,running,"[-0.056957595, 0.15651073, 0.046570297, -0.162...",1,1,0.999435,0.999435
428,live,"[0.0005727796, 0.041767076, -0.109537534, -0.1...",1,1,1.003040,1.003040


In [16]:
# Export the word -> sentiment coefficient mapping as a CSV file, writing
# only those two columns and omitting the DataFrame index.
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)