In [11]:
from gensim.models import KeyedVectors

filename = 'word2vecSmall.bin.gz'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [12]:
import nltk
nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/jackieoh/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jackieoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
from nltk.corpus import sentiwordnet as swn
import numpy as np

sentiment_scores = {}

for word in model.vocab:
    score = list(swn.senti_synsets(word))
    if score:
        compound_score = score[0].pos_score() - score[0].neg_score()
        if compound_score != 0:
            sentiment_scores[word] = compound_score

words_to_word_embeddings = []

for word in sentiment_scores.keys():
    words_to_word_embeddings.append([word, np.array(model[word]), sentiment_scores[word]])

In [18]:
import pandas as pd

df = pd.DataFrame(words_to_word_embeddings, columns=['word', 'embedding', 'score'])

In [34]:
# These are the words we want to test our model with 
blacklist = ['nice', 'mean', 'bad', 'good', 'sad', 'happy', 'fantastic', 'terrible']

In [71]:
# These are the 
test_df = df[df['word'].str.lower().str.strip().isin(blacklist)]
test_df

Unnamed: 0,word,embedding,score
16,good,"[0.040527344, 0.0625, -0.017456055, 0.07861328...",0.5
106,bad,"[0.06298828, 0.12451172, 0.11328125, 0.0732421...",-0.875
164,happy,"[-0.0005187988, 0.16015625, 0.0016098022, 0.02...",0.875
649,Good,"[-0.10888672, -0.07470703, -0.045410156, -0.00...",0.5
717,fantastic,"[-0.122558594, -0.037841797, -0.12402344, 0.02...",0.375
750,sad,"[0.18945312, 0.045898438, 0.06689453, -0.04467...",-0.625
907,terrible,"[0.1640625, 0.19238281, 0.092285156, 0.1308593...",-0.625
1833,Bad,"[-0.078125, -0.11279297, 0.018676758, 0.080566...",-0.875
2288,Happy,"[0.05078125, -0.109375, -0.12597656, 0.1240234...",0.875
2638,GOOD,"[-0.34179688, -0.41015625, 0.45117188, -0.2871...",0.5


In [72]:
train_df = df[~df['word'].str.lower().str.strip().isin(blacklist)]
train_df

Unnamed: 0,word,embedding,score
0,is,"[0.0070495605, -0.07324219, 0.171875, 0.022583...",0.125
1,not,"[0.08496094, -0.095214844, 0.119140625, 0.1118...",-0.625
2,will,"[0.048828125, 0.16699219, 0.16894531, 0.087402...",0.125
3,an,"[0.12597656, 0.19042969, 0.06982422, 0.0722656...",-0.125
4,had,"[-0.05810547, 0.05810547, 0.013305664, -0.0003...",0.250
5,were,"[-0.10058594, -0.024658203, 0.092285156, -0.04...",0.125
6,been,"[-0.10107422, 0.017700195, 0.014709473, 0.0275...",0.125
7,new,"[0.011291504, 0.028930664, 0.083496094, -0.049...",0.375
8,other,"[-0.04248047, -0.08251953, 0.043945312, 0.1318...",-0.625
9,just,"[0.10107422, -0.0038146973, 0.018188477, 0.129...",0.625


In [70]:
~df['word'].str.lower().str.strip().isin(blacklist)

0         True
1         True
2         True
3         True
4         True
5         True
6         True
7         True
8         True
9         True
10        True
11        True
12        True
13        True
14        True
15        True
16       False
17        True
18        True
19        True
20        True
21        True
22        True
23        True
24        True
25        True
26        True
27        True
28        True
29        True
         ...  
13495     True
13496     True
13497     True
13498     True
13499     True
13500     True
13501     True
13502     True
13503     True
13504     True
13505     True
13506     True
13507     True
13508     True
13509     True
13510     True
13511     True
13512     True
13513     True
13514     True
13515     True
13516     True
13517     True
13518     True
13519     True
13520     True
13521     True
13522     True
13523     True
13524     True
Name: word, Length: 13525, dtype: bool

In [41]:
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LinearRegression

#X_train, X_test, y_train, y_test = train_test_split(np.array(list(df['embedding'].values)), np.array(df['score'].values), test_size=0.2)
                                                    
                                                    

In [89]:
X_train = np.array(list(train_df['embedding'].values))
X_test = np.array(list(train_df['score'].values))
X_train.shape, X_test.shape

((13510, 300), (13510,))

In [90]:
X_train.shape

(13510, 300)

In [91]:
classifier = LinearRegression()
#classifier.fit(X_train, y_train)
classifier.fit(X_train, X_test)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [51]:
print("Score:", classifier.score(X_test, y_test))

Score: 0.3181486154131491


In [64]:
classifier.predict([np.array(model['rich'])])

array([0.15140586], dtype=float32)

In [96]:
classifier.coef_.shape

(300,)