Words to numbers
================

In [1]:
from utils.word_2_vec import *

## Get the list of words from text sorted by their count

In [2]:
# Load the corpus. It is passed to set() below to obtain the vocabulary, so it
# is treated as an iterable of word tokens.
text_corpus = load_data("./resources/grimm_fairy_tales.txt")
# De-duplicate the tokens; set iteration order is arbitrary, so the order of
# unique_words carries no meaning.
unique_words = list(set(text_corpus))
# Pair every distinct word with its count as a (word, count) tuple.
# NOTE(review): count_word is called once per distinct word -- presumably each
# call scans the whole corpus; confirm against utils.word_2_vec.
unique_word_counts = [(word, count_word(word, text_corpus)) for word in unique_words]
# The return value is not captured, so sort_word_counts presumably sorts the
# list in place (the preview that follows shows counts in descending order).
sort_word_counts(unique_word_counts)

In [3]:
# Preview the first ten (word, count) pairs of the list.
pprint.pprint(unique_word_counts[:10])

[('the', 3674),
 ('and', 2613),
 ('to', 1328),
 ('a', 931),
 ('he', 915),
 ('of', 836),
 ('she', 768),
 ('her', 693),
 ('was', 675),
 ('in', 668)]


## Count co-occurrences of each unique word with a preselected list of words

In [4]:
# Context words (100 entries). Each word vector built from this list holds one
# co-occurrence count per entry, in this order, so changing or reordering the
# list changes the meaning of every vector position.
# NOTE(review): how these words were chosen is not shown in this notebook.
selected_words = ['replied', 'himself', 'can', 'daughter', 'queen', 'beautiful', 'should', 'over', 'told', 
                  'work', 'take', 'cried', 'mother', 'long', 'more', 'last', 'asked', 'however', 'once', 
                  'woman', 'too', 'us', 'good', 'heard', 'been', 'tree', 'nothing', 'called', 'than', 'put', 
                  'back', 'water', 'am', 'tailor', 'morning', 'saying', 'kings', 'wife', 'only', 'children', 
                  'way', 'still', 'ran', 'may', 'made', 'gutenbergtm', 'make', 'well', 'began', 'give', 'set', 
                  'while', 'forest', 'has', 'dear', 'gold', 'quite', 'through', 'golden', 'looked', 'sat', 
                  'sister', 'prince', 'till', 'gave', 'snowwhite', 'oh', 'fire', 'here', 'answered', 'much', 
                  'got', 'found', 'ah', 'first', 'three', 'bed', 'like', 'fell', 'might', 'done', 'get', 
                  'herself', 'house', 'every', 'night', 'bride', 'hansel', 'young', 'heart', 'eat', 'son', 
                  'john', 'eyes', 'stood', 'each', 'know', 'dwarf', 'bread', 'full']

In [5]:
word_vectors_file_path = "./resources/serialized_word_vectors.pickle"

# The per-word co-occurrence counts are cached on disk and reused whenever the
# cache file already exists.
# NOTE(review): the cache is keyed on the file path only. It is NOT invalidated
# when `selected_words`, the corpus or the last argument of count_coocurences
# change -- delete the pickle file after editing any of them.
if os.path.exists(word_vectors_file_path):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook wrote itself, never one from an untrusted source.
    with open(word_vectors_file_path, "rb") as cache_file:
        word_vectors = pickle.load(cache_file)
else:
    word_vectors = dict()
    for word in tqdm(unique_words):
        # One count per selected word, in `selected_words` order.
        # The trailing 5 is presumably the co-occurrence window size -- confirm
        # against count_coocurences in utils.word_2_vec.
        word_vectors[word] = [count_coocurences(word, selected_word, text_corpus, 5)
                              for selected_word in selected_words]
    # Use a context manager so the cache is flushed and closed deterministically.
    with open(word_vectors_file_path, "wb") as cache_file:
        pickle.dump(word_vectors, cache_file)

# The cache stores plain lists; convert to NumPy arrays for vector arithmetic.
word_vectors = {w: np.array(v) for w, v in word_vectors.items()}

In [6]:
# Inspect one vector: position i holds the co-occurrence count with selected_words[i].
print(word_vectors["silver"])

[0 0 0 0 0 1 0 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


## Let's look at the result

In [7]:
def similarity(wordA, wordB, w2v):
    """Return the cosine similarity between the vectors of two words.

    Args:
        wordA: First word; must be a key of ``w2v``.
        wordB: Second word; must be a key of ``w2v``.
        w2v: Mapping from word to its numeric vector (a NumPy array or any
            sequence that ``np.asarray`` accepts).

    Returns:
        ``sum(a * b) / sqrt(sum(a**2) * sum(b**2))``. When either vector is
        all zeros the cosine is undefined; ``0.0`` is returned in that case
        instead of dividing by zero and yielding ``nan``.

    Raises:
        KeyError: If ``w2v`` is a dict and either word is missing from it.
    """
    vecA = np.asarray(w2v[wordA])
    vecB = np.asarray(w2v[wordB])
    # Product of the squared norms; zero exactly when at least one vector is all zeros.
    norm_product_sq = np.sum(vecA**2) * np.sum(vecB**2)
    if norm_product_sq == 0:
        return 0.0
    return np.sum(vecA * vecB) / np.sqrt(norm_product_sq)

In [8]:
# Spot-check the similarity score on a pair of related words.
similarity("gold", "silver", word_vectors)

0.9015749926333749

In [9]:
# Show the first ten words the helper returns for "silver".
# NOTE(review): the ordering is defined in utils.word_2_vec; the result below
# starts with the query word itself, so it looks like most-similar-first.
get_words_sorted_by_similarity("silver", word_vectors)[:10]

  dist = 1.0 - uv / np.sqrt(uu * vv)


['silver',
 'gold',
 'lighted',
 'thingsjewels',
 'fruits',
 'treasures',
 'sticking',
 'glistening',
 'woven',
 'garmented']

## Dividing words by their temperature

In [10]:
# Anchor words that define the two ends of the temperature scale.
hot_words = ["hot", "fire"]
cold_words = ["cold", "ice"]


def word_temperature(word):
    """Score how "hot" or "cold" a word is relative to the anchor words.

    The score is the best similarity to any entry of ``hot_words`` minus the
    best similarity to any entry of ``cold_words``, both computed with
    ``similarity`` on the global ``word_vectors``. A positive value means the
    word sits closer to the hot anchors, a negative value closer to the cold
    ones.
    """
    def best_match(anchors):
        # Highest similarity between `word` and any of the anchor words.
        scores = [similarity(word, anchor, word_vectors) for anchor in anchors]
        return np.max(scores)

    return best_match(hot_words) - best_match(cold_words)

In [11]:
# Probe a cold-associated word; a negative score means the cold anchors matched better.
word_temperature("snow")

-0.14805308450943983

In [12]:
# Another cold-associated probe word.
word_temperature("frozen")

-0.16666666666666666

In [13]:
# A word only indirectly related to cold.
word_temperature("shiver")

-0.06319202925257428

In [14]:
# Probe a warm-associated word; a positive score means the hot anchors matched better.
word_temperature("sun")

0.04438185846777611

In [15]:
# A word only indirectly related to heat.
word_temperature("bright")

0.12931200729757436

In [16]:
# Colour probe: does "red" lean towards the hot anchors?
word_temperature("red")

0.024182541670333724

In [17]:
# Colour probe: a score of exactly zero means the best hot and best cold similarities are equal.
word_temperature("blue")

0.0

In [18]:
# Colour probe: does "yellow" lean towards the hot anchors?
word_temperature("yellow")

0.047794944556367035

# https://www.twinword.com/api/sentiment-analysis.php