# Word Embeddings on Harry Potter

##### (Notebook by Itay Hazan)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

  from ._conv import register_converters as _register_converters


## Step 1: Use gensim implementation

In [3]:
import gensim

The following code reads the entire Harry Potter series into a list, split by periods:

In [60]:
def get_harry_potter_books(directory='HarryPotter', num_books=7, encoding=None):
    """Read the numbered book files and return their text split on periods.

    Files are expected at ``<directory>/1.txt`` ... ``<directory>/<num_books>.txt``.
    Calling the function without arguments reads ``HarryPotter/1.txt`` through
    ``HarryPotter/7.txt``.

    Args:
        directory: folder that contains the numbered ``.txt`` files.
        num_books: how many files to read, starting from ``1.txt``.
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A flat list of strings: the fragments of every book, in book order,
        obtained by splitting each file's text on ``'.'``. Every period
        splits, including the ones in abbreviations such as "Mr.", and the
        text after a file's last period is kept as its own (possibly empty)
        fragment.

    Raises:
        OSError: if one of the files cannot be opened.
    """
    import os

    books = []
    for number in range(1, num_books + 1):
        path = os.path.join(directory, '{}.txt'.format(number))
        with open(path, encoding=encoding) as f:
            books.extend(f.read().split('.'))

    return books

In [61]:
# Load every book as one flat list of period-delimited text fragments.
books = get_harry_potter_books()

Complete the following function, that does some basic pre-processing on the texts:

In [111]:
def pre_processing(books, min_len=2):
    """Turn raw text fragments into lists of lowercase word tokens.

    Each fragment is lowercased, every punctuation character is replaced by a
    space, and the result is split on whitespace (which also discards
    end-of-line characters). Tokens shorter than ``min_len`` are dropped; the
    default of 2 reproduces the recorded notebook output, where one-letter
    tokens such as "a" or the "t" left over from "didn't" do not appear.

    Args:
        books: iterable of strings (the text fragments to process).
        min_len: minimum number of characters a token must have to be kept.
            Pass 1 to keep every token.

    Returns:
        A list of lists: element i is the list of tokens extracted from the
        i'th input string. A string with no tokens yields an empty list, so
        the output always has one entry per input.
    """
    import re

    # Anything that is neither a word character nor whitespace counts as
    # punctuation; '_' is listed explicitly because \w would otherwise keep it.
    punctuation = re.compile(r'[^\w\s]|_')

    lst = []
    for text in books:
        cleaned = punctuation.sub(' ', text.lower())
        # str.split() with no argument splits on any whitespace run,
        # including '\n' and '\r', so no separate end-of-line pass is needed.
        lst.append([token for token in cleaned.split() if len(token) >= min_len])

    return lst

In [63]:
# NOTE: this rebinds `books`. Before this cell it holds the raw text fragments;
# afterwards it holds the pre-processed result. Re-run the loading cell before
# re-running this one.
books = pre_processing(books)

Print the first ten sentences (after pre-processing)

In [67]:
# Slice rather than index so that a corpus with fewer than ten entries prints
# what is available instead of raising IndexError.
for sentence in books[:10]:
    print(sentence)

['the', 'boy', 'who', 'lived', 'mr']
['and', 'mrs']
['dursley', 'of', 'number', 'four', 'privet', 'drive', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', 'thank', 'you', 'very', 'much']
['they', 'were', 'the', 'last', 'people', 'you', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', 'because', 'they', 'just', 'didn', 'hold', 'with', 'such', 'nonsense']
['mr']
['dursley', 'was', 'the', 'director', 'of', 'firm', 'called', 'grunnings', 'which', 'made', 'drills']
['he', 'was', 'big', 'beefy', 'man', 'with', 'hardly', 'any', 'neck', 'although', 'he', 'did', 'have', 'very', 'large', 'mustache']
['mrs']
['dursley', 'was', 'thin', 'and', 'blonde', 'and', 'had', 'nearly', 'twice', 'the', 'usual', 'amount', 'of', 'neck', 'which', 'came', 'in', 'very', 'useful', 'as', 'she', 'spent', 'so', 'much', 'of', 'her', 'time', 'craning', 'over', 'garden', 'fences', 'spying', 'on', 'the', 'neighbors']
['the', 'dursleys', 'had', 'small', 'son',

In [69]:
# Number of entries in the pre-processed `books` list.
len(books)

86672

Next, we initialize a Word2Vec model from gensim:

In [109]:
# Build a gensim Word2Vec model over the tokenized `books` list, then call
# train() on the same data for 10 epochs.
# NOTE(review): per the gensim documentation, passing a corpus to the
# constructor already builds the vocabulary and trains the model; if so, the
# explicit train() call below is an additional pass — confirm this is intended.
# NOTE(review): `size=` is the keyword used by gensim releases before 4.0
# (later renamed `vector_size`) — confirm against the installed version.
# NOTE(review): no random seed is set, so results may differ between runs.
model = gensim.models.Word2Vec(books, size=150, window=10, min_count=2, workers=10)
model.train(books, total_examples=len(books),epochs=10)

(8025370, 10597000)

In [133]:
def wv(x):
    """Look up `x` in the trained model via `model.wv.word_vec`."""
    return model.wv.word_vec(x)


# Vector-arithmetic query, kept disabled:
# w1 = "lord"
# model.wv.most_similar(positive=[wv("harry")-wv("magic")+wv("muggle")])

# Words the model ranks as most similar to "hagrid".
model.wv.most_similar(positive="hagrid")



[('grawp', 0.600788950920105),
 ('yeh', 0.5458800196647644),
 ('fang', 0.5186895132064819),
 ('boarhound', 0.49009260535240173),
 ('anythin', 0.4781782925128937),
 ('bane', 0.4736838936805725),
 ('wha', 0.47167322039604187),
 ('buckbeak', 0.4709952473640442),
 ('em', 0.46213701367378235),
 ('hippogriff', 0.45905590057373047)]

## Step 2 (Optional): Implementing CBOW with Negative Sampling

### Step 2.1: Setting things up 

First, get a list of all unique words in the dataset, and sort them alphabetically.

In [None]:
# Unique words across all tokenized sentences, sorted alphabetically so that
# each word's position in the list can serve as a stable index.
all_words = sorted({word for sentence in books for word in sentence})

num_words = len(all_words)

Next, create an inverted index of the words:

In [None]:
# Inverted index: vocabulary[word] is the position of `word` in `all_words`.
vocabulary = {word: index for index, word in enumerate(all_words)}

Now, compute the number of occurrences of each word in our dataset (a histogram):

In [106]:
# Count every token once, then lay the counts out in the order of `all_words`
# so that hist[i] is the number of occurrences of all_words[i].
word_counts = {}
for sentence in books:
    for word in sentence:
        word_counts[word] = word_counts.get(word, 0) + 1

hist = [word_counts[word] for word in all_words]

Now, given the histogram `hist`, write a function that returns a probability distribution over the words in the histogram such that a more popular word has a higher probability of being chosen:
$$ \Pr [\text{sampling the $i$'th word}] = \frac{hist[i]}{\sum_j hist[j]} $$

Remark: it is customary to raise the elements on the right-hand side of the equality to some power smaller than 1, e.g.:
$$ \Pr [\text{sampling the $i$'th word}] = \frac{hist[i]^{3/4}}{\sum_j hist[j]^{3/4}} $$
You may use this in your code as well (empirically, it gives better performance).

In [None]:
# Sampling distribution over words: counts raised to the 3/4 power (as in the
# remark above) and normalised so the entries sum to 1. distribution[i] is the
# probability of sampling the i'th word.
smoothed_counts = np.asarray(hist, dtype=np.float64) ** 0.75
distribution = smoothed_counts / smoothed_counts.sum()

### Step 2.2: Construct the train set
We define the window size and the number of negative samples drawn for each window (the dimension of the embedding is defined later, in Step 2.3).

In [113]:
# Size of the context window used when building (context, center) examples.
# NOTE(review): whether this counts words per side or in total is not fixed
# here — decide when generating the dataset.
window_size = 10
# Number of negative words sampled for every (context, center) window.
neg_sample_size = 5

Our train set will consist of labeled pairs `(x=(context, center), y=0/1)`:

To create the train set:

 1. For every `window=(context, center)` of the input
   1. Add the pair `(x=(context, center), y=1)` to the dataset.
   1. Sample `neg_sample_size` words, `w_1, ..., w_k`, from the distribution we computed in step 2.1, and add all the pairs `(x=(context, w_i), y=0)` to the dataset. 

In [114]:
# TODO: generate the dataset

### Step 2.3: Construct the neural net
We are going to create the following network architecture for negative sampling.

![Negative Sampling Architecture](neg_sampling.png "Negative Sampling")

In [None]:
# Same value as the `size=150` passed to the gensim model in Step 1.
size = 150 # dimension of the embedding
# TODO: build the negative-sampling network described above.

### Step 2.4: Train and evaluate

Write a function that, given a word, returns the 10 most similar words to it.

In [134]:
def most_similar(word):
    """Return the 10 words most similar to `word` (not implemented yet).

    TODO: implement using the embeddings learned in Step 2. This placeholder
    currently ignores `word` and always returns None.
    """
    return None

Play with it :)