In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

#### Using external modules
To make our lives simpler, and to make these notebooks a little less crowded, we can stick functions we use frequently into separate modules. (These are files with the extension .py.) Then we can import individual functions from them, or we can import everything.

In [None]:
from utilities import *

<h1>Vector space models</h1>

<h2>Vectors review</h2>

In [None]:
# Two example vectors, each stored as a plain Python list of components.
v1, v2 = [3, 4], [5, 2]

In [None]:
def plot_vec(v):
    """Draw ``v`` as an arrow from the origin to ``(v[0], v[1])`` with pyplot.

    Only the first two components of ``v`` are used. Returns ``None``.
    """
    dx, dy = v[0], v[1]
    # length_includes_head=True makes the arrow tip end exactly at (dx, dy).
    plt.arrow(0, 0, dx, dy, head_width=.2, head_length=.4, length_includes_head=True)
# Turn on grid lines and fix the view to x in [0, 10], y in [0, 10]
# before drawing both vectors.
plt.grid(True)
plt.axis([0, 10, 0, 10])
plot_vec(v1)
plot_vec(v2)

To get the magnitude (length) of a vector, use the **Pythagorean theorem**: $|\vec{v}| = \sqrt{v_1^2 + v_2^2}$.

In [None]:
from math import sqrt
# Pythagorean theorem: the length is the square root of the sum of the
# squared components. Read the components from v1 rather than repeating
# them as literals, so the result follows the vector if it is edited.
magnitude_of_v1 = sqrt(v1[0]**2 + v1[1]**2)
magnitude_of_v1

### The dot product

The dot product of two vectors can be found in one of two ways:

$\vec{v}\bullet\vec{w} = v_1 w_1 + v_2 w_2 + v_3 w_3 + \dots$

$\vec{v}\bullet\vec{w} = |\vec{v}||\vec{w}|\cos(\theta)$

In Python, it looks like this:

In [None]:
# Multiply matching components and add the products up. Pairing with zip
# covers every dimension, not just the first two.
sum(a * b for a, b in zip(v1, v2))

NumPy has a function to do this for you: `dot`.

In [None]:
from numpy import dot
# dot() accepts the plain lists directly and returns the sum of the pairwise products.
dot(v1, v2)

You can use `dot` to compute the magnitude of a vector in a compact manner

In [None]:
# dot(v1, v1) is the sum of the squared components, so its square root is the magnitude.
v1_mag = sqrt(dot(v1, v1))
v1_mag

### Normalizing vectors

Normalizing a vector means converting it to point in the same direction but to have a length of 1.

To accomplish this, we divide each dimension of the vector by the length of the vector.

In [None]:
# Divide every component by the magnitude. The comprehension handles a
# vector of any dimension instead of spelling out exactly two components.
v1_normalized = [component / v1_mag for component in v1]
v1_normalized

In [None]:
# Sanity check: a unit vector dotted with itself should be 1 (up to floating-point rounding).
dot(v1_normalized, v1_normalized)

### Vectors in numpy

What happens if we try to add and multiply vectors without numpy?

It doesn't do what we want it to do.

In [None]:
# For Python lists, + concatenates the two lists instead of adding component-wise.
v1 + v2

In [None]:
# For Python lists, * repeats the list instead of scaling each component.
3 * v1

With numpy, vectors behave like vectors

In [None]:
import numpy as np
# Rebind v1 and v2 to numpy arrays holding the same components.
v1, v2 = np.array([3, 4]), np.array([5, 2])

In [None]:
# For numpy arrays, + adds matching components.
v1 + v2

In [None]:
# For numpy arrays, multiplying by a scalar scales every component.
3 * v1

In [None]:
# Two equivalent ways to get the magnitude: first by hand via the dot
# product, then with numpy's built-in norm (whose result is kept in v1_mag).
print(sqrt(dot(v1, v1)))
v1_mag = np.linalg.norm(v1)
print(v1_mag)

In [None]:
# Dividing an array by a scalar divides every component, giving the unit vector in one expression.
v1_normalized = v1 / v1_mag
print(v1_normalized)

In [None]:
# Same normalization for v2, with the magnitude computed inline.
v2_normalized = v2 / np.linalg.norm(v2)

In [None]:
# Unit vectors are short, so narrow the view to x in [0, 3], y in [0, 3]
# before drawing the two normalized vectors.
plt.grid(True)
plt.axis([0, 3, 0, 3])
plot_vec(v1_normalized)
plot_vec(v2_normalized)

## Converting text to a vector

In [None]:
import nltk

In [None]:
# The three example documents.
t1 = "now is the time for all good men to come to the aid of their country"
t2 = "now is the time for all good women to come to the aid of their country"
t3 = "is it time for the women to lead us all"

# Split each document into a list of word tokens.
t1w, t2w, t3w = (nltk.word_tokenize(text) for text in (t1, t2, t3))

In [None]:
# Every distinct token across the three documents, in sorted order.
vocab = sorted(set(t1w) | set(t2w) | set(t3w))

In [None]:
# Each position in this list becomes one dimension of the document vectors built below.
print(vocab)

In [None]:
# Term-count vector for the first document: entry i is the number of
# times vocab[i] occurs in t1w.
v1 = np.array([t1w.count(term) for term in vocab])
print(v1)

In [None]:
def norm_vec(v):
    """Return ``v`` divided by its norm as computed by ``np.linalg.norm``.

    ``v`` must support division by a scalar (e.g. a numpy array).
    """
    length = np.linalg.norm(v)
    return v / length
# Limit printed floating-point array values to 3 digits of precision to keep the output compact.
np.set_printoptions(precision=3)

In [None]:
# Replace the raw counts with the unit-length version of the same vector.
v1 = norm_vec(v1)
print(v1)

In [None]:
# Build the unit-length term-count vectors for the second and third
# documents in one pass over their token lists.
v2, v3 = (
    norm_vec(np.array([tokens.count(term) for term in vocab]))
    for tokens in (t2w, t3w)
)

In [None]:
# Pairwise dot products of the three document vectors.
# Fix: the third line is labelled "v2 and v3" but previously computed
# dot(v1, v3) again; it now uses v2 and v3.
print("dot product of v1 and v2 is ", dot(v1, v2))
print("dot product of v1 and v3 is ", dot(v1, v3))
print("dot product of v2 and v3 is ", dot(v2, v3))