In [1]:
#https://colab.research.google.com/notebooks/mlcc/intro_to_sparse_data_and_embeddings.ipynb?utm_source=mlcc&utm_campaign=colab-external&utm_medium=referral&utm_content=embeddings-colab&hl=en#scrollTo=jGWqDqFFL_NZ

### Word Embeddings, Bias in ML, Why You Don't Like Math, & Why AI Needs You by Rachel Thomas
- https://github.com/fastai/word-embeddings-workshop
- one-hot encoding disregards the notion of similarity  

- word2vec: not deep learning, although it can be used in deep learning, not an algorithm although algorithms were used to train it
    - training word2vec takes a lot of data, time, and computational power (so an already-trained version is used)
- GloVe is a similar set of embeddings


In [2]:
import numpy as np
import re
import json

In [3]:
# Keep printed arrays compact: 4 decimal places and no scientific notation.
np.set_printoptions(suppress=True, precision=4)

In [4]:
# Pre-computed GloVe embedding matrices stored as .npy files; the file names
# indicate 100- and 50-dimensional vectors.
# Presumably row i holds the vector for words[i] (later cells index `vecs`
# with values taken from `wordidx`) -- TODO confirm against how the files were built.
vecs = np.load('./resource/glove_vectors_100d.npy')
# The 50-d variant is loaded here but not used in the cells visible below.
vecs_50 = np.load('./resource/glove_vectors_50d.npy')

In [5]:
# Vocabulary file: one token per line. Strip the trailing newline (and any
# surrounding whitespace) from every line to get the plain word list.
with open('./resource/words.txt', 'r') as vocab_file:
    content = vocab_file.readlines()
words = list(map(str.strip, content))
len(words)

400000

In [6]:
words[600:610]

['together',
 'congress',
 'index',
 'australia',
 'results',
 'hard',
 'hours',
 'land',
 'action',
 'higher']

In [7]:
# Load the word -> integer index mapping from JSON.
# The context manager closes the file handle deterministically; the previous
# `json.load(open(...))` form left it open until garbage collection.
with open('./resource/wordsidx.txt') as idx_file:
    wordidx = json.load(idx_file)
type(wordidx)

dict

In [8]:
wordidx['hours']

606

In [9]:
wordidx['drug']

780

In [10]:
words[606]

'hours'

### Words as vectors

In [11]:
type(vecs)

numpy.ndarray

In [12]:
wordidx['intelligence']

1226

In [13]:
# Look the row up through `wordidx` instead of hard-coding 1226 (the value the
# previous cell printed), so this cell stays correct if the vocabulary changes.
vecs[wordidx['intelligence']]

array([-0.311 , -0.4329,  0.7773, -0.3112,  0.0529, -0.8502, -0.3537,
       -0.7053,  0.0845,  0.8877,  0.8353, -0.4164,  0.367 ,  0.6083,
        0.0085,  0.9429,  0.5314, -0.7532, -0.8676,  0.3483, -0.2986,
       -0.4344,  0.3514, -1.1228, -1.2564, -0.0942,  0.294 ,  0.3199,
        0.0867,  0.3192,  0.5607,  0.033 , -0.9438, -0.5811,  0.1127,
        0.0061, -0.7935,  0.7037,  0.5969,  0.605 , -0.2286, -0.2647,
        0.0452,  0.5812,  0.2676, -0.4724,  0.2936, -0.2834, -0.2282,
       -0.5953,  1.0845,  0.2154,  0.5789,  1.5825,  0.1532, -1.3246,
        0.4259, -0.2483,  1.3285,  0.4874,  0.1711,  0.7304,  0.5175,
       -0.5017,  0.2325, -0.3318, -0.3177,  0.3471,  0.9589,  1.5972,
        0.7646, -0.1559, -0.1355, -0.9765, -0.2955,  0.0973, -0.1711,
        0.1769, -1.1941,  0.4109,  1.0578,  0.5555,  0.0343, -0.186 ,
       -1.7366,  0.227 ,  1.0213,  0.8021, -0.0174, -0.4557, -0.1136,
        0.0321, -0.3708,  0.2216, -0.003 ,  0.2329,  0.1698, -1.0727,
       -0.1842,  0.4

In [14]:
from scipy.spatial.distance import cosine as dist

In [15]:
# Cosine distance between word pairs: smaller values mean the two words are
# closer together, larger values mean they are further apart.
for first_word, second_word in (
    ('drug', 'medicine'),
    ('queen', 'princess'),
    ('drug', 'medication'),
):
    print(dist(vecs[wordidx[first_word]], vecs[wordidx[second_word]]))

0.5097289383411407
0.20527541637420654
0.3426266312599182


In [18]:
import plotly
import plotly.graph_objs as go
from IPython.display import IFrame

In [19]:
def plotly_3d(Y, cat_labels, words=None, group_size=5):
    """Plot grouped points as a 3-D scatter, one trace per category.

    Rows of ``Y`` are expected to be ordered by category: the first
    ``group_size`` rows belong to ``cat_labels[0]``, the next ``group_size``
    rows to ``cat_labels[1]``, and so on. Rows beyond
    ``len(cat_labels) * group_size`` are ignored.

    Parameters
    ----------
    Y : 2-D array supporting ``Y[a:b, k]`` indexing
        Point coordinates; columns 0, 1 and 2 are used as x, y and z.
    cat_labels : sequence of str
        Legend name of each group, in row order.
    words : sequence of str, optional
        Hover text for each row, in the same order as ``Y``. When omitted the
        notebook-level ``my_words`` list is used, which is what this function
        read before the parameter existed.
    group_size : int, optional
        Number of consecutive rows per category (default 5).

    Returns
    -------
    None
        The figure is handed to ``plotly.offline.plot`` with its default
        arguments (the notebook embeds 'temp-plot.html' in the next cell).
    """
    if words is None:
        # Backward compatible default: resolved at call time, so the global
        # only has to exist once the function is actually called.
        words = my_words

    traces = []
    for i, label in enumerate(cat_labels):
        rows = slice(i * group_size, (i + 1) * group_size)
        # Grey level of the marker outline grows with the category index;
        # clamp it because colour channels top out at 255.
        shade = str(min(i * 40, 255))
        traces.append(go.Scatter3d(
            x=Y[rows, 0],
            y=Y[rows, 1],
            z=Y[rows, 2],
            mode='markers',
            marker=dict(
                size=8,
                line=dict(
                    color='rgba(' + shade + ',' + shade + ',' + shade + ', 0.14)',
                    width=0.5
                ),
                opacity=0.8
            ),
            text=words[rows],
            name=label
        ))

    # Zero margins so the plot fills the whole frame.
    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )

    plotly.offline.plot({
        "data": traces,
        "layout": layout
    })

In [20]:
def plotly_2d(Y, cat_labels, words=None, group_size=5):
    """Plot grouped points as a 2-D scatter, one trace per category.

    Rows of ``Y`` are expected to be ordered by category: the first
    ``group_size`` rows belong to ``cat_labels[0]``, the next ``group_size``
    rows to ``cat_labels[1]``, and so on. Rows beyond
    ``len(cat_labels) * group_size`` are ignored.

    Parameters
    ----------
    Y : 2-D array supporting ``Y[a:b, k]`` indexing
        Point coordinates; columns 0 and 1 are used as x and y.
    cat_labels : sequence of str
        Legend name of each group, in row order.
    words : sequence of str, optional
        Hover text for each row, in the same order as ``Y``. When omitted the
        notebook-level ``my_words`` list is used, which is what this function
        read before the parameter existed.
    group_size : int, optional
        Number of consecutive rows per category (default 5).

    Returns
    -------
    None
        The figure is handed to ``plotly.offline.plot`` with its default
        arguments.
    """
    if words is None:
        # Backward compatible default: resolved at call time, so the global
        # only has to exist once the function is actually called.
        words = my_words

    traces = []
    for i, label in enumerate(cat_labels):
        rows = slice(i * group_size, (i + 1) * group_size)
        # Grey level of the marker outline grows with the category index;
        # clamp it because colour channels top out at 255.
        shade = str(min(i * 40, 255))
        traces.append(go.Scatter(
            x=Y[rows, 0],
            y=Y[rows, 1],
            mode='markers',
            marker=dict(
                size=8,
                line=dict(
                    color='rgba(' + shade + ',' + shade + ',' + shade + ', 0.14)',
                    width=0.5
                ),
                opacity=0.8
            ),
            text=words[rows],
            name=label
        ))

    # Zero margins so the plot fills the whole frame.
    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )

    plotly.offline.plot({
        "data": traces,
        "layout": layout
    })

In [21]:
# Legend labels, one per group of five entries in `my_words` (same order).
categories = [
    "bugs",
    "music",
    "pleasant",
    "unpleasant",
    "science",
    "arts",
]

In [22]:
# Hand-picked example words, five per category, concatenated in the same
# order as `categories` (the plotting helpers slice this list in blocks of 5).
my_words = (
    ["maggot", "flea", "tarantula", "bedbug", "mosquito"]                  # bugs
    + ["violin", "cello", "flute", "harp", "mandolin"]                     # music
    + ["joy", "love", "peace", "pleasure", "wonderful"]                    # pleasant
    + ["agony", "terrible", "horrible", "nasty", "failure"]                # unpleasant
    + ["physics", "chemistry", "science", "technology", "engineering"]     # science
    + ["poetry", "art", "literature", "dance", "symphony"]                 # arts
)

In [23]:
# Vocabulary index of every hand-picked word, in `my_words` order.
word_indices = [wordidx[token] for token in my_words]
X = np.array(word_indices)

In [24]:
# Stack the hand-picked word vectors on top of the first 10000 rows of `vecs`.
# The picked words come first, so they occupy rows 0..len(my_words)-1.
picked_vecs = vecs[X]
leading_vecs = vecs[:10000, :]
embeddings = np.concatenate((picked_vecs, leading_vecs), axis=0)
embeddings.shape

(10030, 100)

### viewing the words in 3D
- TSNE or PCA

In [25]:
from sklearn import manifold

In [28]:
# Reduce the embeddings to 3 dimensions with t-SNE, using PCA initialisation
# and a fixed random_state so repeated runs start from the same seed.
tsne = manifold.TSNE(n_components=3, init='pca', random_state=0)
# The fit uses every row of `embeddings`, not just the hand-picked words.
Y = tsne.fit_transform(embeddings)
# plotly_3d reads rows in blocks of 5 per category, so only the first
# len(categories) * 5 rows are drawn -- the hand-picked words, which were
# placed first when `embeddings` was built.
plotly_3d(Y, categories)

In [29]:
IFrame('temp-plot.html', width=600, height=400)

In [31]:
from sklearn import decomposition

In [34]:
# PCA is fitted on the TRANSPOSED matrix, so each of the 3 components has one
# entry per row of `embeddings` (i.e. per word) instead of one per dimension.
# NOTE(review): the more common form is PCA(...).fit_transform(embeddings);
# confirm that fitting on embeddings.T is intentional.
pca = decomposition.PCA(n_components=3).fit(embeddings.T)
components = pca.components_
# Transpose back to one row per word and keep only the hand-picked words,
# which occupy the first len(my_words) rows of `embeddings`.
plotly_3d(components.T[:len(my_words), :], categories)

In [35]:
IFrame('temp-plot.html', width=600, height=400)

### Nearest neighbors

In [36]:
from sklearn.neighbors import NearestNeighbors

In [37]:
# Brute-force nearest-neighbour index over the full `vecs` matrix using cosine
# distance; queries return 10 neighbours unless told otherwise.
# NOTE(review): `radius` presumably only matters for radius-based queries,
# which the visible cells never issue -- confirm whether it is needed.
neigh = NearestNeighbors(n_neighbors=10, radius=0.5, metric='cosine', algorithm='brute')
neigh.fit(vecs)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=0.5)

In [40]:
# The query is passed as a one-element batch (hence the wrapping list); the
# results for this query are then read from distances[0] and indices[0].
distances, indices = neigh.kneighbors([vecs[wordidx['drug']]])

In [41]:
# Pair each neighbour's word with its cosine distance to the query vector.
# (`gap` is used for the loop variable so it does not reuse the name `dist`.)
[(words[int(row)], gap) for row, gap in zip(indices[0], distances[0])]

[('drug', 1.1920929e-07),
 ('drugs', 0.09482932),
 ('cocaine', 0.26914638),
 ('trafficking', 0.2765547),
 ('narcotics', 0.28780347),
 ('prescription', 0.31239307),
 ('traffickers', 0.31349838),
 ('heroin', 0.32361674),
 ('treatment', 0.32445478),
 ('marijuana', 0.33303165)]

#### combine words

In [42]:
# Combine two words by adding their embedding vectors element-wise.
drug_vec = vecs[wordidx['drug']]
medication_vec = vecs[wordidx['medication']]
new_vec = drug_vec + medication_vec
new_vec

array([ 1.6815, -0.5358, -0.192 ,  0.2729, -0.5727,  1.3718,  0.7215,
        0.307 , -0.2641, -0.1579, -0.3427, -0.0422, -1.0191,  0.9318,
        1.1623,  1.1909,  0.4329, -1.1958, -0.3869, -0.7594,  0.6601,
       -1.2179, -1.2232,  1.5385, -2.5321,  1.8371,  0.9078, -2.8833,
       -1.224 , -0.2556,  1.2389,  0.7767, -0.6275,  0.0336,  0.4029,
        1.4274, -0.3828, -0.085 , -0.1743,  0.6235,  0.0252,  1.0936,
        0.4326, -2.2592,  0.7658,  0.6213, -0.3143, -1.0698, -0.6511,
       -0.5507,  0.7336,  1.2444, -0.733 ,  1.5247,  0.6895, -1.9009,
       -1.1109, -0.4492,  2.8318,  0.0107,  0.9959,  0.6507,  1.2779,
       -0.4184,  0.1191,  0.5387,  1.1988, -0.843 ,  1.1415, -0.396 ,
       -0.5024, -1.514 , -0.3846,  1.7065, -0.0007, -0.4634, -1.136 ,
       -0.454 , -1.7857, -0.5474,  1.5766,  0.0763, -2.1095,  0.7399,
       -4.2753,  0.253 ,  0.4625,  0.1909, -3.2695, -0.6329, -0.5326,
       -0.4154,  0.0811, -0.5186,  0.5659,  0.977 ,  0.688 , -0.3712,
        1.2955,  0.2

In [43]:
# Query the index with the combined vector, again as a one-element batch.
# This overwrites `distances` / `indices` from the earlier 'drug' query.
distances, indices = neigh.kneighbors([new_vec])

In [44]:
# Words nearest to the combined vector, each with its cosine distance.
# (`gap` is used for the loop variable so it does not reuse the name `dist`.)
[(words[int(row)], gap) for row, gap in zip(indices[0], distances[0])]

[('drug', 0.07643634),
 ('drugs', 0.08592087),
 ('medication', 0.10392296),
 ('medications', 0.16097057),
 ('prescription', 0.20981598),
 ('treatment', 0.22952163),
 ('pills', 0.27385557),
 ('medicines', 0.29003102),
 ('prescribed', 0.2973038),
 ('patients', 0.30121684)]

### Movie Reviews Sentiment Analysis

- with IMDB dataset

In [None]:
from keras.datasets import imdb
from keras.utils.data_utils import get_file
idx = imdb.get_word_index()