In [1]:
#https://colab.research.google.com/notebooks/mlcc/intro_to_sparse_data_and_embeddings.ipynb?utm_source=mlcc&utm_campaign=colab-external&utm_medium=referral&utm_content=embeddings-colab&hl=en#scrollTo=jGWqDqFFL_NZ

### Word Embeddings, Bias in ML, Why You Don't Like Math, & Why AI Needs You by Rachel Thomas
- https://github.com/fastai/word-embeddings-workshop
- one-hot encoding disregards the notion of similarity between words  

- word2vec: not deep learning, although it can be used in deep learning, not an algorithm although algorithms were used to train it
    - training Word2vec takes lots of data, time, and computational power, so we use an already-trained version
- GloVe is a similar set of embeddings


In [2]:
import numpy as np
import re
import json

In [3]:
np.set_printoptions(precision=4, suppress=True)

In [4]:
vecs = np.load('./resource/glove_vectors_100d.npy')
vecs_50 = np.load('./resource/glove_vectors_50d.npy')

In [5]:
with open('./resource/words.txt', 'r') as f:
    content = f.readlines()
words = [x.strip() for x in content]
len(words)

400000

In [6]:
words[600:610]

['together',
 'congress',
 'index',
 'australia',
 'results',
 'hard',
 'hours',
 'land',
 'action',
 'higher']

In [7]:
# Load the word -> row-index mapping; the context manager closes the file
# handle as soon as the JSON has been parsed.
with open('./resource/wordsidx.txt') as f:
    wordidx = json.load(f)
type(wordidx)

dict

In [8]:
wordidx['hours']

606

In [9]:
wordidx['drug']

780

In [10]:
words[606]

'hours'

### Words as vectors

In [11]:
type(vecs)

numpy.ndarray

In [12]:
wordidx['intelligence']

1226

In [13]:
vecs[1226]

array([-0.311 , -0.4329,  0.7773, -0.3112,  0.0529, -0.8502, -0.3537,
       -0.7053,  0.0845,  0.8877,  0.8353, -0.4164,  0.367 ,  0.6083,
        0.0085,  0.9429,  0.5314, -0.7532, -0.8676,  0.3483, -0.2986,
       -0.4344,  0.3514, -1.1228, -1.2564, -0.0942,  0.294 ,  0.3199,
        0.0867,  0.3192,  0.5607,  0.033 , -0.9438, -0.5811,  0.1127,
        0.0061, -0.7935,  0.7037,  0.5969,  0.605 , -0.2286, -0.2647,
        0.0452,  0.5812,  0.2676, -0.4724,  0.2936, -0.2834, -0.2282,
       -0.5953,  1.0845,  0.2154,  0.5789,  1.5825,  0.1532, -1.3246,
        0.4259, -0.2483,  1.3285,  0.4874,  0.1711,  0.7304,  0.5175,
       -0.5017,  0.2325, -0.3318, -0.3177,  0.3471,  0.9589,  1.5972,
        0.7646, -0.1559, -0.1355, -0.9765, -0.2955,  0.0973, -0.1711,
        0.1769, -1.1941,  0.4109,  1.0578,  0.5555,  0.0343, -0.186 ,
       -1.7366,  0.227 ,  1.0213,  0.8021, -0.0174, -0.4557, -0.1136,
        0.0321, -0.3708,  0.2216, -0.003 ,  0.2329,  0.1698, -1.0727,
       -0.1842,  0.4

In [14]:
from scipy.spatial.distance import cosine as dist

In [15]:
#small numbers mean two words are closer together, larger numbers mean they are further apart
# distance between words

print(dist(vecs[wordidx['drug']], vecs[wordidx['medicine']]))
print(dist(vecs[wordidx['queen']], vecs[wordidx['princess']]))
print(dist(vecs[wordidx['drug']], vecs[wordidx['medication']]))

0.5097289383411407
0.20527541637420654
0.3426266312599182


In [16]:
import plotly
import plotly.graph_objs as go
from IPython.display import IFrame

In [17]:
def plotly_3d(Y, cat_labels, words=None, group_size=5):
    """Draw a 3-D scatter plot with one trace (legend entry) per category.

    The rows of ``Y`` are expected to be ordered in consecutive groups of
    ``group_size`` rows, one group per entry of ``cat_labels``.

    Args:
        Y: 2-D array; columns 0, 1 and 2 are used as x, y and z.
        cat_labels: iterable of category names, one trace each.
        words: hover text aligned row-for-row with ``Y``. When omitted, the
            notebook-level ``my_words`` list is used (the original behaviour).
        group_size: number of rows belonging to each category (default 5).

    The figure is written by ``plotly.offline.plot``; nothing is returned.
    """
    if words is None:
        # Looked up at call time, so my_words may be defined in a later cell.
        words = my_words

    traces = []
    for i, label in enumerate(cat_labels):
        rows = slice(i * group_size, (i + 1) * group_size)
        # Grey level of the marker outline; clamp so that more than six
        # categories cannot produce an out-of-range colour channel.
        shade = str(min(i * 40, 255))
        traces.append(go.Scatter3d(
            x=Y[rows, 0],
            y=Y[rows, 1],
            z=Y[rows, 2],
            mode='markers',
            marker=dict(
                size=8,
                line=dict(
                    color='rgba(' + shade + ',' + shade + ',' + shade + ', 0.14)',
                    width=0.5
                ),
                opacity=0.8
            ),
            text=words[rows],
            name=label
        ))

    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )

    plotly.offline.plot({
        "data": traces,
        "layout": layout
    })

In [18]:
def plotly_2d(Y, cat_labels, words=None, group_size=5):
    """Draw a 2-D scatter plot with one trace (legend entry) per category.

    The rows of ``Y`` are expected to be ordered in consecutive groups of
    ``group_size`` rows, one group per entry of ``cat_labels``.

    Args:
        Y: 2-D array; columns 0 and 1 are used as x and y.
        cat_labels: iterable of category names, one trace each.
        words: hover text aligned row-for-row with ``Y``. When omitted, the
            notebook-level ``my_words`` list is used (the original behaviour).
        group_size: number of rows belonging to each category (default 5).

    The figure is written by ``plotly.offline.plot``; nothing is returned.
    """
    if words is None:
        # Looked up at call time, so my_words may be defined in a later cell.
        words = my_words

    traces = []
    for i, label in enumerate(cat_labels):
        rows = slice(i * group_size, (i + 1) * group_size)
        # Grey level of the marker outline; clamp so that more than six
        # categories cannot produce an out-of-range colour channel.
        shade = str(min(i * 40, 255))
        traces.append(go.Scatter(
            x=Y[rows, 0],
            y=Y[rows, 1],
            mode='markers',
            marker=dict(
                size=8,
                line=dict(
                    color='rgba(' + shade + ',' + shade + ',' + shade + ', 0.14)',
                    width=0.5
                ),
                opacity=0.8
            ),
            text=words[rows],
            name=label
        ))

    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )

    plotly.offline.plot({
        "data": traces,
        "layout": layout
    })

In [19]:
categories = [
              "bugs", "music", 
              "pleasant", "unpleasant", 
              "science", "arts"
             ]

In [20]:
my_words = [
            "maggot", "flea", "tarantula", "bedbug", "mosquito", 
            "violin", "cello", "flute", "harp", "mandolin",
            "joy", "love", "peace", "pleasure", "wonderful",
            "agony", "terrible", "horrible", "nasty", "failure", 
            "physics", "chemistry", "science", "technology", "engineering",
            "poetry", "art", "literature", "dance", "symphony",
           ]

In [21]:
X = np.array([wordidx[word] for word in my_words])

In [22]:
embeddings = np.concatenate((vecs[X], vecs[:10000,:]), axis=0); embeddings.shape

(10030, 100)

### viewing the words in 3D
- TSNE or PCA

In [23]:
from sklearn import manifold

In [24]:
tsne = manifold.TSNE(n_components=3, init='pca', random_state=0)
Y = tsne.fit_transform(embeddings)
plotly_3d(Y, categories)

In [25]:
IFrame('temp-plot.html', width=600, height=400)

In [26]:
from sklearn import decomposition

In [27]:
pca = decomposition.PCA(n_components=3).fit(embeddings.T)
components = pca.components_
plotly_3d(components.T[:len(my_words), :], categories)

In [28]:
IFrame('temp-plot.html', width=600, height=400)

### Nearest neighbors

In [29]:
from sklearn.neighbors import NearestNeighbors

In [30]:
neigh = NearestNeighbors(n_neighbors=10, radius=0.5, metric='cosine', algorithm='brute')
neigh.fit(vecs)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=0.5)

In [31]:
distances, indices = neigh.kneighbors([vecs[wordidx['drug']]])

In [32]:
[(words[int(ind)], dist) for ind, dist in zip(list(indices[0]), list(distances[0]))]

[('drug', 1.1920929e-07),
 ('drugs', 0.09482932),
 ('cocaine', 0.26914638),
 ('trafficking', 0.2765547),
 ('narcotics', 0.28780347),
 ('prescription', 0.31239307),
 ('traffickers', 0.31349838),
 ('heroin', 0.32361674),
 ('treatment', 0.32445478),
 ('marijuana', 0.33303165)]

#### combine words

In [33]:
new_vec = vecs[wordidx['drug']] + vecs[wordidx['medication']]
new_vec

array([ 1.6815, -0.5358, -0.192 ,  0.2729, -0.5727,  1.3718,  0.7215,
        0.307 , -0.2641, -0.1579, -0.3427, -0.0422, -1.0191,  0.9318,
        1.1623,  1.1909,  0.4329, -1.1958, -0.3869, -0.7594,  0.6601,
       -1.2179, -1.2232,  1.5385, -2.5321,  1.8371,  0.9078, -2.8833,
       -1.224 , -0.2556,  1.2389,  0.7767, -0.6275,  0.0336,  0.4029,
        1.4274, -0.3828, -0.085 , -0.1743,  0.6235,  0.0252,  1.0936,
        0.4326, -2.2592,  0.7658,  0.6213, -0.3143, -1.0698, -0.6511,
       -0.5507,  0.7336,  1.2444, -0.733 ,  1.5247,  0.6895, -1.9009,
       -1.1109, -0.4492,  2.8318,  0.0107,  0.9959,  0.6507,  1.2779,
       -0.4184,  0.1191,  0.5387,  1.1988, -0.843 ,  1.1415, -0.396 ,
       -0.5024, -1.514 , -0.3846,  1.7065, -0.0007, -0.4634, -1.136 ,
       -0.454 , -1.7857, -0.5474,  1.5766,  0.0763, -2.1095,  0.7399,
       -4.2753,  0.253 ,  0.4625,  0.1909, -3.2695, -0.6329, -0.5326,
       -0.4154,  0.0811, -0.5186,  0.5659,  0.977 ,  0.688 , -0.3712,
        1.2955,  0.2

In [34]:
distances, indices = neigh.kneighbors([new_vec])

In [35]:
[(words[int(ind)], dist) for ind, dist in zip(list(indices[0]), list(distances[0]))]

[('drug', 0.07643634),
 ('drugs', 0.08592087),
 ('medication', 0.10392296),
 ('medications', 0.16097057),
 ('prescription', 0.20981598),
 ('treatment', 0.22952163),
 ('pills', 0.27385557),
 ('medicines', 0.29003102),
 ('prescribed', 0.2973038),
 ('patients', 0.30121684)]

### Movie Reviews Sentiment Analysis

- with IMDB dataset

In [36]:
import pickle
from keras.datasets import imdb
from keras.utils.data_utils import get_file
idx = imdb.get_word_index()


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.

compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6



In [37]:
type(idx)

dict

In [38]:
import keras.backend as K

def limit_mem():
    """Swap the current Keras session for one whose GPU memory grows on demand.

    Closes the session returned by ``K.get_session()`` and installs a new
    TensorFlow session configured with ``gpu_options.allow_growth = True``.
    """
    K.get_session().close()

    session_config = K.tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    # Optional hard cap instead of on-demand growth:
    #     session_config.gpu_options.per_process_gpu_memory_fraction = 0.6

    new_session = K.tf.Session(config=session_config)
    K.set_session(new_session)
    
limit_mem()

In [39]:
idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [40]:
idx2word = {v: k for k, v in idx.items()}

Download the dataset (see https://keras.io/datasets/ for details).

In [41]:
# Fetch (or reuse the cached copy of) the full IMDB pickle.
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')

# NOTE(review): pickle.load can execute arbitrary code contained in the file.
# It is used here only on the archive fetched above with a pinned md5_hash;
# do not point this at untrusted files.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [42]:
len(x_train)

25000

In [43]:
#check first review
', '.join(map(str, x_train[0]))

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

In [44]:
idx2word[23022]

'bromwell'

In [45]:
' '.join(idx2word[o] for o in x_train[0])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [46]:
#labels are 1 for positive and 0 for negative
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [47]:
#reduce vocab size by setting rare words to max index

vocab_size = 5000

# Every word id at or above the last vocabulary slot is collapsed into that
# slot (vocab_size - 1), which acts as the shared "rare word" id.
rare_id = vocab_size - 1
trn = [np.minimum(np.array(review), rare_id) for review in x_train]
test = [np.minimum(np.array(review), rare_id) for review in x_test]

In [48]:
trn[:10]

[array([4999,  309,    6,    3, 1069,  209,    9, 2175,   30,    1,  169,
          55,   14,   46,   82, 4999,   41,  393,  110,  138,   14, 4999,
          58, 4477,  150,    8,    1, 4999, 4999,  482,   69,    5,  261,
          12, 4999, 4999, 2003,    6,   73, 2436,    5,  632,   71,    6,
        4999,    1, 4999,    5, 2004, 4999,    1, 4999, 1534,   34,   67,
          64,  205,  140,   65, 1232, 4999, 4999,    1, 4999,    4,    1,
         223,  901,   29, 3024,   69,    4,    1, 4999,   10,  694,    2,
          65, 1534,   51,   10,  216,    1,  387,    8,   60,    3, 1472,
        3724,  802,    5, 3521,  177,    1,  393,   10, 1238, 4999,   30,
         309,    3,  353,  344, 2989,  143,  130,    5, 4999,   28,    4,
         126, 4999, 1472, 2375,    5, 4999,  309,   10,  532,   12,  108,
        1470,    4,   58,  556,  101,   12, 4999,  309,    6,  227, 4187,
          48,    3, 2237,   12,    9,  215]),
 array([4999,   39, 4999,   14,  739, 4999, 3428,   44,   74,   32

In [49]:
lens = np.array([len(review) for review in trn])

In [50]:
(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)

In [51]:
#pad with zeros or truncate each sentence to make consistent length
from keras.preprocessing import sequence

In [52]:
seq_len = 500

trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [53]:
# reviews shorter than 500 words are pre-padded with zeros; longer ones are truncated
trn.shape

(25000, 500)

#### Create a Model

- 1D CNN since a sequence of words is 1D

In [54]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import SpatialDropout1D
from keras.optimizers import Adam

In [55]:
conv1 = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len),
    SpatialDropout1D(0.4),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [56]:
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [57]:
conv1.fit(trn, labels_train, validation_data=(test, labels_test), epochs=4, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x153e95470>

In [58]:
# once close to the answer, take smaller steps

In [59]:
# Update the optimizer's learning-rate variable in place. Plain assignment
# (optimizer.lr = 1e-4) only rebinds the Python attribute, so a training
# function already built by the previous fit() would keep the old rate.
K.set_value(conv1.optimizer.lr, 1e-4)

In [60]:
conv1.fit(trn, labels_train, validation_data=(test, labels_test), epochs=4, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x153c53630>

### Use GloVe word embedding
- using the 50-dimensional version of the vectors

In [61]:
# GloVe and IMDB use different word indexes, so rows are matched by the word itself
def create_emb():
    """Build the initial embedding matrix for the IMDB vocabulary from GloVe.

    Reads the notebook-level globals ``vecs_50`` (GloVe vectors),
    ``vocab_size``, ``idx2word`` (IMDB id -> word) and ``wordidx``
    (word -> GloVe row).

    Row layout of the result:
      * row 0 is left as zeros (0 is the padding value used for the sequences);
      * rows 1 .. vocab_size-2 hold the GloVe vector of the corresponding
        IMDB word, or a random normal vector (scale 0.6) when the word is
        missing from ``idx2word``, contains characters other than letters,
        digits and hyphens, or is not present in ``wordidx``;
      * the last row (the shared "rare word" id) is always random.

    The whole matrix is divided by 3 before being returned.

    Returns:
        numpy.ndarray of shape ``(vocab_size, vecs_50.shape[1])``.
    """
    n_fact = vecs_50.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    plain_token = re.compile(r"^[a-zA-Z0-9\-]*$")

    for i in range(1, len(emb)):
        word = idx2word.get(i)
        glove_row = None
        if word and plain_token.match(word):
            # .get() instead of [] so a word unknown to GloVe falls through to
            # the random initialisation below rather than raising KeyError.
            glove_row = wordidx.get(word)

        if glove_row is not None:
            emb[i] = vecs_50[glove_row]
        else:
            # if we can't find the word in GloVe, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # rare words share the last index and always start from a random vector
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb

In [62]:
emb = create_emb()

In [63]:
# Same 1-D CNN as conv1, but the embedding layer starts from the GloVe-derived
# matrix `emb`. NOTE: trainable=False is not passed, so the embedding weights
# are NOT frozen and get fine-tuned during training; add trainable=False to
# the Embedding layer if frozen GloVe vectors are intended.
model = Sequential([
    # Width is taken from emb so the layer always matches the supplied weights.
    Embedding(vocab_size, emb.shape[1], input_length=seq_len, weights=[emb]),
    SpatialDropout1D(0.4),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [64]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [65]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=4, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x160128860>

In [66]:
# Update the optimizer's learning-rate variable in place. Plain assignment
# (optimizer.lr = 1e-4) only rebinds the Python attribute, so a training
# function already built by the previous fit() would keep the old rate.
K.set_value(model.optimizer.lr, 1e-4)

In [67]:
# `nb_epoch` was renamed to `epochs` in Keras 2; use the current keyword.
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=4, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
   64/25000 [..............................] - ETA: 62s - loss: 0.1673 - acc: 0.9375


The `nb_epoch` argument in `fit` has been renamed `epochs`.



Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x160128668>