[View in Colaboratory](https://colab.research.google.com/github/inpaner/nlp-ml/blob/master/Word_embedding_classification.ipynb)

# Word Embeddings

In [0]:
import pickle
import numpy as np
import re
import json
np.set_printoptions(precision=4, suppress=True)

In [0]:
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -p glove.6B.zip glove.6B.50d.txt > glove.6B.50d.txt

In [0]:
import codecs
def load_embeddings(file_name, dimensions):
  """Load whitespace-separated word embeddings from a text file.

  Each usable line has the form ``<word> <v1> ... <v_dimensions>``.
  Lines with a different number of fields, or whose values cannot be
  parsed as floats, are skipped.

  Args:
    file_name: path of the embedding text file (read as UTF-8,
      undecodable bytes are ignored).
    dimensions: number of vector components expected on every line.

  Returns:
    A tuple ``(words, np_vecs, wordidx)`` where ``words`` is the list of
    loaded words in file order, ``np_vecs`` is a float32 array of shape
    ``(len(words), dimensions)`` and ``wordidx`` maps each word to its
    row index (the last occurrence wins for repeated words).
  """
  words = []
  vecs = []

  with codecs.open(file_name, 'r', 'utf-8', errors='ignore') as f:
    for count, line in enumerate(f):
      values = line.split()
      if count % 100000 == 0 and count != 0:
        print('loaded {} embeddings'.format(count))

      # One field for the word plus `dimensions` numeric fields; this also
      # rejects blank lines and lines that were split mid-record.
      if len(values) != dimensions + 1:
        continue

      try:
        vec = np.asarray(values[1:], dtype=np.float32)
      except ValueError:
        # A component is not a valid float: skip the malformed row.
        continue

      words.append(values[0])
      vecs.append(vec)

  print(len(words), len(vecs))
  assert len(words) == len(vecs)
  # np.stack rejects an empty list, so return an empty matrix of the
  # right width when nothing could be loaded.
  if vecs:
    np_vecs = np.stack(vecs)
  else:
    np_vecs = np.empty((0, dimensions), dtype=np.float32)
  wordidx = {o: i for i, o in enumerate(words)}
  return words, np_vecs, wordidx

In [0]:
words, vecs, wordidx = load_embeddings('glove.6B.50d.txt', 50)

In [0]:
len(words)

In [0]:
words[:10]

In [0]:
words[600:610]

In [0]:
wordidx['cow']

In [0]:
words[6472]

In [0]:
vecs[6472]

In [0]:
# Lookup table from each loaded word to its embedding row; when a word
# appears more than once, the later row replaces the earlier one.
word_to_vec = {token: embedding for token, embedding in zip(words, vecs)}

In [0]:
word_to_vec['cow']

In [0]:
from scipy.spatial.distance import cosine as dist

In [0]:
dist(word_to_vec['president'], word_to_vec['obama'])

In [0]:
dist(word_to_vec['president'], word_to_vec['trump'])

In [0]:
dist(word_to_vec['man'], word_to_vec['genius'])

In [0]:
dist(word_to_vec['woman'], word_to_vec['genius'])

In [0]:
from sklearn.neighbors import NearestNeighbors
neighbors = NearestNeighbors(n_neighbors=10, radius=0.5, metric='cosine', algorithm='brute')
neighbors.fit(vecs)

In [0]:
def find_nearest(vector):
  """Return ``(word, distance)`` pairs for the neighbours of ``vector``.

  Queries the module-level ``neighbors`` index with a single vector and
  translates every returned row index into its word via ``words``. The
  pairs are returned in the order produced by ``kneighbors``.
  """
  distances, indices = neighbors.kneighbors([vector])
  # kneighbors returns one row per query; only one query was submitted.
  nearest = []
  for row, gap in zip(indices[0], distances[0]):
    nearest.append((words[int(row)], gap))
  return nearest

In [0]:
find_nearest(word_to_vec["frog"])

In [0]:
find_nearest(word_to_vec["artificial"])

In [0]:
find_nearest(word_to_vec["intelligence"])

In [0]:
new_vec = (word_to_vec['artificial'] + word_to_vec['intelligence'])/2
find_nearest(new_vec)

In [0]:
find_nearest(word_to_vec['king'])

In [0]:
new_vec = word_to_vec['king'] - word_to_vec['man'] + word_to_vec['woman']
find_nearest(new_vec)

In [0]:
new_vec = word_to_vec['philippines'] - word_to_vec['manila'] + word_to_vec['paris']
find_nearest(new_vec)

# Load data

In [0]:
from keras.datasets import imdb
from keras.utils.data_utils import get_file
idx = imdb.get_word_index()

In [0]:
import keras.backend as K

def limit_mem():
    """Replace the Keras backend session with one that has GPU memory limits.

    Closes the session currently held by the backend, then installs a new
    session whose GPU options enable ``allow_growth`` and set
    ``per_process_gpu_memory_fraction`` to 0.6.
    """
    K.get_session().close()

    tf = K.tf
    session_config = tf.ConfigProto()
    gpu_options = session_config.gpu_options
    gpu_options.allow_growth = True
    gpu_options.per_process_gpu_memory_fraction = 0.6

    K.set_session(tf.Session(config=session_config))
    
limit_mem()

In [0]:
# Keys of `idx` (the words) ordered by their mapped value (the integer id).
idx_arr = sorted(idx, key=idx.get)
# Inverse mapping: integer id -> word.
idx2word = {v: k for k, v in idx.items()}

In [0]:
# Download (or reuse the cached copy of) the pre-tokenised IMDB dataset.
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(security): pickle.load can execute arbitrary code from the file it
# reads. The md5_hash passed to get_file above is the only integrity check
# visible here, so only run this against a source you trust.
# The context manager closes the file handle once the data is unpickled.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [0]:
vocab_size = 5000


def _cap_token_ids(reviews):
    """Map every token id at or above ``vocab_size - 1`` onto ``vocab_size - 1``.

    Returns one NumPy array per review, in the original order.
    """
    last_id = vocab_size - 1
    return [np.array([min(token, last_id) for token in review])
            for review in reviews]


trn = _cap_token_ids(x_train)
test = _cap_token_ids(x_test)

In [0]:
lens = np.array([len(review) for review in trn])

In [0]:
(lens.max(), lens.min(), lens.mean())

In [0]:
from keras.preprocessing import sequence


# Target length handed to pad_sequences as `maxlen`; 0 is the fill value.
seq_len = 500
# NOTE: `trn` and `test` are overwritten in place, so re-running this cell
# operates on the already-padded arrays rather than the original lists.
trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

# Neural network

In [0]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten, Dense, Dropout, SpatialDropout1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import Adam

In [0]:
def create_emb():
    """Build the initial embedding matrix for the IMDB vocabulary.

    Row ``i`` holds the pre-trained vector of the word with IMDB id ``i``
    when that word consists only of letters, digits and hyphens and is
    present in ``wordidx``. Every other row in ``1..vocab_size-1`` is drawn
    from a normal distribution with scale 0.6. Row 0 stays all zeros, and
    the last row is always random. The whole matrix is divided by 3 before
    it is returned.

    Returns:
        A float array of shape ``(vocab_size, vecs.shape[1])``.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1, len(emb)):
        # .get: an id with no word falls through to random initialisation
        # instead of raising KeyError.
        word = idx2word.get(i)
        # The membership test guards against words that pass the regex but
        # have no pre-trained vector.
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            emb[i] = vecs[wordidx[word]]
        else:
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # The last id is overwritten with a random vector regardless of the
    # word it maps to.
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb

In [0]:
emb = create_emb()

In [0]:
# Fully connected baseline on top of the embedding layer.
model = Sequential([
    # Embedding weights start from the matrix built by create_emb().
    Embedding(vocab_size, 50, input_length=seq_len, weights=[emb]),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    # Single sigmoid unit; compiled below with binary cross-entropy.
    Dense(1, activation='sigmoid')])

In [0]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])


In [0]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=32)

In [0]:
# Convolutional variant; rebinding `model` replaces the dense network
# defined in the earlier cell.
model = Sequential([
    # Embedding weights start from the matrix built by create_emb().
    Embedding(vocab_size, 50, input_length=seq_len, weights=[emb]),
    SpatialDropout1D(0.4),
    # 64 filters with kernel size 5; 'same' padding.
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# Same data and epoch count as the dense model above, but batch_size=64.
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64)