# Use tensorflow to visualize word embeddings

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
import gensim
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

## Helper functions

In [3]:
class LemmaTokenizer(object):
    def __init__(self):
         self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
         return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [16]:
def build_model(sentences, size):
    from nltk.corpus import stopwords
    corpus_words = []
    stopwords = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    for session in sentences:
        words = word_tokenize(session)
        session_words = []
        for word in words:
            regex_match = re.match(r"\b[a-zA-Z]+\b", word)
            if (regex_match is not None) & (len(word) > 1) & (word not in stopwords):
                session_words.append(wnl.lemmatize(word.lower()))
        corpus_words.append(session_words)
    model = gensim.models.Word2Vec(corpus_words, size=size, window=5, seed=42,
                                   min_count=2, workers=4)
    return model, corpus_words

In [5]:
def average_vectors(word_list, word2vec_dict, vec_length=5):
    avg_vector = np.zeros(vec_length)
    if len(word_list) == 0:
        return avg_vector
    for word in word_list:
        if word in word2vec_dict.keys():
            avg_vector = np.add(word2vec_dict[word], avg_vector)
    return np.divide(avg_vector, len(word_list))

## Create test and train sets

In [25]:
import math

In [7]:
categories = ['rec.motorcycles', 'rec.autos']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

## Build the word embeddings 

Process raw inputs into a datset

In [30]:
import collections

Determine the number of words in the corpus

In [None]:
n_words = 50000

In [33]:
words = newsgroups_train.data[0]

In [38]:
words.split()

['Stuff', 'deleted...']

In [34]:
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(n_words - 1))

In [35]:
count

[['UNK', -1],
 ('e', 3),
 ('.', 3),
 ('t', 2),
 ('f', 2),
 ('d', 2),
 ('S', 1),
 ('u', 1),
 (' ', 1),
 ('l', 1)]