There are two main classes of methods for generating *Word Embeddings*:

  * Frequency based methods
  * Prediction based methods

In [None]:
# Importing libraries

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np

#### Frequency Based Methods:
  1. Count Vectors (Bag of Words)
  2. TF-IDF Vectors
  3. Co-Occurrence Vector

Count Vectors

In [None]:
# Initialising the corpus: three short movie reviews
corpus = ['This movie is very Scary and long',
          'This movie is not scary and is slow',
          'This movie is spooky and good'
          ]

# Defining the method to generate 1-gram BoW tokens.
# lowercase=True folds 'Scary' and 'scary' into a single feature.
vectorizer = CountVectorizer(lowercase = True, ngram_range = (1,1))

X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns a NumPy array, so .tolist() is used to keep
# printing a plain Python list of words.
print(vectorizer.get_feature_names_out().tolist())

# One row per document, one column per vocabulary word (same order as above)
print(X.toarray())


['and', 'good', 'is', 'long', 'movie', 'not', 'scary', 'slow', 'spooky', 'this', 'very']
[[1 0 1 1 1 0 1 0 0 1 1]
 [1 0 2 0 1 1 1 1 0 1 0]
 [1 1 1 0 1 0 0 0 1 1 0]]


In [None]:
# Bi-gram Bag-of-Words model: every feature is a pair of adjacent words

vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2), lowercase = True)
X2 = vectorizer2.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() keeps the printed form a plain Python list.
print(vectorizer2.get_feature_names_out().tolist())
print(X2.toarray())

['and good', 'and is', 'and long', 'is not', 'is slow', 'is spooky', 'is very', 'movie is', 'not scary', 'scary and', 'spooky and', 'this movie', 'very scary']
[[0 0 1 0 0 0 1 1 0 1 0 1 1]
 [0 1 0 1 1 0 0 1 1 1 0 1 0]
 [1 0 0 0 0 1 0 1 0 0 1 1 0]]


TF-IDF Vectors

In [None]:
# Same three reviews as in the Count Vectors section, repeated so this cell
# can be run on its own.
corpus = ['This movie is very Scary and long',
          'This movie is not scary and is slow',
          'This movie is spooky and good'
          ]

# Uni-gram TF-IDF features; lowercase=True folds 'Scary' and 'scary' together
tfidf_vectorizer = TfidfVectorizer(lowercase = True, ngram_range= (1,1))

X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() keeps the printed form a plain Python list.
print(tfidf_vectorizer.get_feature_names_out().tolist())

# One row per document, one column per vocabulary word (same order as above)
print(X_tfidf.toarray())


['and', 'good', 'is', 'long', 'movie', 'not', 'scary', 'slow', 'spooky', 'this', 'very']
[[0.29628336 0.         0.29628336 0.50165133 0.29628336 0.
  0.38151877 0.         0.         0.29628336 0.50165133]
 [0.26359985 0.         0.5271997  0.         0.26359985 0.44631334
  0.3394328  0.44631334 0.         0.26359985 0.        ]
 [0.32052772 0.54270061 0.32052772 0.         0.32052772 0.
  0.         0.         0.54270061 0.32052772 0.        ]]


Co-Occurrence Matrix

In [None]:
import numpy as np
import nltk
from nltk import bigrams
import itertools
import pandas as pd
 
 
def generate_co_occurrence_matrix(corpus):
    """Build a directed bigram co-occurrence matrix from a flat token sequence.

    Every pair of adjacent tokens ``(previous, current)`` in ``corpus`` is
    counted, and the count is stored at ``matrix[current, previous]``.

    Parameters
    ----------
    corpus : iterable of hashable
        Tokens in reading order (e.g. a flat list of words). Adjacent
        elements are paired regardless of where they came from, so if
        several sentences were concatenated, the last word of one sentence
        is paired with the first word of the next.

    Returns
    -------
    co_occurrence_matrix : numpy.matrix
        Square float matrix with one row and one column per distinct token.
        Entry ``[i, j]`` is the number of times the token at position ``j``
        was immediately followed by the token at position ``i``.
    vocab_index : dict
        Maps each distinct token to its row/column position. Positions are
        assigned in order of first appearance in ``corpus``.
    """
    # Function-scope import keeps this cell self-contained.
    from collections import Counter

    tokens = list(corpus)

    # dict.fromkeys() de-duplicates while preserving first-seen order. Building
    # the vocabulary from a set() instead would make the row/column order depend
    # on string hashing, which can differ between kernel sessions.
    vocab_index = {word: i for i, word in enumerate(dict.fromkeys(tokens))}

    # Frequency of each adjacent pair: (previous, current) -> num_occurrences
    bigram_counts = Counter(zip(tokens, tokens[1:]))

    # Initialise co-occurrence matrix, indexed as [current][previous]
    co_occurrence_matrix = np.zeros((len(vocab_index), len(vocab_index)))

    for (previous, current), count in bigram_counts.items():
        co_occurrence_matrix[vocab_index[current], vocab_index[previous]] = count

    # np.matrix is kept so the return type is unchanged for existing callers.
    return np.matrix(co_occurrence_matrix), vocab_index

In [None]:
# Tokenised toy corpus: one list of words per sentence.
# A comma was missing between 'Python' and 'used' in the second sentence;
# Python silently concatenates adjacent string literals, which produced the
# bogus token 'Pythonused'.
corpus = [['Where', 'Python', 'is', 'used'],
          ['What', 'is', 'Python', 'used', 'in'],
          ['Why', 'Python', 'is', 'best'],
          ['What', 'companies', 'use', 'Python']]

# Flatten the sentences into a single token list. Adjacent words are paired
# across the whole list, so sentence-boundary pairs such as ('used', 'What')
# are counted as well.
data = list(itertools.chain.from_iterable(corpus))

matrix, vocab_index = generate_co_occurrence_matrix(data)

In [None]:
# Display the raw co-occurrence matrix (rows: current word, columns: previous word)
matrix

matrix([[0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [2., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
# Raise pandas' display limits to 100 rows and 100 columns
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

In [None]:
# Label both axes with the vocabulary words. The keys of vocab_index iterate
# in the same order as their matrix positions, so they line up with the
# rows and columns of `matrix`.
word_labels = list(vocab_index)
data_matrix = pd.DataFrame(matrix, index=word_labels, columns=word_labels)
print(data_matrix)

            Python  companies   is  use  best  Pythonused  Where  What   in  \
Python      0.0     0.0        0.0  1.0  0.0   0.0         1.0    0.0   0.0   
companies   0.0     0.0        0.0  0.0  0.0   0.0         0.0    1.0   0.0   
is          2.0     0.0        0.0  0.0  0.0   0.0         0.0    1.0   0.0   
use         0.0     1.0        0.0  0.0  0.0   0.0         0.0    0.0   0.0   
best        0.0     0.0        1.0  0.0  0.0   0.0         0.0    0.0   0.0   
Pythonused  0.0     0.0        1.0  0.0  0.0   0.0         0.0    0.0   0.0   
Where       0.0     0.0        0.0  0.0  0.0   0.0         0.0    0.0   0.0   
What        0.0     0.0        0.0  0.0  1.0   0.0         0.0    0.0   0.0   
in          0.0     0.0        0.0  0.0  0.0   1.0         0.0    0.0   0.0   
Why         0.0     0.0        0.0  0.0  0.0   0.0         0.0    0.0   1.0   
used        0.0     0.0        1.0  0.0  0.0   0.0         0.0    0.0   0.0   

            Why  used  
Python      1.0  0.0   
com