<a href="https://colab.research.google.com/github/janlucasdeinhard/pyfiles/blob/master/12_PROJECT_NLP/KnowledgeEmbedding/Knowledge_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Outline

This notebook should construct an effective knowledge embedding. It should take in string representations of knowledge of arbitrary length and output a vector representation of that knowledge with a fixed number of dimensions. The vector representation should also be structured in a useful way!

In [None]:
# Imports

import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import warnings,os,sys,time,datetime,itertools,math

import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd

import plotly.express as px

In [None]:
from gensim.models import Word2Vec,FastText
from gensim.test.utils import common_texts

In [None]:
import nltk

In [None]:
class FastTextModel():
  """Small wrapper around gensim's FastText that encodes a sentence word by word.

  Usage: create the wrapper, call train() with a list of tokenised sentences,
  then call encode() with a space-separated sentence string.
  """
  def __init__(self,embedding_dim,verbose=True):
    """Store the configuration; no model is trained until train() is called.

    Args:
      embedding_dim: Length of every word vector produced by encode().
      verbose: If True, print a status message after construction and training.
    """
    self.embedding_dim = embedding_dim  # Select embedding dimension
    self.DEFAULT_WORD = np.zeros(self.embedding_dim)  # Fallback vector for words the model can't encode (no known n-grams)
    self.original_training_data = []
    self.model = None  # Set by train(); encode() refuses to run while this is None
    self.verbose = verbose
    if self.verbose: print('Model created with embedding dimension {0}, please provide training corpus using model.train(<list of training sentences>)!'.format(self.embedding_dim))
    return
  def train(self,data):
    """Train a FastText model (window=3, min_count=1) on the given corpus.

    Args:
      data: List of training sentences, each a list of string tokens.
    """
    import inspect
    # gensim 4 renamed the `size` keyword to `vector_size`; pick whichever the
    # installed FastText constructor actually accepts so both versions work.
    ctor_params = inspect.signature(FastText.__init__).parameters
    dim_kwarg = 'vector_size' if 'vector_size' in ctor_params else 'size'
    # Train the model
    self.model = FastText(window=3,min_count=1,sentences=data,**{dim_kwarg: self.embedding_dim})
    # Store training data for review
    self.original_training_data = data
    if self.verbose: print('Model trained, use model.encode(<sentence>) to encode any sentence into dimension {0}'.format(self.embedding_dim))
    return
  def encode(self,sentence):
    """Encode every space-separated word of `sentence` into a vector.

    Args:
      sentence: String that is split on single spaces (' ') into words.

    Returns:
      Tuple (sent_enc_np, sent_encoding): a numpy array with one row per word,
      and a pandas DataFrame holding the same rows plus a 'Word' column.

    Raises:
      RuntimeError: If train() has not been called yet.
    """
    # Without this guard an untrained model would silently yield all-zero vectors.
    if getattr(self,'model',None) is None:
      raise RuntimeError('Model has not been trained yet, call model.train(<list of training sentences>) first!')
    # Iterate over the words of the sentence
    sent_enc = []
    sent_log = []
    for c_word in sentence.split(' '):
      try:
        # Encode word using model
        c_word_enc = self.model.wv[c_word]
      except KeyError:
        # Couldn't encode word using model (lookup failed), fall back to the default vector
        c_word_enc = self.DEFAULT_WORD
      sent_enc.append(c_word_enc)
      sent_log.append(c_word)
    # Turn into numpy array
    sent_enc_np = np.array(sent_enc)
    sent_encoding = pd.DataFrame(sent_enc)
    sent_encoding['Word'] = sent_log
    return sent_enc_np,sent_encoding

In [None]:
# Download the NLTK 'gutenberg' corpus package; the corpus texts are read via nltk.corpus.gutenberg.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [None]:
# Read the Gutenberg text 'carroll-alice.txt', lower-case it and cut it at blank lines.
training_corpus_raw = nltk.corpus.gutenberg.raw('carroll-alice.txt').lower()
training_sents = training_corpus_raw.split('\n\n')

# Tokenise every chunk on single spaces, giving a list of token lists.
training_data = [c_sent.split(' ') for c_sent in training_sents]

In [None]:
# Create a 7-dimensional FastText wrapper with status messages switched off, then fit it on the tokenised training data.
model = FastTextModel(embedding_dim=7,verbose=False)
model.train(training_data)

In [None]:
# Read the Gutenberg text 'bible-kjv.txt' as test data: lower-cased, then split at blank lines.
test_corpus_raw = nltk.corpus.gutenberg.raw('bible-kjv.txt').lower()
test_sents = test_corpus_raw.split('\n\n')

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Glue the first 50 test chunks together into one space-separated string.
c_sentence = ' '.join(test_sents[:50])

In [None]:
# X: numpy array with one embedding row per word; df: the same rows as a DataFrame plus a 'Word' column.
X,df = model.encode(c_sentence)

In [None]:
# Relabel the seven integer-named embedding columns: 0 -> 'X[0]', ..., 6 -> 'X[6]'.
df = df.rename(columns={i: 'X[{0}]'.format(i) for i in range(7)})

In [None]:
# Project the first 7 columns of df (the embedding dimensions) down to 2-D with t-SNE.
# random_state is pinned because init='random' otherwise produces a different layout on every run.
X_tsne = TSNE(n_components=2, learning_rate='auto',init='random',random_state=0).fit_transform(df.values[:,:7])

In [None]:
# fit_transform returns an array of shape (n_samples, 2): each t-SNE component is a COLUMN.
# Index the second axis so every word gets its own coordinate (X_tsne[0,:] would only be the first word's two values).
df['TSNE(X,0)'] = X_tsne[:,0]
df['TSNE(X,1)'] = X_tsne[:,1]

In [None]:
# Interactive scatter of the two t-SNE coordinates; hovering a point shows its word.
px.scatter(df, x='TSNE(X,0)', y='TSNE(X,1)', hover_name='Word')

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA reducer configured to keep 2 components.
pca = PCA(n_components=2)

In [None]:
# Fit the PCA on the first 7 columns of df (the embedding dimensions) and project them onto its components.
X_pca = pca.fit_transform(df.values[:,:7])

In [None]:
# Store the two principal-component coordinates of every word as new columns.
df['PCA(X,0)'], df['PCA(X,1)'] = X_pca[:,0], X_pca[:,1]

In [None]:
# Interactive scatter of the two PCA coordinates; hovering a point shows its word.
px.scatter(df, x='PCA(X,0)', y='PCA(X,1)', hover_name='Word')

# Summary

The Word2Vec model can't encode words it has not seen during training, whereas Facebook's FastText model can do exactly that, because it builds word vectors from character n-grams. The FastText model is available as `gensim.models.FastText` and is easily trained.