### 1. Read in Data

In [39]:
import pandas as pd
import numpy as np
from scipy import spatial
import spacy 
from spacy.lang.en import English
import seaborn as sns
import re
import time
import matplotlib.pyplot as plt
import nltk
import os
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [43]:
# Make sure the NLTK stop-word corpus is available before the cleaning step
# uses it. The download is only attempted when the corpus cannot be found
# locally, so re-running this cell on a prepared machine needs no network.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [58]:
# # Download GloVe embeddings (one-off setup; the zip is saved to the current working directory)
# ! wget http://nlp.stanford.edu/data/glove.6B.zip

# ! unzip glove.6B.zip -d /home/jupyter/sb-entity-classification/data/glove.6B

In [20]:
# --- Vocabulary / embedding settings ---------------------------------------
# Width of each word vector; also selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 50
# Passed to the Keras Tokenizer as `num_words` and used as the upper bound on
# the number of rows in the embedding matrix.
MAX_NUM_WORDS = 30000

# --- Data locations ---------------------------------------------------------
BASE_DIR = '/home/jupyter/sb-entity-classification/data/'
# Directory holding the unzipped GloVe text files.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')

In [3]:
# Load the spaCy model package "en_core_web_sm" (it must already be installed
# in this environment). NOTE(review): `nlp` is not referenced by any other
# cell visible in this notebook — confirm whether it is still needed.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Labelled entity names. Paths are derived from BASE_DIR so the data location
# is configured in exactly one place.
# NOTE(review): read_csv is called with its default header handling, so the
# first row of data.csv is consumed as a header before the columns are renamed
# — confirm the file really starts with a header row.
df = pd.read_csv(os.path.join(BASE_DIR, 'data.csv'))
df.columns = ['class', 'name']

# One class name per line; the file has no header row.
classes_list = pd.read_csv(os.path.join(BASE_DIR, 'classes.txt'), header=None)
classes_list.columns = ['class_name']
# Class ids are the 1-based line number of the class name
# (based on information provided in the brief).
classes_list['class'] = classes_list.index + 1

# Left join keeps every row of df, attaching the readable class name where
# the id matches.
df = df.merge(classes_list, on='class', how='left')

### 2. Clean, Tokenize, and Pad Texts in Names 

In [24]:
# Normalise names: lower-case, then drop every character that is not an
# ASCII letter or whitespace (only English words for now). The pattern is a
# raw string so that `\s` is not parsed as a (invalid) string escape.
df['name_cleaned'] = (
    df['name']
    .str.lower()
    .str.replace(r"[^a-z\s]", "", regex=True)
)

# Keep the stop-word set under its own name. Rebinding `stopwords` would
# shadow the imported NLTK corpus reader and make this cell fail with an
# AttributeError the second time it is run.
# NOTE(review): words() is called without a language argument, which per the
# NLTK corpus-reader API loads the lists of every available language —
# confirm whether stopwords.words('english') was intended.
stop_words = set(stopwords.words())
df['name_cleaned'] = df['name_cleaned'].apply(
    lambda name: " ".join(word for word in name.split() if word not in stop_words)
)

In [28]:
texts = df['name_cleaned']

# Fit a Keras tokenizer on the cleaned names; `num_words` limits how many
# distinct words are kept when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

# Integer-encode every name.
sequences = tokenizer.texts_to_sequences(texts)

# Longest name, measured in space-separated words, sets the padded length.
max_seq_length = texts.str.split(' ').str.len().max()
print('Max number of words in a name is {}'.format(max_seq_length))

# Pad every sequence to the same length and store it alongside the name.
emb_texts = pad_sequences(sequences, maxlen=max_seq_length)
df['name_seq'] = emb_texts.tolist()

Found 291320 unique tokens.
Max number of words in a name is 16


In [34]:
print('Indexing word vectors..')

# Build a {token: vector} lookup from the GloVe text file whose dimension
# matches EMBEDDING_DIM. Each line has the form "<token> <v1> <v2> ... <vN>".
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(EMBEDDING_DIM))
embeddings_index = {}
# Read explicitly as UTF-8: without an encoding, open() falls back to the
# platform locale, which can fail or mis-decode non-ASCII tokens.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Split off the token; the remainder of the line is the vector.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, 'f', sep=' ')

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors..
Found 400000 word vectors.


### 3. Prepare GloVe embedding matrix for tokens that appear in names

In [61]:
# prepare embedding matrix
# Allocate the embedding matrix: one row per tokenizer index, capped at
# MAX_NUM_WORDS rows, and EMBEDDING_DIM columns. The extra +1 leaves room for
# row 0 (presumably because tokenizer indices start at 1 — confirm).
vocab_size = len(word_index) + 1
num_words = MAX_NUM_WORDS if MAX_NUM_WORDS < vocab_size else vocab_size
embedding_matrix = np.zeros(shape=(num_words, EMBEDDING_DIM))

In [64]:
# Copy the GloVe vector of each token into the row given by its tokenizer
# index. Indices at or beyond MAX_NUM_WORDS are skipped, and tokens with no
# GloVe vector keep their all-zero row.
for token, row in word_index.items():
    if row < MAX_NUM_WORDS:
        glove_vector = embeddings_index.get(token)
        if glove_vector is not None:
            embedding_matrix[row] = glove_vector

print('Prepared embedding matrix')

Prepared embedding matrix


In [80]:
# Spot check: Euclidean distance between the embedding rows at indices 1 and 6
# (rows are addressed by the tokenizer's word_index values). The indices are
# arbitrary examples.
spatial.distance.euclidean(embedding_matrix[1],embedding_matrix[6])

3.7354855011281862

**To try next:** the Universal Sentence Encoder (sentence-level embeddings). See:
https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/