<a href="https://colab.research.google.com/github/kstyle2198/NLP_TIPS/blob/main/Explore_Semantic_Relations_in_Corpora_with_Embedding_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://medium.com/towards-data-science/explore-semantic-relations-in-corpora-with-embedding-models-0a6d64c3ec7f

In [1]:
!pip install glovpy gensim scikit-learn

Collecting glovpy
  Downloading glovpy-0.2.0-py3-none-any.whl (7.7 kB)
Installing collected packages: glovpy
Successfully installed glovpy-0.2.0


In [2]:
from gensim.utils import tokenize
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [3]:
def clean_tokenize(text: str) -> list[str]:
    """This function tokenizes texts and removes stop words from them"""
    tokens = tokenize(text, lower=True, deacc=True)
    tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]
    return tokens

In [4]:
# Loading the dataset
dataset = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"), categories=["sci.med"]
)
newsgroups = dataset.data
len(newsgroups)

594

In [5]:
# Tokenizing the dataset
tokenized_corpus = [clean_tokenize(text) for text in newsgroups]
len(tokenized_corpus)

594

In [6]:
tokenized_corpus[0]

['reply',
 'keith',
 'actrix',
 'gen',
 'nz',
 'keith',
 'stewart',
 'help',
 'asking',
 'medical',
 'information',
 'subject',
 'ask',
 'specific',
 'questions',
 'likely',
 'type',
 'textbook',
 'chapter',
 'covering',
 'aspects',
 'subject',
 'looking',
 'comprehensive',
 'review',
 'ask',
 'local',
 'hospital',
 'librarian',
 'happy',
 'help',
 'request',
 'sort',
 'briefly',
 'condition',
 'patients',
 'significant',
 'residual',
 'weakness',
 'childhood',
 'polio',
 'notice',
 'progression',
 'weakness',
 'older',
 'theory',
 'remaining',
 'motor',
 'neurons',
 'work',
 'harder',
 'die',
 'sooner']

In [7]:
from glovpy import GloVe

In [8]:
# Training word embeddings
model = GloVe(vector_size=25)
model.train(tokenized_corpus)

Collecting Vocabulary...
Collecting cooccurrences...
Shuffling cooccurrences...
Training model...


In [9]:
model.wv.most_similar("child")

[('adult', 0.9202715158462524),
 ('restraint', 0.9079991579055786),
 ('consistent', 0.8805399537086487),
 ('hand', 0.874118447303772),
 ('children', 0.8557751178741455),
 ('belt', 0.8536299467086792),
 ('use', 0.8462283611297607),
 ('age', 0.8388909697532654),
 ('safety', 0.8354132771492004),
 ('law', 0.8179876208305359)]

In [10]:
!pip install embedding_explorer

Collecting embedding_explorer
  Downloading embedding_explorer-0.5.2-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m782.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting dash<3.0.0,>=2.11.1 (from embedding_explorer)
  Downloading dash-2.14.2-py3-none-any.whl (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dash-extensions<0.2.0,>=0.1.10 (from embedding_explorer)
  Downloading dash_extensions-0.1.13-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dash-iconify<0.2.0,>=0.1.2 (from embedding_explorer)
  Downloading dash_iconify-0.1.2-py3-none-any.whl (18 kB)
Collecting dash-mantine-components<0.12.0,>=0.11.1 (from embedding_explorer)
  Downloading dash_mantine_components-0.11.1-py3-none-any.whl (443 kB)
[2K     [90m━

In [11]:
from embedding_explorer import show_network_explorer

vocabulary = model.wv.index_to_key
embeddings = model.wv.vectors
show_network_explorer(vocabulary, embeddings=embeddings)

Creating explorer with name: 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<Thread(Thread-10 (_run_silent), stopped 140254591649344)>

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# First we train a model on the corpus that learns all 4-grams
# We will only take the 4000 most frequent ones into account for now,
# But you can freely experiment with this
feature_extractor = CountVectorizer(ngram_range=(4,4), max_features=4000)
feature_extractor.fit(newsgroups)
# Then we get the vectorizer's vocabulary
four_grams = feature_extractor.get_feature_names_out()

In [15]:
!pip install embetter[text]

Collecting embetter[text]
  Downloading embetter-0.6.0-py2.py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m476.1 kB/s[0m eta [36m0:00:00[0m
Collecting skops>=0.8.0 (from embetter[text])
  Downloading skops-0.9.0-py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bpemb>=0.3.3 (from embetter[text])
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting keras-nlp>=0.6.0 (from embetter[text])
  Downloading keras_nlp-0.6.4-py3-none-any.whl (584 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.8/584.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sense2vec==2.0.0 (from embetter[text])
  Downloading sense2vec-2.0.0-py2.py3-none-any.whl (39 kB)
Collecting sentence-transformers>=2.2.2 (from embetter[text])
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     

In [16]:
from embetter.text import SentenceEncoder

encoder = SentenceEncoder("all-MiniLM-L6-v2")

Using TensorFlow backend


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [17]:
from embedding_explorer import show_network_explorer

show_network_explorer(four_grams, vectorizer=encoder)

Creating explorer with name: 


<IPython.core.display.Javascript object>

Open in browser:


<IPython.core.display.Javascript object>

<Thread(Thread-55 (_run_silent), stopped 140250623358528)>

In [18]:
import pandas as pd
import numpy as np

corpus = tokenized_corpus

# Extracting text lengths in number of characters.
lengths = [len(text) for text in corpus]

# Extracting first 400 characters from each text.
text_starts = [text[:400] for text in corpus]

# Extracting the group each text belongs to
# Sklearn gives the labels back as integers, we have to map them back to
# the actual textual label.
group_labels = np.array(dataset.target_names)[dataset.target]

# We build a dataframe with the available metadata
metadata = pd.DataFrame(dict(length=lengths, text=text_starts, group=group_labels))

In [19]:
metadata.head()

Unnamed: 0,length,text,group
0,53,"[reply, keith, actrix, gen, nz, keith, stewart...",sci.med
1,41,"[allergy, shots, years, starting, sophomore, h...",sci.med
2,135,"[vida, develop, inner, ear, problems, flying, ...",sci.med
3,73,"[rousseaua, immunex, com, writes, heat, shock,...",sci.med
4,104,"[probably, years, possible, disassemble, assem...",sci.med


In [20]:
metadata.group.unique()

array(['sci.med'], dtype=object)

In [21]:
from embedding_explorer import show_clustering

show_clustering(
  newsgroups,
  vectorizer=encoder,
  metadata=metadata,
  hover_name="group", # Title of hover box is going to be the group
  hover_data=["text", "length"] # We would also like to see these on hover
)

<IPython.core.display.Javascript object>

Open in browser:


<IPython.core.display.Javascript object>

<Thread(Thread-77 (_run_silent), stopped 140250623358528)>