# Information Retrieval project 2022

## 🧑‍🎓 Names
+ Mashra Marwan
+ Kiselov Nikita
+ Serré Gaëtan

## Imports

In [25]:
import sys
import time
import json
import pprint
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from spacy.matcher import PhraseMatcher, Matcher
from spacy.util import filter_spans
from spacy import displacy
from spacy.tokens import DocBin
from spacy.tokens import Span
from collections import Counter
from nltk.tokenize import MWETokenizer
from nltk.util import Trie
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
tqdm.pandas()
spacy.__version__


[nltk_data] Downloading package wordnet to /Users/nkise/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nkise/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


'3.2.4'

Our first goal here is to create a model capable of recognizing the term in our patents. For that, we will be using a Named Entity Recognition model given by `spacy`.

## 🗃️ Read and preprocess data

In [26]:
# if you've already unzipped the file
patent_data = open('G06K.txt').read().strip()

# split into patents texts | 1 entry = 1 patent
patent_texts = patent_data.split('\n\n')

# split each patent into lines
patent_lines = patent_data.split('\n')


## 👀 Extract features

In [27]:
# here are the potential terms
mwes = open('manyterms.lower.txt').read().lower().strip().split('\n')
print(mwes[44444:44456])
print(len(mwes), 'mwes')


['antonio superchi', 'antonio tarver', 'antonio torres jurado', 'antonio valdes', 'antonio valdes y fernandez bazan', 'antonio valdez', 'antonio valdés y bazán', 'antonio valdés y fernández bazán', 'antonio valente', 'antonio vitali', 'antonio vivaldi', 'antonio xavier machado e cerveira']
743274 mwes


### We extract the terms from our patents using manyterms

In [4]:
# Here lowercase=False option is used to keep the original case of the terms, since we possibly could have term abbreviations. Like API, CAT, etc.
cvectorizer = CountVectorizer(ngram_range=(
    1, 4), stop_words="english", vocabulary=mwes, lowercase=True)
X = cvectorizer.fit_transform(patent_texts)

# Show top-25 most frequent terms
termdf_cv = pd.DataFrame(np.sum(X, axis=0), columns=cvectorizer.get_feature_names(
)).T.sort_values(by=0, ascending=False)
termdf_cv.head(25)




Unnamed: 0,0
electronic device,16280
image processing,12224
control unit,9263
mobile terminal,9165
information processing,7732
neural network,6734
user interface,6177
computer readable,6103
fingerprint sensor,5980
display device,5666


## 🪄 SpaCy NER

Let's start from understanding. Here is an example of showing part of text on one patent with default NER.

In [5]:
nlp = spacy.load("en_core_web_lg")
doc = nlp(patent_texts[0][18000:20000])
displacy.render(doc, style="ent", jupyter=True)


We want to create a such model capable of recognizing the terms that are in the context of our patents. For that, we need to create a dataset and we will be using `manyterms` as a terms database.

### Create DataSet

We need to create propper dataset that is compatible with SpaCy 3.0 to train a NER model.

In [6]:
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(text) for text in termdf_cv.index]
matcher.add("Tech", patterns)


In [7]:
train_lines, test_lines = train_test_split(
    patent_lines, test_size=0.3, random_state=42)


We are using PharsesMatcher to find entities similar to one from mayterms.txt  
Then Span is labeled and saved into the binary `.spacy` format

In [8]:
def create_dataset(text, n_lines, filename, offset=0):
    LABEL = "TECH"
    doc_bin = DocBin()  # create a DocBin object

    for training_example in tqdm(text[offset:offset+n_lines]):
        doc = nlp.make_doc(training_example)
        ents = []

        for match_id, start, end in matcher(doc):
            span = Span(doc, start, end, label=LABEL)
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)

        filtered_ents = filter_spans(ents)
        doc.ents = filtered_ents
        doc_bin.add(doc)
    doc_bin.to_disk(filename)


Uncomment to re-save the dataset:

In [9]:
# create_dataset(train_lines, 40_000, "training_data.spacy")
# create_dataset(test_lines, 12_000, "valid_data.spacy")

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/12000 [00:00<?, ?it/s]

Now that our datasets are created, we can train a spacy NER model.

### Train the model

Donwnload __base_config.cfg__ for your system at https://spacy.io/usage/training#quickstart

In [10]:
# Run to generate full training config
# !python -m spacy init fill-config base_config.cfg config.cfg


[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


Run the training. The best and last model will be stored into __./spacy_output__  

In [11]:
# !python -m spacy train config.cfg --output ./spacy_output --paths.train ./training_data.spacy --paths.dev ./valid_data.spacy --gpu-id 0


### 💻 Load pre-trained model

In [13]:
!git lfs install
!git clone https://huggingface.co/kinivi/ner_patent_g06k

Updated git hooks.
Git LFS initialized.
fatal: destination path 'ner_patent_g06k' already exists and is not an empty directory.


### Test the model

In [5]:
nlp = spacy.load("./ner_patent_g06k/spacy_output/model-best")

doc = nlp("Wi-Fi Direct (registered trademark, which will be hereinafter referred to as WFD) \
           corresponding to a technology for directly performing a communication based on a \
           wireless LAN between communication devices without intermediation of an access \
           point (hereinafter referred to as AP) is standardized in Wi-Fi Alliance serving \
           as a wireless LAN industry group.")

colors = {"TECH": "#F67DE3"}
options = {"colors": colors}

spacy.displacy.render(doc, style="ent", options=options, jupyter=True)


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  x = um.multiply(x, x, out=x)
  Xhat = (X - mu) * var ** (-1.0 / 2.0)


In our patents, a lot of terms are abbreviated.
For example, `WI-FI Direct` is mentioned as `WFD`, or `P2P Group Owner` as `GO`.

Our original model does not recognize these abbreviations and therefore a huge part of the terms are ignored.
So we will fine-tune our model using prodigy

## 🦄 Prodigy

We create a train dataset for fine-tuning our ner model.

In [None]:
# !prodigy ner.correct fine_tune_g06k2 spacy_output/model-best G06K.txt --loader txt --label TECH


<img src=img/annotations.png style="width: 40%; height:40%"></img>

Now let's fine-tune our model!

In [None]:
# !prodigy train ./prodigy_output/ --ner fine_tune_g06k --base-model spacy_output/model-best --gpu-id 0


Now our refined model should recognize better the abbreviations.

In [13]:
nlp = spacy.load("./ner_patent_g06k/prodigy_output/model-best")

doc = nlp("According to the WFD, the communication is performed when one of the \
           communication devices that directly perform the wireless LAN communication \
           operates as the AP. According to the WFD, a role of the device that operates \
           as the AP will be referred to as P2P Group Owner (hereinafter, referred to as GO). \
           On the other hand, a role of the device that participates in a network generated by \
           the GO will be referred to as P2P Client (hereinafter, referred to as CL). \
           According to the WFD, a communication parameter necessary for participating in \
           the network generated by the GO is shared between the devices by transmitting the \
           communication parameter from the GO to the CL, and thereafter, the wireless \
           communication according to the WFD is executed on the basis of the shared communication parameter.")

colors = {"TECH": "#F67DE3"}
options = {"colors": colors}

spacy.displacy.render(doc, style="ent", options=options, jupyter=True)


This fine tuned model is more accurate than the first one. We will use it in our Hearst patterns recognition.

## 🧬 Hearst Patterns

In [16]:
class Hearst_Patterns:
    """ Extracts hearst patterns from a corpus
    """

    def __init__(self, patterns_file="patterns.json", model_path="spacy/model-new", text_path="G06K.txt"):
        """ creates an instance of the class Hearst_Patterns

        Args:
            patterns_file (path, optional): the json file containing the patterns. Defaults to "patterns.json".
            model_path (path, optional): the folder containing the NER model to use. Defaults to "spacy/model-new".
            text_path (path, optional): the file containing the corpus analyse and extract patterns. Defaults to "G06K.txt".
        """

        # read the text file
        g06k = open(text_path).read().strip()
        self.patent_lines = g06k.split('\n')

        # load the models
        self.nlp = spacy.load(model_path)
        self.en_nlp = spacy.load("en_core_web_lg")
        self.nlp.add_pipe("merge_entities")
        self.en_nlp.add_pipe('merge_noun_chunks')

        self.matcher = Matcher(self.nlp.vocab)
        self.patterns = self.load_patterns_from_json(patterns_file)
        for name, pattern in self.patterns:
            self.matcher.add(name, pattern)

        # this list is used in the method get_matches
        self.continue_words = [',', 'and', 'or', ';', 'also', 'as well']

    def load_patterns_from_json(self, patterns_file):
        """ read the json file and return the list of

        Args:
            patterns_file (path): the json file containing the patterns

        Returns:
            List: a list of the hearst patterns found in the json file  
        """
        f = open(patterns_file)
        data = json.load(f)
        patterns = []
        for name, pattern in data.items():
            patterns.append((name, pattern))

        return patterns

    def extract_patterns(self, size=10, save_folder=".", start=0):
        """ look for matches in a corpus (text file)

        Args:
            size (int, optional): the minimum number of matches to be found. Defaults to 10.
            save_folder (path, optional): the folder in which save the resulted csv file. Defaults to ".".
            start (int, optional): the first line in which we start to look for matches (useful to continue where you stopped). Defaults to 0.
        """
        extraced_patterns = []

        # chose a start
        line = start
        count = 0

        # for the output
        print(f'{count} pattern extracted...', end='\r')
        sys.stdout.flush()

        while count < size:
            while True:  # we read line by line until finding a match, to keep track of the count
                try:  # it bugs very rarely, don't know why XD

                    # look for a match
                    patterns = self.get_matches(self.patent_lines[line])
                    if patterns:
                        extraced_patterns += patterns
                        break
                    print(f'{count} patterns extracted...{line}', end='\r')
                    sys.stdout.flush()
                except:
                    print("An error has occurred")

                line += 1
            count = len(extraced_patterns)
            print(f'{count} patterns extracted...{line}', end='\r')
            sys.stdout.flush()

        print(f'({count}) patterns extracted from lines ({start}-{line}))')
        save_file = f"{save_folder}/hearst_patterns.{len(extraced_patterns)}.csv"
        print(f'Patterns saved to {save_file}')
        df = pd.DataFrame(extraced_patterns, columns=[
                          'word1', 'word2', 'relation', 'label', 'text'])
        df.to_csv(save_file)

    def get_matches(self, text):
        label = {
            'rhyper': -1,
            'hyper': 1,
        }
        # because patterns like < !(bla bla) X > don't work when X is in the beginning of the sentence
        doc = self.nlp('. '+text)

        matches = self.matcher(doc)
        relations = []
        for match_id, start, end in matches:

            # get all entities indices in the doc
            ent_indices = [i for i in range(start, end) if doc[i].text in [
                ent.text for ent in doc[start:end].ents]]
            if not ent_indices:  # no entity found
                return []

            # extract X...Y from a match ..X...Y.., so now we know that the first and the last token are the entities
            span = doc[min(ent_indices):max(ent_indices)+1]

            # Get string representation
            match_info = self.nlp.vocab.strings[match_id]
            match_name = match_info.split('-')[0]   # hyper or rhyper
            match_type = match_info.split('-')[1]   # single or multi

            np_0 = span[0]  # left term
            np_1 = span[-1]  # right term (or first right term if multiple)

            # all the right terms (ex. for Y...X1, X2, ...Xn) X1...Xn are the right terms
            right_terms = [np_1.text]
            if match_type == "multi":  # look for other terms (X2,X3..etc)

                # we use the en_core_web_lg model to get the noun chunks
                doc_en = self.en_nlp(doc[end:].text)
                for d in doc_en:
                    # look for entities inside the noun chunk
                    matching_ents = [
                        ent.text for ent in doc.ents if ent.text in d.text]
                    if matching_ents:
                        right_terms.append(matching_ents[0])
                    elif d.text not in self.continue_words:  # stop when seeing a word that's not in the list
                        break

            for term in right_terms:
                relations.append(
                    (np_0.text, term, match_name, label[match_name], text))

        relations = set(relations)
        return list(relations)


### ⛷️ start the extraction

In [20]:
hp = Hearst_Patterns(patterns_file="patterns.json",
                     model_path="./ner_patent_g06k/spacy_output/model-best", text_path="./G06K.txt")
hp.extract_patterns(size=50, start=5896, save_folder="hearst_patterns/")


0 patterns extracted...5902

  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  x = um.multiply(x, x, out=x)
  Xhat = (X - mu) * var ** (-1.0 / 2.0)


(51) patterns extracted from lines (5896-6148))
Patterns saved to hearst_patterns//hearst_patterns.51.csv


### ✨ Results

in the first <b>26331</b>, we found <b>245</b> different matches, <b>155</b> unique relation. Here are some of the results:

In [4]:
df_results = pd.read_csv("hearst_patterns/hearst_patterns.155.csv", index_col=0)[:50]
df_results[['Hypernym','Hyponym','Frequency']].head(10)

Unnamed: 0,Hypernym,Hyponym,Frequency
0,home appliance,mobile robot,15
1,computer network,the Internet,10
2,artificial intelligence,Deep learning,6
3,electronic device,electronic component,4
4,storage device,hard disk,4
5,communication network,the Internet,4
6,point light source,light source,4
7,the environment,foot traffic,3
8,storage media,hard drives,3
9,laser projector,display device,3


## 🏆 Evaluation

#### Load model and data

In [28]:
# load best model
nlp_ner = spacy.load("./ner_patent_g06k/prodigy_output/model-best")

# load test lines
with open('test_lines.txt', 'r') as f:
    test_lines = f.readlines()
f.close()

# read homonym_list.txt with pairs of homonyms
homonyms_df = pd.read_csv('./hearst_patterns/hearst_patterns.155.csv')
homonyms_df


Unnamed: 0.1,Unnamed: 0,Hypernym,Hyponym,Frequency,word1,word2,relation,label
0,0,home appliance,mobile robot,15,home appliance,mobile robot,rhyper,-1
1,1,computer network,the Internet,10,the Internet,computer network,hyper,1
2,2,artificial intelligence,Deep learning,6,artificial intelligence,Deep learning,rhyper,-1
3,3,electronic device,electronic component,4,electronic device,electronic component,rhyper,-1
4,4,storage device,hard disk,4,storage device,hard disk,rhyper,-1
...,...,...,...,...,...,...,...,...
150,150,computer readable,optical disk,1,computer readable,optical disk,rhyper,-1
151,151,optical system,plane mirror,1,plane mirror,optical system,hyper,1
152,152,light source,laser diode,1,laser diode,light source,hyper,1
153,153,recording medium,semiconductor memory,1,recording medium,semiconductor memory,rhyper,-1


#### 🌐 Word-Net

Here we are evaluating extracted Hypernyms using WordNet. Here is an example how it works

In [29]:
cd_rom = wn.synsets('CD-ROM', pos='n')
computer = wn.synsets('computer', pos='n')


In [30]:
for synset in cd_rom:
    for synset2 in computer:
        print(synset, synset2)
        print("Score:", synset.wup_similarity(synset2))


Synset('cd-rom.n.01') Synset('computer.n.01')
Score: 0.7
Synset('cd-rom.n.01') Synset('calculator.n.01')
Score: 0.4


In [31]:
cd_rom[0].shortest_path_distance(computer[0])


6

Run on our list of hypernyms

In [32]:
def wordnet_distance(word1, word2):
    parent = wn.synsets(word1.replace(' ', '_'))
    subclass = wn.synsets(word2.replace(' ', '_'))
    scores = [0]
    try:
        for synset in parent:
            for synset2 in subclass:
                scores.append(synset.wup_similarity(synset2))
    except:
        return np.nan

    return np.round(max(scores), 2)


In [33]:
# iterate throw homonyms_df
results_wordnet = []
for index, row in tqdm(homonyms_df.iterrows()):
    if row["label"] == -1:
        parent = row['word1']
        subclass = row['word2']
    else:
        parent = row['word2']
        subclass = row['word1']
    res = wordnet_distance(parent, subclass)
    results_wordnet.append(res)
    print(parent, "⬅️", subclass, ": ", res)


0it [00:00, ?it/s]

home appliance ⬅️ mobile robot :  0
computer network ⬅️ the Internet :  0
artificial intelligence ⬅️ Deep learning :  0
electronic device ⬅️ electronic component :  0
storage device ⬅️ hard disk :  0.89
communication network ⬅️ the Internet :  0
point light source ⬅️ light source :  0
the environment ⬅️ foot traffic :  0
storage media ⬅️ hard drives :  0
laser projector ⬅️ display device :  0
semiconductor memory ⬅️ flash memory :  0
storage medium ⬅️ hard disk :  0.67
storage medium ⬅️ magnetic disk :  0.71
machine learning ⬅️ deep learning :  0
electronic devices ⬅️ mobile phone :  0.63
machine learning ⬅️ artificial intelligence :  0
test pattern ⬅️ test pattern :  0
deep learning ⬅️ Convolutional Neural Network :  0
optical system ⬅️ optical system :  0
environmental conditions ⬅️ carbon dioxide :  0.27
flexible display ⬅️ display device :  0
storage medium ⬅️ non-volatile memory :  0
storage medium ⬅️ optical disk :  0.71
storage device ⬅️ main memory :  0
storage device ⬅️ input 

Save to the dataframe

In [34]:
# save to dataframe
homonyms_df['wordnet_distance'] = results_wordnet


#### 🧪 Spacy embeddings
What if model already has links between words? Since it's trained on the corpus data, it should be able to find similarity between words.  
This is what can be useful while evaluationg our hyponyms list. We could run it and find low-similar elemts for further analysis.

In [35]:
word_1 = nlp_ner("cloud platform")
word_2 = nlp_ner("service provider")

print(word_1, "<->", word_2, word_1.similarity(word_2))


cloud platform <-> service provider 0.48239135894127


Run on our list of hypernyms

In [36]:
def spacy_score(word1, word2):
    word1 = nlp_ner(word1)
    word2 = nlp_ner(word2)
    return word1.similarity(word2)


In [37]:
# iterate throw homonyms_df
results_spacy = []
for index, row in tqdm(homonyms_df.iterrows()):
    if row["label"] == -1:
        parent = row['word1']
        subclass = row['word2']
    else:
        parent = row['word2']
        subclass = row['word1']

    res = spacy_score(parent, subclass)
    results_spacy.append(res)
    print(parent, "⬅️", subclass, ": ", res)


0it [00:00, ?it/s]

home appliance ⬅️ mobile robot :  0.24053525712770824
computer network ⬅️ the Internet :  0.3473185514133592
artificial intelligence ⬅️ Deep learning :  0.6340177910135353
electronic device ⬅️ electronic component :  0.596316942755877
storage device ⬅️ hard disk :  0.5226111831551006
communication network ⬅️ the Internet :  0.39472354731538645
point light source ⬅️ light source :  0.9023671626758542
the environment ⬅️ foot traffic :  0.1970939420135585
storage media ⬅️ hard drives :  0.21867868544769628
laser projector ⬅️ display device :  0.047422703300193096
semiconductor memory ⬅️ flash memory :  0.8473547281050336


  x = um.multiply(x, x, out=x)


storage medium ⬅️ hard disk :  0.5238650175071684
storage medium ⬅️ magnetic disk :  0.6181492239340104
machine learning ⬅️ deep learning :  0.7875986458980156
electronic devices ⬅️ mobile phone :  0.4429614842595396
machine learning ⬅️ artificial intelligence :  0.5784102365301521
test pattern ⬅️ test pattern :  1.0
deep learning ⬅️ Convolutional Neural Network :  0.5148216892496057
optical system ⬅️ optical system :  1.0
environmental conditions ⬅️ carbon dioxide :  0.621414958294819
flexible display ⬅️ display device :  0.6244238171573945
storage medium ⬅️ non-volatile memory :  0.1679327110712199
storage medium ⬅️ optical disk :  0.6074251079772134
storage device ⬅️ main memory :  0.3310919302404015
storage device ⬅️ input device :  0.6659649925777138
inclination angle ⬅️ inclination angle :  1.0
convolutional neural network ⬅️ neural network :  0.9166362725093549
non-volatile RAM ⬅️ read-only memory :  0.8338553504212518
moving images ⬅️ digital camcorder :  0.75523297475568
wirel

Add results to the dataframe

In [38]:
homonyms_df['spacy_distance'] = results_spacy


In [39]:
homonyms_df.head()


Unnamed: 0.1,Unnamed: 0,Hypernym,Hyponym,Frequency,word1,word2,relation,label,wordnet_distance,spacy_distance
0,0,home appliance,mobile robot,15,home appliance,mobile robot,rhyper,-1,0.0,0.240535
1,1,computer network,the Internet,10,the Internet,computer network,hyper,1,0.0,0.347319
2,2,artificial intelligence,Deep learning,6,artificial intelligence,Deep learning,rhyper,-1,0.0,0.634018
3,3,electronic device,electronic component,4,electronic device,electronic component,rhyper,-1,0.0,0.596317
4,4,storage device,hard disk,4,storage device,hard disk,rhyper,-1,0.89,0.522611


#### 📜 Wikidata

In [40]:
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.sparql import (get_subclasses_of_item,
                              return_sparql_query_results)
import wptools


Here we are using wptools to find the page of the term and extract the ID.

In [41]:
parent_name = "computer"
candidate_name = "iPad"

# get Wikidata item for parent
page = wptools.page(parent_name)
data = page.get_parse(show=False)
q_parent_class = data.data['wikibase']
q_parent_class


en.wikipedia.org (parse) computer


'Q68'

In [42]:
# use convenience function to get subclasses of an item as a list of item ids
subclasses_list = get_subclasses_of_item(q_parent_class)
len(subclasses_list)


3452

Print sub-classes of the term

In [43]:
# print some of this classes
for subclass in subclasses_list[:5]:
    q42_dict = get_entity_dict_from_api(subclass)
    print(WikidataItem(q42_dict).get_label())


computer
personal computer
IBM 704
microcomputer
analog computer


In [44]:
page = wptools.page(candidate_name)
data = page.get_parse(show=False)
data.data['wikibase']


en.wikipedia.org (parse) iPad


'Q2796'

In [45]:
print(f"Is class `{parent_name}` is a subclass of `{candidate_name}`: ",
      data.data['wikibase'] in subclasses_list)


Is class `computer` is a subclass of `iPad`:  True


Run on our list of hypernyms

In [46]:
def wikidata_is_subclass(word_1, word_2):
    parent_name = word_1
    candidate_name = word_2

    # get Wikidata item for parent
    page = wptools.page(parent_name)
    try:
        time.sleep(1)
        data_parent = page.get_parse(show=False)
    except:
        print(f"Could not find Wikidata item for `{parent_name}`")
        return "⚠️"
    q_parent_id = data_parent.data['wikibase']

    # get Wikidata item for candidate
    page = wptools.page(candidate_name)
    try:
        time.sleep(1)
        data_subclass = page.get_parse(show=False)
    except:
        print(f"Could not find Wikidata item for `{candidate_name}`")
        return "⚠️"
    q_subclass_id = data_subclass.data['wikibase']

    # use convenience function to get subclasses of an item as a list of item ids
    subclasses_list = get_subclasses_of_item(q_parent_id)

    res = q_subclass_id in subclasses_list
    if res:
        return "✅"
    else:
        return "❌"


In [None]:
# iterate throw homonyms_df
wikidata_results = []
for index, row in tqdm(homonyms_df.iterrows()):
    if row["label"] == -1:
        parent = row['word1']
        subclass = row['word2']
    else:
        parent = row['word2']
        subclass = row['word1']

    # pause for a bit to avoid hitting Wikidata API rate limit
    time.sleep(index)
    res = wikidata_is_subclass(parent, subclass)
    wikidata_results.append(res)
    print(parent, "⬅️", subclass, ": ", wikidata_is_subclass(parent, subclass))


Add results to the dataframe

In [None]:
homonyms_df['wikidata_is_subclass'] = wikidata_results
homonyms_df

#### 💾 Save table

In [None]:
# create columns wordnet_is_subclass, fill with "⚠️" if not 0 or np.nan, otherwise fill with "✅" if wordnet_distance > 0.7, otherwise fill with "❌"
homonyms_df['wordnet_is_subclass'] = homonyms_df['wordnet_distance'].apply(
    lambda x: "⚠️" if x == 0 or np.isnan(x) else "✅" if x > 0.7 else "❌")

# do the same for spacy
homonyms_df['spacy_is_subclass'] = homonyms_df['spacy_distance'].apply(
    lambda x: "⚠️" if x == 0 or np.isnan(x) else "✅" if x > 0.3 else "❌")


homonyms_df


In [None]:
homonyms_df[["label", "word1", "word2",  "wordnet_distance", "spacy_distance", "wordnet_is_subclass",
             "spacy_is_subclass", "wikidata_is_subclass"]].to_csv("./homonyms_results_detailed.csv", index=False)
