# Inforet 2022: Project

## Imports

In [None]:
import re, regex, timeit, gzip, random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from collections import Counter
from nltk.tokenize import MWETokenizer
from nltk.util import Trie
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
tqdm.pandas()
spacy.__version__ 

## Read and preprocess data

In [None]:
# Read the whole (already unzipped) corpus; the context manager closes the file handle.
with open('G06K.txt') as corpus_file:
    patent_data = corpus_file.read().strip()

# split into patents texts | 1 entry = 1 patent (patents are separated by a blank line)
patent_texts = patent_data.split('\n\n')

# every individual line of the corpus, across all patents
# (the blank separator lines between patents come through as empty strings)
patent_lines = patent_data.split('\n')

In [None]:
# Sanity check: how many lines and how many patents were loaded.
print(f"{len(patent_lines)} patent lines")
print(f"{len(patent_texts)} texts of patents")

## 👀 Extract features

In [None]:
# Count 2- and 3-grams (English stop words removed) that occur in at least 10 lines.
cvectorizer = CountVectorizer(ngram_range=(2, 3), min_df=10, stop_words="english")
X = cvectorizer.fit_transform(patent_lines)

# Total corpus frequency per n-gram, most frequent first.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
Xdf = pd.DataFrame(np.sum(X, axis=0), columns=cvectorizer.get_feature_names_out()).T.sort_values(by=0, ascending=False)
Xdf.head(25)

### Manyterms

In [None]:
# here are the potential terms: one per line, lower-cased
# (the context manager closes the file handle)
with open('manyterms.lower.txt') as terms_file:
    mwes = terms_file.read().lower().strip().split('\n')
print(mwes[44444:44456])  # peek at an arbitrary slice of the term list
print(len(mwes),'mwes')

In [None]:

# Count only the terms listed in `mwes`. The term list is lower-cased when it is loaded,
# so the text is lower-cased as well (lowercase=True) to make the two comparable.
# NOTE(review): with stop_words="english", vocabulary terms that themselves contain a
# stop word are presumably never counted — confirm.
cvectorizer = CountVectorizer(ngram_range=(1, 4), stop_words="english", vocabulary=mwes, lowercase=True)
X = cvectorizer.fit_transform(patent_texts)

# Show top-25 most frequent terms
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
termdf_cv = pd.DataFrame(np.sum(X, axis=0), columns=cvectorizer.get_feature_names_out()).T.sort_values(by=0, ascending=False)
termdf_cv.head(25)

- [EXPERIMENT] Longer words - more specific terms?

In [None]:
# Count vectorizer with vocabulary, restricted to 3- and 4-grams
# NOTE(review): `mwes` is lower-cased when it is loaded, so with lowercase=False any
# n-gram that contains a capital letter in the text cannot match a vocabulary entry —
# confirm this is intended for the experiment.
cvectorizer = CountVectorizer(ngram_range=(3, 4), stop_words="english", vocabulary=mwes, lowercase=False)
X = cvectorizer.fit_transform(patent_lines)

# Show top-25 most frequent terms
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
term_cv_long = pd.DataFrame(np.sum(X, axis=0), columns=cvectorizer.get_feature_names_out()).T.sort_values(by=0, ascending=False)
term_cv_long.head(25)

## 🪄 SpaCy NER

Instead of using EntityRuler, we can use the built-in PhraseMatcher and Span for annotation, and save the result in the binary `.spacy` format.

Let's start by building some intuition. Here is part of the text of one patent, annotated with the default NER model.

In [None]:
from spacy.util import filter_spans
from spacy import displacy
from spacy.tokens import DocBin
from spacy.tokens import Span


# Pretrained English pipeline; its NER output is what gets visualised below.
nlp = spacy.load("en_core_web_lg")
doc = nlp(patent_texts[0][18000:20000]) # characters 18000-19999 of the first patent
# Render the recognised entities inline in the notebook.
displacy.render(doc, style="ent", jupyter = True)

### Create DataSet

We need to create a proper dataset that is compatible with spaCy 3.0.

In [None]:
# Match on the LOWER token attribute, i.e. case-insensitively.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# One tokenised pattern per term (termdf_cv is indexed by term).
patterns = [nlp.make_doc(text) for text in termdf_cv.index]
# All patterns are registered under the single match key "Tech".
matcher.add("Tech", patterns)

In [None]:
# split train and test patent_lines with scikit-learn
from sklearn.model_selection import train_test_split
# 70/30 split of individual lines; random_state=42 makes the split reproducible.
train_lines, test_lines = train_test_split(patent_lines, test_size=0.3, random_state=42)


We use PhraseMatcher to find entities that match the terms from manyterms.lower.txt.  
Each matched Span is then labeled and saved in the binary `.spacy` format.

Training data

In [None]:
LABEL = "TECH"
doc_bin_train = DocBin()  # container that is later serialised to a .spacy file

# Only the first 40,000 lines of the training split are annotated (~50 patents).
for training_example in tqdm(train_lines[:40000]):
    # Tokenise only: the matcher needs tokens, not the full pipeline output.
    doc = nlp.make_doc(training_example)

    # Span(...) always returns a Span, so every match becomes a candidate entity.
    ents = [Span(doc, start, end, label=LABEL) for _match_id, start, end in matcher(doc)]

    # doc.ents cannot contain overlapping spans; filter_spans removes the overlaps.
    # (No per-line print here: it would emit one output line per training line.)
    doc.ents = filter_spans(ents)
    doc_bin_train.add(doc)

Validation

In [None]:
LABEL = "TECH"
doc_bin_valid = DocBin()  # container that is later serialised to a .spacy file
# NOTE(review): `matcher` was created from the vocab of the pipeline loaded earlier, while
# the docs below come from this fresh blank pipeline — confirm the mismatch is intended.
nlp = spacy.blank("en")

# Validation set: the first 12,000 lines of the test split (~15 patents).
for validation_example in tqdm(test_lines[:12000]):
    # Tokenise only: the matcher needs tokens, not the full pipeline output.
    doc = nlp.make_doc(validation_example)

    # Span(...) always returns a Span, so every match becomes a candidate entity.
    ents = [Span(doc, start, end, label=LABEL) for _match_id, start, end in matcher(doc)]

    # doc.ents cannot contain overlapping spans; filter_spans removes the overlaps.
    # (No per-line print here: it would emit one output line per validation line.)
    doc.ents = filter_spans(ents)
    doc_bin_valid.add(doc)

Test

In [None]:
LABEL = "TECH"
doc_bin_test = DocBin()  # container that is later serialised to a .spacy file
# NOTE(review): `matcher` was created from the vocab of the pipeline loaded earlier, while
# the docs below come from this fresh blank pipeline — confirm the mismatch is intended.
nlp = spacy.blank("en")

# Held-out test set: lines 12,000-23,999 of the test split.
for test_example in tqdm(test_lines[12000:24000]):
    # Tokenise only: the matcher needs tokens, not the full pipeline output.
    doc = nlp.make_doc(test_example)

    # Span(...) always returns a Span, so every match becomes a candidate entity.
    ents = [Span(doc, start, end, label=LABEL) for _match_id, start, end in matcher(doc)]

    # doc.ents cannot contain overlapping spans; filter_spans removes the overlaps.
    # (No per-line print here: it would emit one output line per test line.)
    doc.ents = filter_spans(ents)
    doc_bin_test.add(doc)

Save data

In [None]:
# Serialise the three annotated DocBins in spaCy's binary format.
for doc_bin, spacy_path in (
    (doc_bin_train, "training_data.spacy"),
    (doc_bin_valid, "valid_data.spacy"),
    (doc_bin_test, "test_data.spacy"),
):
    doc_bin.to_disk(spacy_path)

# Also keep the raw text of each split, one line per row: the complete training split,
# plus the validation and test slices of the test split.
for txt_path, lines in (
    ('train_lines.txt', train_lines),
    ('valid_lines.txt', test_lines[:12000]),
    ('test_lines.txt', test_lines[12000:24000]),
):
    # The `with` block closes the file on exit.
    with open(txt_path, 'w') as f:
        f.writelines(f"{line}\n" for line in lines)

### Configuration

Download __base_config.cfg__ for your system at https://spacy.io/usage/training#quickstart

In [None]:
# Run to generate full training config:
# reads base_config.cfg and writes the completed configuration to config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

### Training

Run training. All results are stored into __./spacy_output__ 

In [None]:
# Train with config.cfg on the saved training/validation DocBins; results go to ./spacy_output.
# --gpu-id 0 selects GPU 0 for training.
!python -m spacy train config.cfg --output ./spacy_output --paths.train ./training_data.spacy --paths.dev ./valid_data.spacy --gpu-id 0

### Testing

In [None]:
# Best checkpoint written by the training run.
nlp_ner = spacy.load("./spacy_output/model-best")

# displaCy highlight colours per entity label.
# NOTE(review): the annotation cells of this notebook only produce the "TECH" label;
# "MEDICALCONDITION" looks like a leftover — confirm.
colors = {"TECH": "#7DF6D9", "MEDICALCONDITION":"#FFFFFF"}
options = {"colors": colors} 

# NOTE(review): indices 10000-10004 lie inside test_lines[:12000], the slice used as the
# validation set, not inside the held-out slice test_lines[12000:24000].
for line in test_lines[10000:10005]:
    doc = nlp_ner(line)
    spacy.displacy.render(doc, style="ent", options= options, jupyter=True)


## 🦄 Prodigy: Make it even better

For this part I used this tutorial: https://newscatcherapi.com/blog/train-custom-named-entity-recognition-ner-model-with-spacy-v3  
and official documentation: https://spacy.io/usage/training#custom-ner-model

### 📖 Teach it! 

One of the best features of Prodigy is that you can focus annotation on the most uncertain entities.  
For this we use __ner.teach__

As the dataset we use valid_lines.txt, since the model has already been fitted on the training data

In [None]:
# Binary accept/reject annotation of TECH suggestions made by model-best on valid_lines.txt;
# the answers are stored in the Prodigy dataset `ner_tech`.
!prodigy ner.teach ner_tech  ./spacy_output/model-best  valid_lines.txt --label TECH

<img src="./img/binary.png" height=400>

Also, instead of binary judging, we can correct the model's predictions manually by using __ner.correct__

In [None]:
# Manually correct the TECH predictions of model-best on valid_lines.txt;
# the corrected annotations are stored in the Prodigy dataset `gold_tech`.
!prodigy ner.correct gold_tech  ./spacy_output/model-best  valid_lines.txt --label TECH

<img src="./img/annotation.png" height=420>

### 🤝 Merge it!

Now we need to merge our binary annotations into the __gold dataset__.  
This means that we now manually fix the annotations in the texts that we rejected during __ner.teach__.  

These annotations can be merged directly into the dataset already created by ner.correct.

In [None]:
# Turn the binary `ner_tech` annotations into complete annotations, saved to `gold_tech`.
!prodigy ner.silver-to-gold gold_tech ner_tech ./spacy_output/model-best --label TECH 

### 🏋️‍♀️ .. or train it with Prodigy

We can fine-tune / train our existing spaCy model (pipeline) inside Prodigy  

Here we train the existing `model-best` from `spacy_output` and write our fine-tuned model to `prodigy_output`

In [None]:
# Train NER on the `gold_tech` dataset, starting from ./spacy_output/model-best and holding
# out 30% of the examples for evaluation; the result is written to ./prodigy_output.
!prodigy train ./prodigy_output --ner gold_tech --eval-split 0.3 --base-model ./spacy_output/model-best 

<img src="./img/terminal_training.png" height=400>


### ✍️ Evaluation 
- Let's evaluate on the test-lines. They are already randomized, so it's a good place to start.

In [None]:
# Model fine-tuned with Prodigy.
nlp_ner = spacy.load("./prodigy_output/model-best")

# load test lines
# The trailing newline of every line is dropped so the model sees the same text that was
# annotated in memory; the `with` block closes the file, so no explicit close() is needed.
with open('test_lines.txt', 'r') as f:
    test_lines = [line.rstrip('\n') for line in f]


# displaCy highlight colours per entity label.
colors = {"TECH": "#7DF6D9", "MEDICALCONDITION":"#FFFFFF"}
options = {"colors": colors}

# Visualise the predictions for ten of the reloaded test lines.
for line in test_lines[10000:10010]:
    doc = nlp_ner(line)
    spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

## 🏆 Evaluation

#### Load model and data

In [None]:
# load best model
nlp_ner = spacy.load("./spacy_output/model-best")

# load test lines
# The trailing newline of every line is dropped so the text matches what was annotated in
# memory; the `with` block closes the file, so no explicit close() is needed.
with open('test_lines.txt', 'r') as f:
    test_lines = [line.rstrip('\n') for line in f]

# read hearst_patterns.30.csv: word pairs (word1, word2) with a label giving the direction
homonyms_df = pd.read_csv('hearst_patterns.30.csv')
homonyms_df

#### 🏅 Manual Gold dataset
Here we are evaluating NER model on the manually created gold dataset. 

In [None]:
# TODO: placeholder — the evaluation on the manual gold dataset is not implemented yet.
!TODO__TODO

#### 🌐 Word-Net

Here we are evaluating extracted Hypernyms using WordNet. Here is an example how it works

In [None]:
# Noun synsets (pos='n') that WordNet lists for each of the two example words.
cd_rom = wn.synsets('CD-ROM', pos='n')
computer = wn.synsets('computer', pos='n')

In [None]:
# Wu-Palmer similarity for every (CD-ROM sense, computer sense) combination.
for cd_rom_sense in cd_rom:
    for computer_sense in computer:
        similarity = cd_rom_sense.wup_similarity(computer_sense)
        print(cd_rom_sense, computer_sense)
        print("Score:", similarity)

In [None]:
# Length of the shortest path between the first sense of each word.
cd_rom[0].shortest_path_distance(computer[0])

Run on our list of hypernyms

In [None]:
def wordnet_distance(word1, word2):
    """Return the best Wu-Palmer similarity between any senses of two terms.

    Spaces in both terms are replaced by underscores before the WordNet lookup.
    Every synset of ``word1`` is compared with every synset of ``word2``.

    Args:
        word1: First term (the callers in this notebook pass the parent here).
        word2: Second term (the callers in this notebook pass the subclass here).

    Returns:
        The highest similarity found, rounded to two decimals; ``0`` when a term
        has no synsets or no pair of synsets is comparable; ``np.nan`` when the
        similarity computation raises.
    """
    parent_synsets = wn.synsets(word1.replace(' ', '_'))
    subclass_synsets = wn.synsets(word2.replace(' ', '_'))

    # 0 is the baseline so that max() below always has something to return.
    scores = [0]
    try:
        for parent_synset in parent_synsets:
            for subclass_synset in subclass_synsets:
                score = parent_synset.wup_similarity(subclass_synset)
                # wup_similarity returns None when the two synsets cannot be connected;
                # None must be skipped because max() cannot compare it with numbers.
                if score is not None:
                    scores.append(score)
    except Exception:
        return np.nan

    return np.round(max(scores), 2)

In [None]:
# Score every word pair of homonyms_df with WordNet
results_wordnet = []
for index, row in tqdm(homonyms_df.iterrows()):
    # label == -1: word1 is the parent; any other label: word2 is the parent.
    if row["label"] != -1:
        parent, subclass = row['word2'], row['word1']
    else:
        parent, subclass = row['word1'], row['word2']
    res = wordnet_distance(parent, subclass)
    results_wordnet.append(res)
    print(parent, "⬅️", subclass, ": ", res)


Save to the dataframe

In [None]:
# save to dataframe: results_wordnet was filled in the row order of homonyms_df,
# so it can be attached directly as a new column
homonyms_df['wordnet_distance'] = results_wordnet 

#### 🧪 Spacy embeddings
What if the model already has links between words? Since it is trained on the corpus data, it should be able to find similarity between words.  
This can be useful when evaluating our list of hyponyms: we can run it and find the low-similarity elements for further analysis.

In [None]:
# Load the trained model from ./spacy_output for the comparison.
# NOTE(review): Doc.similarity is normally based on the pipeline's word vectors; whether
# this trained pipeline contains any is not visible here — confirm.
nlp = spacy.load("./spacy_output/model-best")

# Two example phrases, processed into Doc objects.
word_1 = nlp("cloud platform")
word_2 = nlp("service provider")

print(word_1, "<->", word_2, word_1.similarity(word_2))

Run on our list of hypernyms

In [None]:
def spacy_score(word1, word2):
    """Return the spaCy similarity between two phrases.

    Both strings are processed with the notebook-level ``nlp`` pipeline and the
    first resulting doc is compared with the second one via ``similarity``.
    """
    first_doc = nlp(word1)
    second_doc = nlp(word2)
    similarity = first_doc.similarity(second_doc)
    return similarity

In [None]:
# Score every word pair of homonyms_df with the spaCy similarity
results_spacy = []
for index, row in tqdm(homonyms_df.iterrows()):
    # label == -1: word1 is the parent; any other label: word2 is the parent.
    if row["label"] != -1:
        parent, subclass = row['word2'], row['word1']
    else:
        parent, subclass = row['word1'], row['word2']

    res = spacy_score(parent, subclass)
    results_spacy.append(res)
    print(parent, "⬅️", subclass, ": ", res)


Add results to the dataframe

In [None]:
# results_spacy was filled in the row order of homonyms_df, so it maps onto the rows directly.
homonyms_df['spacy_distance'] = results_spacy

In [None]:
# Display the table with the score columns added so far.
homonyms_df

#### 📜 Wikidata

In [None]:
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.sparql import (get_subclasses_of_item,
                              return_sparql_query_results)
import wptools

In [None]:
# Example pair used in the following cells.
parent_name = "computer"
candidate_name = "iPad"

# get Wikidata item for parent
page = wptools.page(parent_name)
data = page.get_parse(show=False)
# the 'wikibase' entry is used below as the Wikidata item id of the parent
q_parent_class = data.data['wikibase']
q_parent_class

In [None]:
# use convenience function to get subclasses of an item as a list of item ids
subclasses_list = get_subclasses_of_item(q_parent_class)
# number of subclass ids that came back
len(subclasses_list)

In [None]:
# Print the labels of the first five subclasses
for subclass_id in subclasses_list[:5]:
    entity_dict = get_entity_dict_from_api(subclass_id)
    subclass_item = WikidataItem(entity_dict)
    print(subclass_item.get_label())

In [None]:
# Same lookup as for the parent, now for the candidate page.
page = wptools.page(candidate_name)
data = page.get_parse(show=False)
data.data['wikibase']

In [None]:
# The membership test checks the candidate against the subclasses of the parent,
# so the printed question names them in that order.
print(f"Is `{candidate_name}` a subclass of `{parent_name}`: ", data.data['wikibase'] in subclasses_list)

Run on our list of hypernyms

In [None]:
def _wikibase_id(page_name):
    """Return the 'wikibase' id of the page `page_name`, or None if it cannot be resolved."""
    try:
        # A failed page lookup and a missing 'wikibase' entry are both treated as "not found".
        return wptools.page(page_name).get_parse(show=False).data['wikibase']
    except Exception:
        return None


def wikidata_is_subclass(word_1, word_2):
    """Check on Wikidata whether `word_2` is listed among the subclasses of `word_1`.

    Args:
        word_1: Page title of the parent concept.
        word_2: Page title of the candidate subclass.

    Returns:
        "✅" if the candidate's item id is in the parent's subclass list,
        "❌" if it is not, and "⚠️" if either item id or the subclass list
        could not be retrieved (a message is printed in that case).
    """
    parent_name = word_1
    candidate_name = word_2

    # get Wikidata item for parent
    q_parent_id = _wikibase_id(parent_name)
    if not q_parent_id:
        print(f"Could not find Wikidata item for `{parent_name}`")
        return "⚠️"

    # get Wikidata item for candidate
    q_subclass_id = _wikibase_id(candidate_name)
    if not q_subclass_id:
        print(f"Could not find Wikidata item for `{candidate_name}`")
        return "⚠️"

    # use convenience function to get subclasses of an item as a list of item ids;
    # a failed query is reported like a failed lookup instead of aborting the caller's loop
    try:
        subclasses_list = get_subclasses_of_item(q_parent_id)
    except Exception:
        print(f"Could not fetch Wikidata subclasses of `{parent_name}`")
        return "⚠️"

    return "✅" if q_subclass_id in subclasses_list else "❌"

In [None]:
# Check every word pair of homonyms_df against the Wikidata subclass lists
wikidata_results = []
for index, row in tqdm(homonyms_df.iterrows()):
    # label == -1: word1 is the parent; any other label: word2 is the parent.
    if row["label"] == -1:
        parent = row['word1']
        subclass = row['word2']
    else:
        parent = row['word2']
        subclass = row['word1']

    res = wikidata_is_subclass(parent, subclass)
    wikidata_results.append(res)
    # Print the stored result; calling wikidata_is_subclass a second time would repeat
    # all of its lookups for the same pair.
    print(parent, "⬅️", subclass, ": ", res)


Add results to the dataframe

In [None]:
# wikidata_results was filled in the row order of homonyms_df, so it maps onto the rows directly.
homonyms_df['wikidata_is_subclass'] = wikidata_results

In [None]:
# Display the table including the Wikidata verdicts.
homonyms_df

#### Process table

In [None]:
def _verdict(score, threshold):
    """Map a score to "⚠️" (0 or NaN, i.e. no usable score), "✅" (above threshold) or "❌"."""
    if score == 0 or np.isnan(score):
        return "⚠️"
    if score > threshold:
        return "✅"
    return "❌"


# WordNet verdict: scores above 0.7 count as a subclass relation
homonyms_df['wordnet_is_subclass'] = homonyms_df['wordnet_distance'].apply(lambda x: _verdict(x, 0.7))

# spaCy verdict: same rule with a threshold of 0.3
homonyms_df['spacy_is_subclass'] = homonyms_df['spacy_distance'].apply(lambda x: _verdict(x, 0.3))


homonyms_df

In [None]:
# Export the verdict columns only (no raw scores), without the index column.
homonyms_df[[ "label", "word1", "word2", "wordnet_is_subclass", "spacy_is_subclass", "wikidata_is_subclass"]].to_csv("./homonyms_results.csv", index=False)

In [None]:
# Detailed export: the raw WordNet/spaCy scores alongside the three verdict columns.
homonyms_df[["label", "word1", "word2",  "wordnet_distance", "spacy_distance", "wordnet_is_subclass", "spacy_is_subclass", "wikidata_is_subclass"]].to_csv("./homonyms_results_detailed.csv", index=False)