# Inforet 2022: Project

## Imports

In [1]:
import re, regex, timeit, gzip, random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from sense2vec import Sense2Vec
from collections import Counter
from nltk.tokenize import MWETokenizer
from nltk.util import Trie
tqdm.pandas()
spacy.__version__ 

'3.2.4'

## Read and preprocess data

In [2]:
# if you've already unzipped the file
patent_data=open('G06K.txt').read().strip()

# split into patents texts | 1 entry = 1 patent
patent_texts = patent_data.split('\n\n')

# split each patent into lines
patent_lines = patent_data.split('\n')

In [3]:
print(len(patent_lines),'patent lines')
print(len(patent_texts),'texts of patents')

288792 patent lines
2003 texts of patents


## Extract features

In [4]:
cvectorizer = CountVectorizer(ngram_range=(2, 3), min_df=10, stop_words="english")
X=cvectorizer.fit_transform(patent_lines)

Xdf = pd.DataFrame(np.sum(X, axis=0), columns=cvectorizer.get_feature_names()).T.sort_values(by = 0, ascending = False)
Xdf.head(25)



Unnamed: 0,0
present disclosure,17882
present invention,17244
image data,16898
electronic device,16280
according embodiment,14713
image processing,12214
embodiment present,11881
shown fig,10904
control unit,9263
mobile terminal,9165


### Manyterms

In [5]:
# here are the potential terms
mwes = open('manyterms.lower.txt').read().lower().strip().split('\n')
print(mwes[44444:44456])
print(len(mwes),'mwes')

['antonio superchi', 'antonio tarver', 'antonio torres jurado', 'antonio valdes', 'antonio valdes y fernandez bazan', 'antonio valdez', 'antonio valdés y bazán', 'antonio valdés y fernández bazán', 'antonio valente', 'antonio vitali', 'antonio vivaldi', 'antonio xavier machado e cerveira']
743274 mwes


In [6]:

# Here lowercase=False option is used to keep the original case of the terms, since we possibly could have term abbreviations. Like API, CAT, etc.
cvectorizer = CountVectorizer(ngram_range=(1, 4), stop_words="english", vocabulary=mwes, lowercase=True)
X=cvectorizer.fit_transform(patent_texts)

# Show top-25 most frequent terms
termdf_cv = pd.DataFrame(np.sum(X, axis=0), columns=cvectorizer.get_feature_names()).T.sort_values(by = 0, ascending = False)
termdf_cv.head(25)



Unnamed: 0,0
electronic device,16280
image processing,12224
control unit,9263
mobile terminal,9165
information processing,7732
neural network,6734
user interface,6177
computer readable,6103
fingerprint sensor,5980
display device,5666


- [EXPERIMENT] Longer words - more specific terms?

In [7]:
# Count vectorizer with vocabulary
# Here lowercase=False option is used to keep the original case of the terms, since we possibly could have term abbreviations. Like API, CAT, etc.
cvectorizer = CountVectorizer(ngram_range=(3, 4), stop_words="english", vocabulary=mwes, lowercase=False)
X=cvectorizer.fit_transform(patent_lines)

# Show top-25 most frequent terms
term_cv_long = pd.DataFrame(np.sum(X, axis=0), columns=cvectorizer.get_feature_names()).T.sort_values(by = 0, ascending = False)
term_cv_long.head(25)



Unnamed: 0,0
point cloud data,1593
convolutional neural network,1005
printed circuit board,816
central processing unit,698
deep neural network,636
light emitting diode,626
liquid crystal display,574
local area network,479
machine learning model,478
mobile computing device,470


## 🪄 SpaCy NER

Instead of using EntityRuler, we can use the built-in PharaseMatcher and Span for annotation and saving it to the binary `.spacy` format

Let's start from understanding. Here is an example of showing part of text on one patent with default NER 

In [8]:
from spacy.util import filter_spans
from spacy import displacy
from spacy.tokens import DocBin
from spacy.tokens import Span


nlp = spacy.load("en_core_web_lg")
doc = nlp(patent_texts[0][18000:20000]) # 
displacy.render(doc, style="ent", jupyter = True)

### Create DataSet

We need to create propper dataset that is compatible with SpaCy 3.0

In [9]:
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(text) for text in termdf_cv.index]
matcher.add("Tech", patterns)

In [10]:
# split train and test patent_lines with sciki-learn
from sklearn.model_selection import train_test_split
train_lines, test_lines = train_test_split(patent_lines, test_size=0.3, random_state=42)


We are using PharsesMatcher to find entities similar to one from mayterms.txt  
Then Span is labeled and saved into the binary `.spacy` format

Training data

In [15]:
LABEL = "TECH"
doc_bin_train = DocBin() # create a DocBin object

# nlp.max_length = 2000000
for training_example  in tqdm(train_lines[:40000]): #~50 patents
    doc = nlp.make_doc(training_example) 
    ents = []
    
    for match_id, start, end in matcher(doc):
        #print(i,"Matched based on lowercase token text:", doc[:10], '::::::::',doc[start:end],start, end)
        span = Span(doc, start, end, label=LABEL)
        #print(span, span.label_)
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    filtered_ents = filter_spans(ents)
    print(filtered_ents[:3])
    doc.ents = filtered_ents 
    doc_bin_train.add(doc)

  0%|          | 0/40000 [00:00<?, ?it/s]

[the present, the image]
[wide angle, high resolution]
[the image, for every]
[]
[in parallel, laser beams, laser beams]
[execution unit]
[autonomous driving, wireless network]
[image processing, image processing, image quality]
[the normal, exposure time, exposure time]
[Driver assistance, Driver assistance, driver assistance system]
[]
[]
[]
[index matrix, index matrix]
[the image, machine learning, machine learning]
[The system, the matrix, rejection rate]
[]
[]
[rechargeable battery]
[light sources]
[]
[]
[color image, number of, image analysis]
[computer system, The normal, the normal]
[]
[]
[fuel consumption, traffic congestion, a vehicle]
[]
[surgical instrument, data collection, the cloud]
[]
[there exists, data storage, data storage]
[]
[]
[graphics pipeline, inter-thread communication, in service]
[information display, number of, number of]
[]
[]
[the image]
[machine learning, nasal cavities, machine learning]
[]
[]
[the envelope]
[]
[]
[]
[The present]
[]
[STORAGE MEDIUM]
[T

Validation

In [16]:
LABEL = "TECH"
doc_bin_valid = DocBin() # create a DocBin object
nlp = spacy.blank("en")
# nlp.max_length = 2000000
for training_example  in tqdm(test_lines[:12000]): #~15 patents
    doc = nlp.make_doc(training_example) 
    ents = []
    
    for match_id, start, end in matcher(doc):
        #print(i,"Matched based on lowercase token text:", doc[:10], '::::::::',doc[start:end],start, end)
        span = Span(doc, start, end, label=LABEL)
        #print(span, span.label_)
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    filtered_ents = filter_spans(ents)
    print(filtered_ents[:3])
    doc.ents = filtered_ents 
    doc_bin_valid.add(doc)

  0%|          | 0/12000 [00:00<?, ?it/s]

[one-dimensional array, one-dimensional array, one-dimensional array]
[display screen, display screen, first generation]
[]
[communication network, a wireless local area network, Long Term Evolution]
[mobile device, mobile device, mobile device]
[image processing, the traffic, second detection]
[general-purpose computer]
[the image, topographic map, the image]
[biometric data, the computer, acquisition process]
[Network Access, network access]
[white balance]
[]
[TRACKING DEVICE]
[the image, the image, video content]
[real time]
[user data, the security]
[the present, computation time]
[the present]
[neural network, input data]
[the present, electronic device, wireless communication]
[display device, the image, the image]
[error message]
[the present, the cockpit, flight deck]
[the image, the transition, the image]
[digital communications, parallel interface, binary number]
[]
[the plant]
[]
[mobile phone, mobile phone]
[data pre-processing, the following, acceleration sensor]
[focal d

Test

In [17]:
LABEL = "TECH"
doc_bin_test = DocBin() # create a DocBin object
nlp = spacy.blank("en")
# nlp.max_length = 2000000
for training_example  in tqdm(test_lines[12000:24000]): #~5 patents
    doc = nlp.make_doc(training_example) 
    ents = []
    
    for match_id, start, end in matcher(doc):
        #print(i,"Matched based on lowercase token text:", doc[:10], '::::::::',doc[start:end],start, end)
        span = Span(doc, start, end, label=LABEL)
        #print(span, span.label_)
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    filtered_ents = filter_spans(ents)
    print(filtered_ents[:3])
    doc.ents = filtered_ents 
    doc_bin_test.add(doc)

  0%|          | 0/12000 [00:00<?, ?it/s]

[machine learning]
[first light, first light, the image]
[The reference frame, the reference frame, the reference frame]
[spatial frequency]
[the following]
[]
[the present]
[]
[magnetic strip]
[PET scanner, positron emission tomography, physical properties]
[classification problem, classification problem, input data]
[the present, the spirit, the present]
[yet another, number of, frequency domain]
[threshold value, threshold value, threshold value]
[]
[]
[feature extraction, image enhancement, edge detection]
[]
[input device]
[]
[user interface, sensor fusion]
[the power]
[]
[]
[]
[computer readable, recording medium, malicious code]
[collision avoidance, velocity vector, video frame]
[the present]
[data extraction]
[driver circuit, the present, resonant circuit]
[calibration target]
[vehicle lane]
[primary cell, secondary cell, fuel cell]
[the novel]
[smart home, smart home, smart home]
[dielectric material, polyethylene terephthalate]
[Augmented reality display, user interface, vid

Save data

In [18]:
doc_bin_train.to_disk("training_data.spacy") # save the docbin object
doc_bin_valid.to_disk("valid_data.spacy") # save the docbin object
doc_bin_test.to_disk("test_data.spacy") # save the docbin object

# save train_lines to txt file
with open('train_lines.txt', 'w') as f:
    for line in train_lines:
        f.write(line)
        f.write('\n')
f.close()

# save train_lines to txt file
with open('valid_lines.txt', 'w') as f:
    for line in test_lines[:12000]:
        f.write(line)
        f.write('\n')
f.close()

# save test_lines to txt file
with open('test_lines.txt', 'w') as f:
    for line in test_lines[12000:24000]:
        f.write(line)
        f.write('\n')
f.close()

### Configuration

Donwnload __base_config.cfg__ for your system at https://spacy.io/usage/training#quickstart

In [17]:
# Run to generate full training config
!python -m spacy init fill-config base_config.cfg config.cfg

/home/gaetan_serre93_gmail_com/miniconda3/bin/python: No module named spacy


### Training

Run training. All results are stored into __./spacy_output__ 

In [17]:
!python -m spacy train config.cfg --output ./spacy_output --paths.train ./training_data.spacy --paths.dev ./valid_data.spacy

[38;5;2m✔ Created output directory: spacy_output[0m
[38;5;4mℹ Saving to output directory: spacy_output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2022-04-15 17:13:09,327] [INFO] Set up nlp object from config
[2022-04-15 17:13:09,340] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-04-15 17:13:09,344] [INFO] Created vocabulary
[2022-04-15 17:13:09,345] [INFO] Finished initializing nlp object
[2022-04-15 17:13:50,927] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     45.50    0.28    0.36    0.24    0.00
  0     200         41.48   2334.34   36.93   72.23   24.81    0.37
  0     400         48.97   1330.87   52.86   58.68   48.09    0.53
  0 

### Testing

In [18]:
nlp_ner = spacy.load("./spacy_output/model-best")

colors = {"TECH": "#7DF6D9", "MEDICALCONDITION":"#FFFFFF"}
options = {"colors": colors} 

for line in test_lines[10000:10005]:
    doc = nlp_ner(line)
    spacy.displacy.render(doc, style="ent", options= options, jupyter=True)


## 🦄 Prodigy: Make it even better

For this part i have used this tutorial: https://newscatcherapi.com/blog/train-custom-named-entity-recognition-ner-model-with-spacy-v3  
and official documentation: https://spacy.io/usage/training#custom-ner-model

### 📖 Teach it! 

One of the besst feature of Prodigy is that you can focus annotation on the most uncertain enitties.  
For this one we use __ner.teach__

As a dataset, used valid_data.txt, since model is already fitter on the training data

In [None]:
!prodigy ner.teach ner_tech  ./spacy_output/model-best  valid_lines.txt --label TECH

<img src="./img/binary.png" height=400>

Also, instead of binary judging, we can correct model prediction manualy by using __ner.correct__

In [None]:
!prodigy ner.correct gold_tech  ./spacy_output/model-best  valid_lines.txt --label TECH

<img src="./img/annotation.png" height=420>

### 🤝 Merge it!

Now, we need to merge our binary annotation into the __gold dataset__  
This means that now we are fixing annotation manually in the text which we are rejected during __ner.teach__  

Those annotation can be directly merged into already created dataset(by ner.correct)

In [None]:
!prodigy ner.silver-to-gold gold_tech ner_tech ./spacy_output/model-best --label TECH 

### 🏋️‍♀️ .. or train it with Prodigy

We can finetune\ train our existing SpaCy model(pipeline) inside prodigy  

Here we train existing `/model_best` and output our finte-tuned model into `spacy_output`

In [None]:
!prodigy train ./prodigy_output --ner gold_tech --eval-split 0.3 --base-model ./spacy_output/model-best 

<img src="./img/terminal_training.png" height=400>


### ✍️ Evaluation 
- Let's evaluate on the test-lines. They are already randomized, so it's a good place to start.

In [None]:
nlp_ner = spacy.load("./prodigy_output/model-best")

colors = {"TECH": "#7DF6D9", "MEDICALCONDITION":"#FFFFFF"}
options = {"colors": colors} 

for line in test_lines[10000:10010]:
    doc = nlp_ner(line)
    spacy.displacy.render(doc, style="ent", options= options, jupyter=True)

## TODO: Hearst Patterns 

PhraseMatcher, etc...