In [1]:
# !pip install -q ipymarkup

# Import

In [2]:
import os
import re
import spacy
from spacy.tokens import Doc, Span
from spacy import displacy

nlp = spacy.load("en_core_web_lg")

In [3]:
# path = "/kaggle/input/information-retrieval-dorin-khuong/"
path = './'
filename = "C07K.txt"
# Read the corpus inside a context manager so the file handle is closed as soon as
# the text is in memory (the previous one-liner never closed it).
with open(os.path.join(path, filename)) as corpus_file:
    peptents_text = corpus_file.read().strip()
peptents_list = peptents_text.split("\n\n\n") #a list of peptide patents!
# When the corpus is re-read, drop the `already_done` flag so that the cleaning
# cell runs again on the fresh list. Only the "flag was never set" case (NameError)
# is expected here; any other error should surface instead of being swallowed.
try:
    del already_done
except NameError:
    print("Already done doesn't exist anyway")

def check_flawed_peptents(peptents_list):
    """
    Checks that each peptent contains exactly one description and one claim section.

    A section is detected by counting its literal marker in the peptent text
    ("_____d:" for descriptions, "_____c:" for claims). Missing or repeated markers
    suggest that the corpus was split with a wrong delimiter.

    Parameters:
        peptents_list: A list of peptents (strings)
    Returns:
        descriptionless_peptents, claimless_peptents, doubles: lists of peptent indices which
            - do not contain a description
            - do not contain a claim
            - contain more than one description or claim (each index is listed once,
              even when both sections are duplicated)
    """
    claims_string = "_____c:"
    descriptions_string = "_____d:"
    descriptionless_peptents = []
    claimless_peptents = []
    doubles = []

    for i, peptent in enumerate(peptents_list):
        # The markers are plain literals, so a substring count is enough.
        description_count = peptent.count(descriptions_string)
        claim_count = peptent.count(claims_string)
        if description_count == 0:
            descriptionless_peptents.append(i)
        if claim_count == 0:
            claimless_peptents.append(i)
        if description_count > 1:
            print("Found stupid double description peptent at %d" %i)
        if claim_count > 1:
            print("Found stupid double claim peptent at %d" %i)
        # Record the index once: appending it for each duplicated section made
        # len(doubles) overcount the number of affected peptents.
        if description_count > 1 or claim_count > 1:
            doubles.append(i)
    return descriptionless_peptents, claimless_peptents, doubles

# Audit the freshly loaded list and report which peptents look wrongly delimited.
(descriptionless_peptents,
 claimless_peptents,
 doubles) = check_flawed_peptents(peptents_list)
print("There are %d peptents with more than one description or claim" % len(doubles))
# Each heading is followed by its index list on the next line.
print("Peptent indices without a description:", descriptionless_peptents, sep="\n")
print("Peptent indices without a claim:", claimless_peptents, sep="\n")

Already done doesn't exist anyway
There are 0 peptents with more than one description or claim
Peptent indices without a description:
[180, 1132, 1481, 1482, 1483, 1987]
Peptent indices without a claim:
[180, 1132, 1480, 1481, 1482, 1987]


In [4]:
# One-time cleanup of the flawed peptents reported by check_flawed_peptents.
# The `already_done` flag keeps the cell from running twice on the same list:
# merging and popping by fixed index is not repeatable.
try:
    print("Have the peptents been cleaned already?: %s" %already_done)
except NameError:
    # The flag does not exist yet, i.e. we haven't run this cell on the current list.
    print("Cleaning the peptents.")
    #after some verification, we found the following treatment:
    #drop 180, 1132: empty
    #merge 1483 into 1480: the former is the continuation of the latter
    #drop 1483: merged into 1480
    #drop 1481, 1482, 1987: empty or meaningless code
    peptents_list[1480] += peptents_list[1483]
    #drop the elements from the highest index down so that earlier pops
    #do not shift the positions of the ones still to be removed
    for flawed_index in (1987, 1483, 1482, 1481, 1132, 180):
        peptents_list.pop(flawed_index)

#set the flag so that re-running this cell skips the cleaning
already_done = True

Cleaning the peptents.


In [5]:
# Re-run the audit on the cleaned list; every category should now be empty.
descriptionless_peptents, claimless_peptents, doubles = check_flawed_peptents(peptents_list)
count_flawed = sum(
    len(flawed_indices)
    for flawed_indices in (descriptionless_peptents, claimless_peptents, doubles)
)
print("Flawed peptents: %d" %count_flawed)

Flawed peptents: 0


# NER 🤗

### ʕ•́ᴥ•̀ʔっ♡ ktgiahieu/bert-for-patents-finetuned-ner

This model was obtained by fine-tuning https://huggingface.co/anferico/bert-for-patents
on the dataset https://huggingface.co/datasets/ktgiahieu/maccrobat2018_2020

Finetune notebook 📓: https://colab.research.google.com/drive/1OzCY782KJSF0FBDS0d1CoMhfp3-RtJMV

In [14]:
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          pipeline,
                          )

# Hub identifier of the fine-tuned token-classification checkpoint.
model_checkpoint = "ktgiahieu/bert-for-patents-finetuned-ner"

# Tokenizer and model weights are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# NER pipeline wrapping the model and tokenizer loaded above.
model_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

example_text = """
nconvenience to production and research, and also affects the quality and efficacy of the drug.
Description of the Invention
In view of the problems existing in the prior art, the present invention provides a crystal of polymyxin B1 sulfate, polymyxin B2 sulfate or a mixture thereof and a preparation method thereof, wherein polymyxin B1 sulfate, polymyxin B2 sulfate or a mixture thereof can be prepared by the method described in the patent ZL201110390624.5.
The present invention provides an anhydrous crystal 1 of polymyxin B1 sulfate, said anhydrous crystal 1 has an X-ray powder diffraction pattern having diffraction peaks at 3.396, 4.895 and 6.903 expressed by 2θ degree using Cu-Ka radiation; preferably, said anhydrous crystal 1 has an X-ray powder diffraction pattern expressed by 2θ degree using Cu-Ka radiation as shown in Figure 2A.
Preferably, said anhydrous crystal 1 has an infrared absorption spectrum having characteristic bands at 1071.93 cm-1, 1242.91 cm-1, 1384.25 cm-1, 1457.69 cm-1, 1524.29 cm-1, 1639.38 cm-1, 2957.69 cm-1, 3064.55 cm-1 and 3270.53 cm-1 as measured by KBr tableting method; and more preferably, said anhydrous crystal 1 has an infrared absorption spectrum as measured by KBr tableting method as shown in Figure 3A.
More preferably, said anhydrous crystal 1 has a melting point of 226.97 °C, and has a differential scanning calorimetry pattern as shown in Figure 4.
The present inventor conducted a more detailed analysis on anhydrous crystal 1, in which said anhydrous crystal 1 exhibits a dynamic moisture adsorption analysis spectrum as shown in Figure 5, a thermogravimetric analysis spectrum as shown in Figure 6, and an isotherm diagram as shown in Figure 7.
"""
# example_text ="""
# he present invention also provides a method for preparing the anhydrous crystal A of polymyxin B1 sulfate, said method comprises the following steps of:
# (1) adding water to polymyxin B1 sulfate to just completely dissolve the solid to obtain a saturated solution;(2) slowly adding an organic solvent dropwise into said saturated solution, or slowly adding said saturated solution dropwise into an organic solvent at a controlled temperature within the range of 0-60 °C to precipitate a solid; wherein, said organic solvent is selected from ethanol, ethanol-n-butanol, n-butanol-isopropanol, methanol, acetone, butanone, or ethanol-ethyl acetate; and(3) filtering off the solid and drying it under vacuum to obtain an anhydrous crystal A of polymyxin B1 sulfate.
# In one embodiment of the method for preparing the anhydrous crystal A of polymyxin B1 sulfate according to the present invention, in step (1), after adding water to polymyxin B1 sulfate, the solid is just completely dissolved by heating at a temperature below 60 °C.
# In one embodiment of the method for preparing the anhydrous crystal A of polymyxin B1 sulfate according to the present invention, in step (2), said organic solvent is used in an amount of 0.5-20 volumes in terms of the volume of said saturated solution.
# In one embodiment of the method for preparing the anhydrous crystal A of polymyxin B1 sulfate according to the present invention, in step (2), after the solid is precipitated, stirring is continued for 0-8 hours.
# """
# Run the NER pipeline on the sample passage. The following cells treat the result
# as a sequence of per-token predictions read through the keys 'entity', 'start'
# and 'end'.
orig_entities = model_pipeline(example_text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
# Define the ClassLabel feature with the label names
label_list = ['B-Activity', 'B-Administration', 'B-Age', 'B-Area', 'B-Biological_attribute', 'B-Biological_structure', 'B-Clinical_event', 'B-Color', 'B-Coreference', 'B-Date', 'B-Detailed_description', 'B-Diagnostic_procedure', 'B-Disease_disorder', 'B-Distance', 'B-Dosage', 'B-Duration', 'B-Family_history', 'B-Frequency', 'B-Height', 'B-History', 'B-Lab_value', 'B-Mass', 'B-Medication', 'B-Nonbiological_location', 'B-Occupation', 'B-Other_entity', 'B-Other_event', 'B-Outcome', 'B-Personal_background', 'B-Qualitative_concept', 'B-Quantitative_concept', 'B-Severity', 'B-Sex', 'B-Shape', 'B-Sign_symptom', 'B-Subject', 'B-Texture', 'B-Therapeutic_procedure', 'B-Time', 'B-Volume', 'B-Weight', 'I-Activity', 'I-Administration', 'I-Age', 'I-Area', 'I-Biological_attribute', 'I-Biological_structure', 'I-Clinical_event', 'I-Color', 'I-Coreference', 'I-Date', 'I-Detailed_description', 'I-Diagnostic_procedure', 'I-Disease_disorder', 'I-Distance', 'I-Dosage', 'I-Duration', 'I-Family_history', 'I-Frequency', 'I-Height', 'I-History', 'I-Lab_value', 'I-Mass', 'I-Medication', 'I-Nonbiological_location', 'I-Occupation', 'I-Other_entity', 'I-Other_event', 'I-Outcome', 'I-Personal_background', 'I-Qualitative_concept', 'I-Quantitative_concept', 'I-Severity', 'I-Shape', 'I-Sign_symptom', 'I-Subject', 'I-Texture', 'I-Therapeutic_procedure', 'I-Time', 'I-Volume', 'I-Weight', 'O']
keep_list = ['Diagnostic_procedure', 'Medication', 'Lab_value', 'Detailed_description']

# Decode the pipeline's raw labels and keep only the entity types in keep_list.
# Decoded predictions are stored in fresh dicts so that `orig_entities` is left
# untouched: relabeling it in place made this cell fail on a second run, because an
# already-decoded label no longer has an integer after its first 6 characters.
entities = []
for raw_entity in orig_entities:
    # presumably the raw label looks like 'LABEL_<id>', so [6:] is the index into
    # label_list -- TODO confirm against the model config
    decoded_label = label_list[int(raw_entity['entity'][6:])]
    # [2:] strips the 'B-'/'I-' prefix to get the bare entity type
    if decoded_label == 'O' or decoded_label[2:] not in keep_list:
        continue
    entities.append({**raw_entity, 'entity': decoded_label})

In [16]:
from ipymarkup import show_span_ascii_markup, show_dep_ascii_markup, show_span_box_markup

# Merge consecutive token-level predictions into [start, end, type] spans.
# A token extends the current span when it has the same entity type and starts
# right at the span's end or exactly one character after it; larger gaps start a
# new span.
entities_markup = []
for token in entities:
    token_type = token['entity'][2:]
    if entities_markup:
        current_span = entities_markup[-1]
        # current_span always ends where the previous token ended and carries the
        # previous token's type, so comparing against it matches comparing against
        # the previous token.
        gap = token['start'] - current_span[1]
        if gap in (0, 1) and token_type == current_span[2]:
            current_span[1] = token['end']
            continue
    entities_markup.append([token['start'], token['end'], token_type])

show_span_box_markup(example_text, entities_markup)

# ⚠️ WE NEED PRODIGY 🦄 TO FIX THE ANNOTATIONS
For example, `201210379231` is not a CHEMICAL

In [9]:
# download prodigy for your system: https://gerdes.fr/saclay/informationRetrieval/prodigy/
# put the files in a subfolder
# try what works. for me it's:
# !pip install ./mac/prodigy-1.11.11-cp310-cp310-macosx_11_0_arm64.whl

## Prepare the patent text in prodigy format

In [11]:
import json
# Read the raw patent dump.
with open('C07K.txt', 'r') as f:
    raw_text = f.read().strip()

# One chunk per non-empty line, with surrounding whitespace removed.
texts = [line.strip() for line in raw_text.split('\n')]
texts = [line for line in texts if line]

# Prodigy expects JSON Lines: one {"text": ...} object per line.
with open('C07K.jsonl', 'w') as outfile:
    for chunk in texts:
        outfile.write(json.dumps({'text': chunk}) + '\n')

## Fix annotations with prodigy + ktgiahieu/bert-for-patents-finetuned-ner

In [12]:
!prodigy bert.ner.manual ner_C07K ./C07K.jsonl --hide-wp-prefix -F ner_bert_patent_manual.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

✨  Starting the web server at http://localhost:8080 ...
Open the app in your browser and start annotating!

^C

[38;5;2m✔ Saved 10 annotations to database SQLite[0m
Dataset: ner_C07K
Session ID: 2023-04-16_05-13-20



## Output annotations in Huggingface format

In [13]:
import json
import spacy

from prodigy.components.db import connect

# Export the Prodigy annotations of dataset "ner_C07K" as token/IOB-tag records.
db = connect()
prodigy_annotations = db.get_dataset("ner_C07K")
# Pair each text with its full annotation record so nlp.pipe hands both back together.
examples = ((eg["text"], eg) for eg in prodigy_annotations)
# A blank English pipeline only tokenizes.
nlp = spacy.blank("en")

dataset = []
skipped = 0

# NOTE(review): every stored example is exported, whatever its "answer" value is --
# confirm that rejected/ignored examples should be part of the training data.
for doc, eg in nlp.pipe(examples, as_tuples=True):
    try:
        doc.ents = [doc.char_span(s["start"], s["end"], s["label"]) for s in eg["spans"]]
        # Tokens outside any entity yield "O-" here; the strip below turns that into "O".
        iob_tags = [f"{t.ent_iob_}-{t.ent_type_}" if t.ent_iob_ else "O" for t in doc]
        iob_tags = [t.strip("-") for t in iob_tags]
        tokens = [str(t) for t in doc]
    except Exception as err:
        # Still best-effort (an example without usable spans is left out), but the
        # loss is now counted and reported instead of silently ignored, and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        skipped += 1
        print("Skipping example (%s: %s)" % (type(err).__name__, err))
        continue
    dataset.append({
        "tokens": tokens,
        "tags": iob_tags
    })

print("Exported %d examples, skipped %d" % (len(dataset), skipped))

with open('data.jsonl', 'w') as outfile:
    for entry in dataset:
        outfile.write(json.dumps(entry) + '\n')


# Finally, let's fine-tune the model again using these new annotations:
- Upload `data.jsonl` to Huggingface Hub
- Combine this with `ktgiahieu/maccrobat2018_2020` gold dataset 
- Fine-tune another model with the finetune notebook 📓: https://colab.research.google.com/drive/1OzCY782KJSF0FBDS0d1CoMhfp3-RtJMV