In [2]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.4/95.4 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [21]:
import spacy
import neuralcoref
from spacy import displacy

In [4]:
# Build the English pipeline from the medium spaCy model.
nlp = spacy.load("en_core_web_md")

# Register NeuralCoref on the pipeline so processed docs expose the
# `doc._.has_coref`, `doc._.coref_clusters` and `doc._.coref_resolved` extensions
# used further down.
neuralcoref.add_to_pipe(nlp)

In [None]:
# Let's try something more!
# Let's scrape a longer text from a website
# and run the coreference resolution through a function.
# Then run the NER part through a function and compare performance (NER with spaCy vs. NER with GLiNER).

In [None]:
# Short three-sentence sample for coreference resolution
# (pronouns "She"/"she" refer back to a named person).
text = (
    "John asked Mary to go out. "
    "She said she was busy. "
    "John was disappointed but understood."
)

In [5]:
# Longer, multi-sentence sample with repeated pronoun references to one person.
# Split across lines with implicit string concatenation; the value is one single line of text.
text2 = (
    "Rihanna is basically master of the fashion universe right now, so we're "
    "naturally going to pay attention to what trends she is and isn't wearing "
    "whenever she steps out of the door (or black SUV). "
    "She's having quite the epic week, first presenting her Savage x Fenty "
    "lingerie runway show then hosting her annual Diamond Ball charity event "
    "last night. "
    "Rihanna was decked out in Givenchy for the big event, but upon arrival at "
    "the venue, she wore a T-shirt, diamonds (naturally), and a scarf, leather "
    "pants, and heels in fall's biggest color trend: pistachio green."
)

In [11]:
# Single-sentence sample with one pronoun ("him") and one nominal ("the company") reference;
# this is the text the pipeline is actually run on below.
text3 = (
    "When Sebastian Thrun started working on self-driving cars at Google in 2007, "
    "few people outside of the company took him seriously."
)

In [12]:
# Run the spaCy pipeline (with NeuralCoref added to it) on the Sebastian Thrun sentence.
# `doc` is the object the coreference and NER cells below inspect.
doc = nlp(text3)

In [13]:
# Walk every coreference cluster and list each of its mentions.
for cluster in doc._.coref_clusters:
    # Each mention is a spaCy Span. `start`/`end` are token offsets into `doc`
    # (end is exclusive), not character positions in the raw text.
    for mention in cluster:
        print(mention, mention.start, mention.end, sep="\n")

Sebastian Thrun
1
3
him
22
23
Google
11
12
the company
19
21


In [14]:
# Show each cluster (main mention followed by all of its mentions).
# Nothing is printed, and `coref_clusters` is not read, when no coreference was found.
for cluster in (doc._.coref_clusters if doc._.has_coref else ()):
    print(f"Coreference cluster: {cluster}")

Coreference cluster: Sebastian Thrun: [Sebastian Thrun, him]
Coreference cluster: Google: [Google, the company]


In [16]:
# `coref_resolved` is the document text with each mention replaced by its cluster's
# main mention (e.g. "him" -> "Sebastian Thrun"). Per NeuralCoref's documentation
# this is text, not a spaCy Doc, despite the variable name.
resolved_doc = doc._.coref_resolved
print("Resolved document:", resolved_doc, sep="\n")

Resolved document:
When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of Google took Sebastian Thrun seriously.


### NER

In [19]:
# text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."

In [20]:
# doc = nlp(text)

In [18]:
# List every named entity spaCy found in `doc`, formatted as text(LABEL).
for entity in doc.ents:
    print(f"{entity.text}({entity.label_})")

Sebastian Thrun(PERSON)
Google(ORG)
2007(DATE)


In [20]:
# Visualize the named entities of `doc` with displaCy's entity ("ent") style.
displacy.render(doc, style="ent")

In [4]:
# import urllib.request
# from bs4 import BeautifulSoup
# import spacy
# import neuralcoref
# nlp = spacy.load('en_core_web_lg')
# neuralcoref.add_to_pipe(nlp)

# # html = urllib.request.urlopen('https://www.law.cornell.edu/supremecourt/text/418/683').read()

# html = urllib.request.urlopen('https://www.nbcnews.com/business/business-news/biden-preparing-block-us-steel-sale-japanese-company-rcna169595').read()

# soup = BeautifulSoup(html, 'html.parser')
# text = ''.join([t for t in soup.find_all(text=True) if t.parent.name == 'p' and len(t) >= 25])
# doc = nlp(text)
# resolved_text = doc._.coref_resolved
# sentences = [sent.string.strip() for sent in nlp(resolved_text).sents]
# output = [sent for sent in sentences if 'president' in 
#           (' '.join([token.lemma_.lower() for token in nlp(sent)]))]
# print('Fact count:', len(output))
# for fact in range(len(output)):
#     print(str(fact+1)+'.', output[fact])

In [2]:
from gliner import GLiNER

# Load a pretrained GLiNER checkpoint. The small v2.1 model is the one actually used;
# the commented-out line below keeps an alternative (medium) checkpoint for reference.
# model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")

Fetching 5 files: 100%|██████████| 5/5 [00:40<00:00,  8.02s/it]


In [None]:
# Entity types GLiNER is asked to find.
# GLiNER models generally work best when type names are lower case or title case.
labels = [
    "Person",
    "Award",
    "Date",
    "Competitions",
    "Teams",
]

In [None]:
# Run GLiNER entity prediction on the coreference-resolved text.
# This cell previously passed `processed_doc`, a name that is never defined in the
# notebook, so it failed with NameError on a fresh kernel. `resolved_doc` (the text
# from `doc._.coref_resolved`) is presumably the intended input: it is the only
# post-processed text produced above. NOTE(review): confirm this is the desired input.
# Only predictions scoring at or above `threshold` are returned.
entities = model.predict_entities(resolved_doc, labels, threshold=0.5)

# Display predicted entities and their labels
for entity in entities:
    print(entity["text"], "=>", entity["label"])

In [3]:
# Sample text for GLiNER entity prediction: a biography paragraph about Cristiano Ronaldo.
# This rebinds `text` (previously the John/Mary sample). Because of the triple-quoted
# layout, the value starts and ends with a newline character.
text = """
Cristiano Ronaldo dos Santos Aveiro (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaldu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for and captains both Saudi Pro League club Al Nassr and the Portugal national team. Widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards,[note 3] a record three UEFA Men's Player of the Year Awards, and four European Golden Shoes, the most by a European player. He has won 33 trophies in his career, including seven league titles, five UEFA Champions Leagues, the UEFA European Championship and the UEFA Nations League. Ronaldo holds the records for most appearances (183), goals (140) and assists (42) in the Champions League, goals in the European Championship (14), international goals (128) and international appearances (205). He is one of the few players to have made over 1,200 professional career appearances, the most by an outfield player, and has scored over 850 official senior career goals for club and country, making him the top goalscorer of all time.
"""

In [4]:
# Entity types to extract from the football biography sample.
# GLiNER models generally work best when type names are lower case or title case.
labels = [
    "Person",
    "Award",
    "Date",
    "Competitions",
    "Teams",
]

In [5]:
# Ask GLiNER for spans in `text` matching any of `labels`;
# 0.5 is the score threshold passed to the model.
entities = model.predict_entities(
    text,
    labels,
    threshold=0.5,
)

# One line per prediction: "<matched text> => <label>"
for entity in entities:
    print(entity["text"], "=>", entity["label"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Cristiano Ronaldo dos Santos Aveiro => Person
5 February 1985 => Date
Portugal national team => Teams
Ballon d'Or => Award
UEFA Men's Player of the Year Awards => Award
European Golden Shoes => Award
UEFA Champions Leagues => Competitions
UEFA European Championship => Competitions
UEFA Nations League => Competitions
European Championship => Competitions


In [9]:
# Two-sentence sample with a pronoun ("He") referring back to a named person.
# Written with explicit escapes so the leading/trailing newlines and the trailing
# space after the last sentence are visible.
text = (
    "\n"
    "David went to the concert. He said it was an amazing experience. \n"
)

In [10]:
# Entity types to extract from the short "David" sample.
# GLiNER models generally work best when type names are lower case or title case.
labels = [
    "Person",
    "Award",
    "Date",
    "Competitions",
    "Teams",
]

In [11]:
# Ask GLiNER for spans in `text` matching any of `labels`;
# 0.5 is the score threshold passed to the model.
# NOTE(review): the output saved under this cell does not correspond to the "David"
# sample assigned to `text` above; it looks like a leftover from a different input.
# Re-run the notebook top to bottom to refresh it.
entities = model.predict_entities(
    text,
    labels,
    threshold=0.5,
)

# One line per prediction: "<matched text> => <label>"
for entity in entities:
    print(entity["text"], "=>", entity["label"])



Wörl => Person
Wörl => Person
1934 => Date
1942 => Date
Wörl => Person
Wörl => Person
Wörl => Person
Wörl => Person
