In [1]:
from collections import defaultdict
import warnings

In [2]:
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.utils.generic")
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.modeling_utils")

In [3]:
import spacy
import coreferee
import torch

In [11]:
import transformers
import datasets
import rebel_spacy
from config import load_config

In [12]:
# Load project settings through the local `config` module; `config` supplies the model names used below.
# NOTE(review): `secrets` is not referenced in the cells visible here.
config, secrets = load_config()

## Declare the text and the model

In [13]:
text = "Hello! This is a sample text. It contains multiple sentences. How many will spaCy find?"

In [14]:
# Choose the device once: `spacy_device` is the integer id later handed to the
# rebel component (0 with CUDA, -1 without), `torch_device` the matching torch.device.
spacy_device, torch_device = (
    (0, torch.device("cuda"))
    if torch.cuda.is_available()
    else (-1, torch.device("cpu"))
)
print("Using CUDA" if spacy_device == 0 else "Using CPU")


Using CUDA


In [15]:
# Load the spaCy pipeline named in the config, then register the "rebel" component
# right after the "senter" component, passing it the device id chosen earlier and the REBEL model name.
# NOTE(review): presumably the configured pipeline ships a "senter" component — confirm.
nlp = spacy.load(config.spacy.model_name)
nlp.add_pipe("rebel", after="senter", config={"device": spacy_device, "model_name": config.rebel.model_name})

<rebel_spacy.RebelComponent at 0x736554b4acd0>

In [16]:
doc = nlp(text)

In [17]:
[chunk.text for chunk in doc.noun_chunks]

['This', 'a sample text', 'It', 'multiple sentences']

In [22]:
# Load the tokenizer for the transformer model named in the config.
tokenizer = transformers.AutoTokenizer.from_pretrained(config.transformer.model_name, legacy=False)
# Reuse the EOS token as the padding token.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [23]:
# Load the base transformer with float16 weights and let `device_map='auto'` decide weight placement.
model = transformers.AutoModel.from_pretrained(
    config.transformer.model_name,
    torch_dtype=torch.float16,
    device_map='auto'
)
# Point the model config's pad token id at EOS, mirroring the tokenizer's pad token.
model.config.pad_token_id = model.config.eos_token_id

Some parameters are on the meta device because they were offloaded to the cpu.


In [24]:
# Encode the sample text as PyTorch tensors ("pt"), truncating to at most 512 tokens.
input_encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

In [25]:
input_encoded

{'input_ids': tensor([[    1,  8479, 29578,   660,   325,   260,  5505,  1880, 29520,   596,
          4824,  3567, 17501, 29520,  1058,   931,   477, 15877, 21391,   977,
         29584]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [26]:
embed_layer = model.get_input_embeddings()

In [27]:
# Rebuild the encoding as a plain dict whose tensors live on the selected device.
input_encoded = {
    field: tensor.to(torch_device)
    for field, tensor in input_encoded.items()
}

In [28]:
# Look up the input embeddings for the token ids; no gradients are tracked for this lookup.
with torch.no_grad():
    embeddings = embed_layer(input_encoded["input_ids"])

In [29]:
input_encoded["input_ids"].shape

torch.Size([1, 21])

In [30]:
len(embeddings[0, 0, :])

3200

In [31]:
# One list of spaCy tokens per detected sentence.
tokenized_text = [list(sentence) for sentence in doc.sents]

In [32]:
# Print every token next to its Python type, walking the sentences in order.
for sent in tokenized_text:
    for word in sent:
        print(f"'{word}' -  {type(word)}")

'Hello' -  <class 'spacy.tokens.token.Token'>
'!' -  <class 'spacy.tokens.token.Token'>
'This' -  <class 'spacy.tokens.token.Token'>
'is' -  <class 'spacy.tokens.token.Token'>
'a' -  <class 'spacy.tokens.token.Token'>
'sample' -  <class 'spacy.tokens.token.Token'>
'text' -  <class 'spacy.tokens.token.Token'>
'.' -  <class 'spacy.tokens.token.Token'>
'It' -  <class 'spacy.tokens.token.Token'>
'contains' -  <class 'spacy.tokens.token.Token'>
'multiple' -  <class 'spacy.tokens.token.Token'>
'sentences' -  <class 'spacy.tokens.token.Token'>
'.' -  <class 'spacy.tokens.token.Token'>
'How' -  <class 'spacy.tokens.token.Token'>
'many' -  <class 'spacy.tokens.token.Token'>
'will' -  <class 'spacy.tokens.token.Token'>
'spaCy' -  <class 'spacy.tokens.token.Token'>
'find' -  <class 'spacy.tokens.token.Token'>
'?' -  <class 'spacy.tokens.token.Token'>


In [42]:
#[token for sent in doc.sents for token in sent]

In [41]:
# Display the tokens grouped by sentence.
[list(sent) for sent in doc.sents]

[[Hello, !],
 [This, is, a, sample, text, .],
 [It, contains, multiple, sentences, .],
 [How, many, will, spaCy, find, ?]]

In [43]:
# Pair each token's text with its part-of-speech tag (`pos_`) and print the list.
pos_tags = [(token.text, token.pos_) for token in doc]
print(pos_tags)

[('Hello', 'INTJ'), ('!', 'PUNCT'), ('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sample', 'NOUN'), ('text', 'NOUN'), ('.', 'PUNCT'), ('It', 'PRON'), ('contains', 'VERB'), ('multiple', 'ADJ'), ('sentences', 'NOUN'), ('.', 'PUNCT'), ('How', 'SCONJ'), ('many', 'ADJ'), ('will', 'AUX'), ('spaCy', 'VERB'), ('find', 'VERB'), ('?', 'PUNCT')]


In [44]:
# (governor, relation, dependent) triple for every token that is not a sentence root
relationships = [
    (tok.head.text, tok.dep_, tok.text)
    for tok in doc
    if tok.dep_ != "ROOT"
]

print(relationships)


def analyze_semantic_relationships(text, nlp=None):
    """Extract dependency relations and POS-grouped words from ``text``.

    Args:
        text: Raw text to parse.
        nlp: Optional callable that turns ``text`` into an iterable of
            spaCy-style tokens (typically an already loaded spaCy pipeline).
            When omitted, ``en_core_web_sm`` is loaded, as before; pass a
            loaded pipeline to avoid re-reading the model on every call.

    Returns:
        A ``(nodes, relationships)`` tuple. ``nodes`` is a ``defaultdict``
        mapping a part-of-speech tag to the set of token texts seen with that
        tag; a root token only appears there when it governs another token.
        ``relationships`` is a list of
        ``(governor_text, dependency_label, dependent_text)`` tuples in
        document order.
    """
    if nlp is None:
        # Default kept for backward compatibility with existing callers
        nlp = spacy.load("en_core_web_sm")

    nodes = defaultdict(set)
    relationships = []

    for token in nlp(text):
        # Root tokens have no governor, so they contribute no relation
        if token.dep_ == "ROOT":
            continue

        head = token.head
        nodes[head.pos_].add(head.text)
        nodes[token.pos_].add(token.text)
        relationships.append((head.text, token.dep_, token.text))

    return nodes, relationships



# Named Entity Recognition
def analyze_named_entities(text, nlp=None):
    """Print one Cypher ``CREATE`` statement per named entity found in ``text``.

    Args:
        text: Raw text to analyse.
        nlp: Optional callable that turns ``text`` into an object exposing
            ``.ents`` (typically an already loaded spaCy pipeline). When
            omitted, ``en_core_web_sm`` is loaded, as before; pass a loaded
            pipeline to avoid re-reading the model on every call.

    Returns:
        None. The header line and the statements are written to stdout.
    """
    if nlp is None:
        # Default kept for backward compatibility with existing callers
        nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    print("\nNamed Entities:")
    for ent in doc.ents:
        # Escape backslashes first, then single quotes, so entity text such as
        # "O'Brien" cannot terminate the quoted Cypher string literal early.
        name = ent.text.replace("\\", "\\\\").replace("'", "\\'")
        print(f"CREATE (:{ent.label_} {{name: '{name}'}})")

# Example usage for Named Entity Recognition
analyze_named_entities(text)

[('Hello', 'punct', '!'), ('is', 'nsubj', 'This'), ('text', 'det', 'a'), ('text', 'compound', 'sample'), ('is', 'attr', 'text'), ('is', 'punct', '.'), ('contains', 'nsubj', 'It'), ('sentences', 'amod', 'multiple'), ('contains', 'dobj', 'sentences'), ('contains', 'punct', '.'), ('many', 'advmod', 'How'), ('find', 'nsubj', 'many'), ('find', 'aux', 'will'), ('find', 'nsubj', 'spaCy'), ('find', 'punct', '?')]

Named Entities:
