In [12]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import spacy
from dateutil import parser

text = "I have a meeting scheduled on June 15, 2023. " \
       "The conference will take place from October 5th to October 8th" \
       ", 2022. Our project deadline is set for September 30, 2022. " \
       "Yesterday's date was June 11, 2023. Next week, " \
       "we plan to have a team gathering on Friday, June 16th, 2023."

# Tokenize text into sentences
sentences = sent_tokenize(text)

# Tokenize sentences into words
words = [word_tokenize(sentence) for sentence in sentences]

# Remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
filtered_words = [[word for word in sentence if word.lower() not in stopwords] for sentence in words]

# Load spaCy model and process text
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Extract named entities
entities = [(entity.text, entity.label_) for entity in doc.ents]
print(f"Named Entities Extracted are: {entities}")

relations = []
for token in doc:
    if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
        subject = token.text
        verb = token.head.text
        relation = (subject, verb)
        relations.append(relation)
print(f"Relations Extracted are : {relations}")

dates = []
for word in text.split():
    try:
        date = parser.parse(word, fuzzy=True)
        dates.append(date.strftime("%B %d, %Y"))
    except ValueError:
        pass

print(f"Dates Extracted are : {dates}")


Named Entities Extracted are: [('June 15, 2023', 'DATE'), ('October 5th to October 8th, 2022', 'DATE'), ('September 30, 2022', 'DATE'), ('Yesterday', 'DATE'), ('June 11, 2023', 'DATE'), ('Next week', 'DATE'), ('Friday, June 16th, 2023', 'DATE')]
Relations Extracted are : [('I', 'have'), ('conference', 'take'), ('we', 'plan')]
Dates Extracted are : ['June 12, 2023', 'June 15, 2023', 'June 12, 2023', 'October 12, 2023', 'June 05, 2023', 'October 12, 2023', 'June 08, 2023', 'June 12, 2022', 'September 12, 2023', 'June 30, 2023', 'June 12, 2022', 'June 12, 2023', 'June 11, 2023', 'June 12, 2023', 'June 16, 2023', 'June 12, 2023', 'June 16, 2023', 'June 12, 2023']


In [1]:
#Document level relation extraction
#Entity Recognition:
import spacy

# Load SpaCy's English model
nlp = spacy.load('en_core_web_sm')

# Process the document
doc = nlp("Apple Inc. is headquartered in Cupertino, California.")

# Extract named entities
entities = [(entity.text, entity.label_) for entity in doc.ents]

print(entities)


[('Apple Inc.', 'ORG'), ('Cupertino', 'GPE'), ('California', 'GPE')]


In [2]:
#sentence boundaries
from nltk import sent_tokenize

# Sample document
document = "This is the first sentence. This is the second sentence."

# Tokenize the document into sentences
sentences = sent_tokenize(document)

print(sentences)



['This is the first sentence.', 'This is the second sentence.']


In [2]:
#coreference Resolution
import spacy
from spacy import displacy
import neuralcoref

# Load the spaCy English language model
nlp = spacy.load('en_core_web_sm')

# Add neuralcoref to the pipeline
neuralcoref.add_to_pipe(nlp)

# Now you should be able to use coreference resolution
doc = nlp("John is a software engineer. He works at a tech company.")
print(doc._.coref_clusters)  # Access the coreference clusters

resolved_text = []
for sent in doc.sents:
    sent_resolved = []
    for token in sent:
        if token._.coref_clusters:
            main_mention = token._.coref_clusters[0].main.text
            sent_resolved.append(main_mention)
        else:
            sent_resolved.append(token.text)
    resolved_text.append(" ".join(sent_resolved))

print("\n".join(resolved_text))



AttributeError: [E046] Can't retrieve unregistered extension attribute 'coref_clusters'. Did you forget to call the `set_extension` method?

In [25]:
#Relation Extraction
import spacy
from spacy.matcher import Matcher

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')

# Example sentence
sentence = "Apple Inc., headquartered in Cupertino, California, " \
    "was founded by Steve Jobs and Steve Wozniak.Jone is employed there! HE likes to eat pizza."

# Process the sentence
doc = nlp(sentence)

# Define the relation patterns using SpaCy's rule-based Matcher
matcher = Matcher(nlp.vocab)

# Define the patterns for the relation "headquartered in"
patterns = [
    [{'LOWER': 'headquartered'}, {'LOWER': 'in'}],
    [{'LOWER': 'founded'}, {'POS': 'ADP'}, {'ENT_TYPE': 'PERSON'}]
]
# Add the pattern to the matcher
matcher.add('Headquartered', patterns ,on_match= None)

# Find matches in the document
matches = matcher(doc)

# Extract the entities and relation
entities = []
relation = None

for match_id, start, end in matches:
    if relation is None:
        relation = doc[start:end].text
    for ent in doc[start:end].ents:
        entities.append(ent.text)

# Print the entities and relation
print("Entities:", entities)
print("Relation:", relation)


Entities: []
Relation: headquartered in


RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [23]:
#Relation Extraction
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')

# Example training data
data = [
    ('Barack Obama is the President of the United States.', 'Barack Obama', 'President'),
    ('Steve Jobs co-founded Apple Inc.', 'Steve Jobs', 'co-founded'),
    ('The Eiffel Tower is located in Paris.', 'Eiffel Tower', 'located in'),
    ('The book was written by J.K. Rowling.', 'book', 'written by'),
    # Add more training examples with different relations and entities
]

# Extract features from training data using SpaCy
def extract_features(sentence):
    doc = nlp(sentence)
    features = []
    for token in doc:
        features.append(token.text)
        features.append(token.pos_)
        features.append(token.dep_)
    return ' '.join(features)

# Prepare training data
sentences = []
entities = []
labels = []
for sentence, entity, label in data:
    sentences.append(extract_features(sentence))
    entities.append(entity)
    labels.append(label)

# Vectorize training data using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences)

# Train a classifier (e.g., Linear SVM)
classifier = LinearSVC()
classifier.fit(X, labels)

# Example test sentence
test_sentence = "Apple Inc. was founded by Steve Jobs."

# Extract features from test sentence
test_features = extract_features(test_sentence)

# Vectorize test data
test_data = vectorizer.transform([test_features])

# Predict the relation class
predicted_label = classifier.predict(test_data)

# Print the predicted relation class
print(f"Test Sentance: {test_sentence}")
print(f"Predicted lable data : {predicted_label}")
print("Predicted Relation:", predicted_label[0])

Test Sentance: Apple Inc. was founded by Steve Jobs.
Predicted lable data : ['co-founded']
Predicted Relation: co-founded
