In [1]:
pip install spacy



In [3]:
import spacy

# Load the English model for spaCy
nlp = spacy.load("en_core_web_sm")

def extract_events_and_time(text):
    # Process the text using spaCy
    doc = nlp(text)

    # Extract named entities
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    event_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ != "TIME"]
    time_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "TIME"]

    return event_entities, time_entities

# Sample text for testing
text = "The meeting is scheduled for tomorrow at 2pm. John and Mary are attending."

# Extract events and time expressions
events, time = extract_events_and_time(text)

print("Events:", events)
print("Time Expressions:", time)

Events: [('tomorrow', 'DATE'), ('John', 'PERSON'), ('Mary', 'PERSON')]
Time Expressions: [('2pm', 'TIME')]


In [4]:
import spacy
from spacy import displacy

# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

# Example usage
input_text = "Indian Space Research Organisation (ISRO) is the space agency of India. The organisation is involved in science, engineering and technology to harvest the benefits of outer space for India and the mankind. ISRO is a major constituent of the Department of Space (DOS), Government of India. The department executes the Indian Space Programme primarily through various Centres or units within ISRO. ISRO was previously the Indian National Committee for Space Research (INCOSPAR), set up by the Government of India in 1962, as envisioned by Dr. VikramA Sarabhai. ISRO was formed on August 15, 1969 and superseded INCOSPAR with an expanded role to harness space technology. DOS was set up and ISRO was brought under DOS in 1972."
text = nlp(input_text)
displacy.render(text, style="ent", jupyter=True)

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import json
import spacy

In [6]:
! pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [10]:
import jsonlines

def load_data(file_path):
    data = []
    with jsonlines.open(file_path) as reader:
        for obj in reader:
            data.append(obj)
    return data

# Load training, validation, and test data
train_data = load_data('/content/drive/MyDrive/Dataset/train.jsonl')
valid_data = load_data('/content/drive/MyDrive/Dataset/valid.jsonl')
test_data = load_data('/content/drive/MyDrive/Dataset/test.jsonl')

# Example: Print the first instance in the training data
print(train_data[0])




In [11]:
import json

# Assuming dataset is a list of dictionaries
for data in train_data:
    # Accessing elements within each dictionary
    print(data['id'])
    print(data['title'])
    for content in data['content']:
        print(content['sentence'])
        print(content['tokens'])
    for event in data['events']:
        print(event['id'])
        print(event['type'])
        for mention in event['mention']:
            print(mention['id'])
            print(mention['trigger_word'])
    for negative_trigger in data['negative_triggers']:
        print(negative_trigger['id'])
        print(negative_trigger['trigger_word'])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
b226e4064cbd1e513c5a612e77bd24fb
Getting
70609cefacbdf09c2c0663a3fe8f9cf4
made
094f2c388aec22cf43eaae719b0bb9d5
Motion_directional
05d6499add609ac3ea70c5892f3c0881
climbed
ca20fb0d516a9170946bbbbf9c1aac6e
Getting
9e85cf3d03c549a2d836b4918edc10f0
tied
c090f9f90b96df59a8b64bd7b8956092
Defending
fad3c5c249e9b6272ea2b7cf389bf0e6
defense
7b9b6b6b6275b6b77394b921e2a22d1d
Participation
2faf862b233bf20501d692aa20a63545
enter
925a7c77e4f5da3f801222f33816e84b
Self_motion
f3cd95133a1066ebb2eb78221cc94021
made
7d9e46519352e2c5d668584591d4443d
Social_event
1d847d2780bd781b9ff5f651fe434309
hosted
be23a4408ba47e4e09ddb7721ba2bc64
Social_event
4fd05744d3b60396d91e026464aff9cc
hosted
c6c4d8c0e8def34395209403362c960f
U.S.
32131719ff7aea298d9bceabeee53d24
Open
83c4ed43377346d6c8138d982ae3e63c
61st
216bb9b2cab831f78dfe8c5072a004d4
U.S.
688e1305536b187f21be83d6d6cd814f
Open
21a4d3aea7fd986101eb0ebb99e1efd5
June
42e963d421330fbba5ffabe3af20e67

In [12]:
print("Training data size:", len(train_data))
print("Example training instance:", train_data[0])

Training data size: 2913


In [13]:
import re

def preprocess_text(text):
    # Remove special characters and extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def preprocess_data(data):
    preprocessed_data = []
    for instance in data:
        content = instance['content']
        preprocessed_content = []
        for item in content:
            sentence = preprocess_text(item['sentence'])
            tokens = item['tokens']
            preprocessed_content.append({'sentence': sentence, 'tokens': tokens})
        instance['content'] = preprocessed_content
        preprocessed_data.append(instance)
    return preprocessed_data

# Preprocess training, validation, and test data
train_data = preprocess_data(train_data)
valid_data = preprocess_data(valid_data)
test_data = preprocess_data(test_data)

# Example: Print the preprocessed content of the first instance in the training data
print(train_data[0]['content'])




In [15]:
import spacy

# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

# Extract named entities from text
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [16]:
# Parse dependencies from text
def parse_dependencies(text):
    doc = nlp(text)
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
    return dependencies

In [17]:
# Rule-based extraction based on identified entities and dependencies

def load_triggers(dataset_file):
    triggers = []
    with jsonlines.open(dataset_file) as reader:
        for obj in reader:
            for event in obj['events']:
                for mention in event['mention']:
                    triggers.append(mention['trigger_word'])
    return triggers

# Extract event details from text, entities, and dependencies
def extract_event_details(text, entities, dependencies, triggers):
    event_details = {
        'location': None,
        'date': None,
        'time': None,
        'event_type': None
    }
    for trigger in triggers:
        if trigger in text:
            event_details['event_type'] = trigger
            break
    for entity, label in entities:
        if label == 'DATE':
            event_details['date'] = entity
        elif label == 'TIME':
            event_details['time'] = entity
        elif label == 'GPE':
            event_details['location'] = entity
        elif label == 'EVENT_TYPE':
            event_details['event_type'] = entity


    # Leverage Dependencies for Event Roles
    for token, dep, head_token in dependencies:
        if dep == 'nsubj' and head_token.lower() in ('held', 'conducted', 'organized', 'occurred', 'happened', 'took place'):
            event_details['event_type'] = head_token  # Extract potential event type
        elif dep == 'dobj' and head_token.lower() in ('held', 'conducted', 'organized', 'occurred', 'happened', 'took place'):
            event_details['event_type'] = head_token  # Extract potential event type
        elif dep == 'nmod' and head_token.lower() in ('held', 'conducted', 'organized', 'occurred', 'happened', 'took place'):
            if token.lower() in ('place', 'region', 'city'):
                event_details['location'] = head_token  # Extract potential location from prepositional phrases

    # Adjust event type based on context
    if event_details['event_type'] is None:
        for token, dep, head_token in dependencies:
            if dep == 'ROOT' and token.lower() in ('held', 'conducted', 'organized', 'occurred', 'happened', 'took place'):
                event_details['event_type'] = token  # Extract event type from root verb

    return event_details

In [18]:
def extract_event_information(input_text, triggers):
    # Preprocess the input text
    cleaned_text = preprocess_text(input_text)

    # Extract entities using NER
    entities = extract_entities(cleaned_text)

    # Parse dependencies using dependency parsing
    dependencies = parse_dependencies(cleaned_text)

    # Extract event details using rules based on entities, dependencies, and triggers
    event_details = extract_event_details(cleaned_text, entities, dependencies, triggers)

    return event_details

In [20]:
triggers = load_triggers('/content/drive/MyDrive/Dataset/train.jsonl')
text = "A strong earthquake struck California on February 29, 2024, at 10:00 AM."
event_details = extract_event_information(text, triggers)
print(event_details)

{'location': 'California', 'date': 'February 29 2024', 'time': '1000 AM', 'event_type': 'earthquake'}


In [21]:
# Example random input text
input_text2 = "A music festival took place in New York City on July 4, 2023, starting at 6 PM."
event_details2 = extract_event_information(input_text2, triggers)
print(event_details2)

{'location': 'New York City', 'date': 'July 4 2023', 'time': '6 PM', 'event_type': 'took place'}


In [22]:
input_text = "The annual conference on artificial intelligence and machine learning will be held in San Francisco from October 10th to October 12th, 2023."
event_details = extract_event_information(input_text, triggers)
print(event_details)

{'location': 'San Francisco', 'date': 'October 10th to October 12th 2023', 'time': None, 'event_type': 'ran'}


In [23]:
def calculate_accuracy(ground_truth, predictions):
    correct_predictions = 0
    total_instances = len(ground_truth)

    for i in range(total_instances):
        if ground_truth[i] == predictions[i]:
            correct_predictions += 1

    accuracy = (correct_predictions / total_instances) * 100
    return accuracy


In [24]:
def calculate_precision_recall_f1(ground_truth, predictions):
    true_positives = sum(1 for i in range(len(predictions)) if predictions[i] == 1 and ground_truth[i] == 1)
    false_positives = sum(1 for i in range(len(predictions)) if predictions[i] == 1 and ground_truth[i] == 0)
    false_negatives = sum(1 for i in range(len(predictions)) if predictions[i] == 0 and ground_truth[i] == 1)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

# Example ground truth and predictions (binary classification: 1 for event, 0 for non-event)
ground_truth = [1, 0, 1, 1, 0, 1, 0, 1, 0, 0]
predictions = [1, 0, 1, 0, 1, 1, 0, 1, 0, 1]

# Calculate precision, recall, and F1 score
precision, recall, f1_score = calculate_precision_recall_f1(ground_truth, predictions)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")


Precision: 0.67
Recall: 0.80
F1 Score: 0.73
