## Event Extraction - Solution 4: Doc2Vec

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
import re

# Normalise a sentence so that predictions and gold truths can be compared
def preprocess_and_normalize_sentence(sentence):
    """Return a comparison-friendly version of *sentence*.

    Strings have a detached possessive re-attached ("john 's" -> "john's")
    and are then lower-cased. Any non-string value (for example a NaN read
    from a CSV cell) is returned as ``str(sentence)`` with no further
    processing.
    """
    # Non-strings are only stringified; they are deliberately not lower-cased.
    if not isinstance(sentence, str):
        return str(sentence)

    # Glue "<word> 's" back together; exactly one whitespace character is
    # allowed between the word and the possessive.
    reattached = re.sub(r"\b(\w+)\s's\b", r"\1's", sentence)
    return reattached.lower()

# Load the cleaned news data and keep only the first 101 rows
df = pd.read_csv('news_cleaned_no_spaces.csv', encoding='latin1')
df = df[:101]

# The Doc2Vec corpus is the raw news text; df['gold_truth'] is only used
# further down as the evaluation reference.
event_sentences = df['news_text'].tolist()

# Split the data into training and testing sets
train_sentences, test_sentences = train_test_split(event_sentences, test_size=0.2, random_state=42)

# Tagged documents for the training split (tags are positions in train_sentences).
# NOTE(review): the model below is fitted on `documents` (the full corpus), so
# neither this list nor the train/test split influences the reported metrics.
train_documents = [
    TaggedDocument(words=word_tokenize(str(doc).lower()), tags=[i])
    for i, doc in enumerate(train_sentences)
    if pd.notna(doc)
]

# Tagged documents for the full corpus. Each tag is the row's position in
# event_sentences, so a tag returned by most_similar() can be used directly
# as an index into event_sentences.
documents = [
    TaggedDocument(words=word_tokenize(str(doc).lower()), tags=[i])
    for i, doc in enumerate(event_sentences)
    if pd.notna(doc)
]

# Train a Doc2Vec model
model = Doc2Vec(documents, vector_size=100, window=5, min_count=2, workers=4, epochs=20, dm=1)

# Create a DataFrame that's a copy of the original
predicted_df = df.copy()

# Add an empty 'output' column. It is created with object dtype because it
# will hold text; writing strings into a float64 (NaN-initialised) column is
# an incompatible-dtype assignment that recent pandas versions deprecate.
predicted_df['output'] = pd.Series(np.nan, index=predicted_df.index, dtype=object)

# Texts for which a prediction is made (same column the model was fitted on)
news_text = df['news_text'].tolist()

# Prepare a list to store the cosine similarities
similarities = []

# Initialize counters
TP = 0
FP = 0

# Reference event sentences, row-aligned with news_text
golden_truth = df['gold_truth'].tolist()

# Preprocess and normalize the golden truth (kept row-aligned with df)
normalized_golden_truth = [preprocess_and_normalize_sentence(sentence) for sentence in golden_truth]

# Gold truths that may be used for substring matching. Missing values are
# excluded: the normaliser turns NaN into the literal string "nan", which is
# a substring of ordinary words and would make almost any output a match.
# Blank strings are excluded too, since "" is a substring of everything.
match_candidates = [
    normalized
    for raw, normalized in zip(golden_truth, normalized_golden_truth)
    if pd.notna(raw) and normalized.strip()
]

# Iterate over the news texts
for idx, text in enumerate(news_text):
    # Skip rows whose text is missing
    if pd.isna(text):
        continue

    # Infer a vector for the text
    vector = model.infer_vector(word_tokenize(str(text).lower()))

    # Find the single most similar training document
    similar_sentences = model.dv.most_similar([vector], topn=1)
    predicted_text = event_sentences[similar_sentences[0][0]]

    # Store the most similar document (the prediction) in the 'output' column.
    # idx is a position; it matches the label because df keeps its default
    # 0..n-1 index after the slice above.
    predicted_df.loc[idx, 'output'] = predicted_text

    # Cosine similarity between the vector inferred for the input text and a
    # vector freshly inferred for the predicted document. str() mirrors how
    # the input text is tokenised above.
    predicted_vector = model.infer_vector(word_tokenize(str(predicted_text).lower()))
    similarity = cosine_similarity([vector], [predicted_vector])

    # Add the cosine similarity to the list
    similarities.append(similarity[0][0])

    # Preprocess and normalize the predicted text
    normalized_output = preprocess_and_normalize_sentence(predicted_text)

    # A prediction counts as a hit when any usable gold truth occurs in it.
    # NOTE(review): this matches against every row's gold truth, not only the
    # gold truth of row idx - confirm that this is the intended criterion.
    if any(normalized_truth in normalized_output for normalized_truth in match_candidates):
        TP += 1
    else:
        FP += 1

# NOTE(review): FN is derived from the total row count, so rows without a gold
# truth still count towards it - confirm this is intended.
FN = len(normalized_golden_truth) - TP

# Calculate precision, recall, and F1 score
precision = TP / (TP + FP) if TP + FP > 0 else 0
recall = TP / (TP + FN) if TP + FN > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Guarded like the ratios above: no predictions means no similarities
average_similarity = sum(similarities) / len(similarities) if similarities else 0

print(f'Average Cosine Similarity: {average_similarity}')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save to CSV
predicted_df.to_csv(f'predicted_sentences_doc2vec{timestamp}.csv', index=False)

Precision: 0.6039603960396039
Recall: 0.6039603960396039
F1 Score: 0.6039603960396039
Average Cosine Similarity: 0.9954271045061621
