## Sample Output

### Event Extraction

In [17]:
import sys
# Show which Python interpreter this kernel runs; the download cell below
# invokes the same `sys.executable`.
print(sys.executable)

/Users/ivan/anaconda3/bin/python


In [None]:
# Download the `en_core_web_sm` spaCy model using the kernel's own interpreter
# (`sys.executable`) rather than whatever `python` is first on PATH.
!{sys.executable} -m spacy download en_core_web_sm

#### Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from datetime import datetime
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import cross_val_score
from joblib import dump
import spacy
import pickle

In [6]:
import pandas as pd
import spacy
import torch
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
import pickle

# Module-level objects shared by the functions defined below:
#   tokenizer  - WordPiece tokenizer for 'bert-base-uncased'
#   model      - BERT encoder, configured to return every hidden state
#   nlp        - spaCy pipeline used for sentence splitting and entities
#   classifier - estimator unpickled from 'bert_model.pkl'

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Load the trained model
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load 'bert_model.pkl' when it comes from a trusted source.
with open('bert_model.pkl', 'rb') as f:
    classifier = pickle.load(f)

# Define the functions: extract_event_sentences, calculate_embeddings, clean_text, tokens_to_sentence
def extract_event_sentences(text):
    """Return the sentences of `text` that mention an event, date or time.

    The text is parsed with the module-level spaCy pipeline `nlp`; a sentence
    is kept when at least one of its entities is labelled 'EVENT', 'DATE' or
    'TIME'.  Kept sentences are stripped and joined with single spaces.

    Args:
        text: Text to scan.  `None` and NaN (what pandas yields for a missing
            cell) are treated as empty; any other non-string value is coerced
            with `str()`, matching `calculate_embeddings`.

    Returns:
        str: The matching sentences joined by spaces, or '' when the input is
        missing, blank, or contains no matching sentence.
    """
    # `text != text` is true only for NaN, so this catches missing cells
    # without importing anything.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    if not text.strip():
        # Nothing to parse; skip the spaCy pipeline entirely.
        return ''

    event_labels = {'EVENT', 'DATE', 'TIME'}
    doc = nlp(text)
    event_sentences = [
        sent.text.strip()
        for sent in doc.sents
        if any(ent.label_ in event_labels for ent in sent.ents)
    ]
    return ' '.join(event_sentences)

# Function to calculate the embeddings
def calculate_embeddings(text, max_len=512, mask_padding=False):
    """Embed `text` as a single vector using the module-level BERT `model`.

    The text is wrapped in "[CLS]" / "[SEP]", tokenized with `tokenizer`,
    truncated or zero-padded to `max_len` ids and run through `model`.  For
    every position the last four hidden-state layers are summed, and those
    per-position sums are averaged into one vector.

    Args:
        text: Input text; coerced with `str()`.
        max_len: Sequence length fed to BERT (default 512, the previous
            hard-coded value).  It must not exceed the model's position limit
            (512 for 'bert-base-uncased').  Longer inputs are cut at the end,
            which also drops the trailing "[SEP]".
        mask_padding: When False (default) the historical behaviour is kept:
            every position, padding included, is attended to and averaged.
            When True, padding positions are hidden from attention and left
            out of the average.  Only enable this together with a classifier
            that was trained on embeddings produced the same way.

    Returns:
        numpy.ndarray: 1-D array holding the averaged embedding.
    """
    # Add the special tokens.
    marked_text = "[CLS] " + str(text) + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Number of real (non-padding) positions that survive truncation.
    n_real = min(len(indexed_tokens), max_len)

    # Truncate and pad the sequence to exactly `max_len` ids (pad value 0).
    indexed_tokens = pad_sequences([indexed_tokens], maxlen=max_len, dtype="long",
                                   value=0, truncating="post", padding="post")[0]

    # NOTE: the original code built an all-ones "segment ids" list and passed
    # it as the second positional argument.  In `transformers`, that slot of
    # BertModel.forward is `attention_mask`, not `token_type_ids`, so the list
    # acted as a mask that exposes every position (padding included).  The
    # keyword below makes that explicit; the default result is unchanged.
    if mask_padding:
        mask = [1] * n_real + [0] * (max_len - n_real)
    else:
        mask = [1] * len(indexed_tokens)

    # Convert inputs to PyTorch tensors, each with a leading batch dimension.
    tokens_tensor = torch.tensor([indexed_tokens])
    mask_tensor = torch.tensor([mask])

    # Run the text through BERT without tracking gradients.
    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=mask_tensor)

    # Because the model was loaded with `output_hidden_states = True`, the
    # third item of the output is the tuple of hidden states, one tensor per
    # layer.
    hidden_states = outputs[2]

    # Stack the per-layer tensors along a new leading "layer" dimension.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the batch (always of size 1 here).
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1: [layers, tokens, hidden] -> [tokens, layers, hidden].
    token_embeddings = token_embeddings.permute(1, 0, 2)

    # One vector per position: the sum of its last four layers.
    token_vecs_sum = [torch.sum(token[-4:], dim=0) for token in token_embeddings]

    if mask_padding:
        # Average only over the real tokens.
        token_vecs_sum = token_vecs_sum[:n_real]

    # Calculate the average embedding.
    sentence_embedding = torch.mean(torch.stack(token_vecs_sum), dim=0)

    return sentence_embedding.numpy()

def clean_text(text):
    """Merge WordPiece continuation tokens back into whole words.

    `text` is split on whitespace.  A token starting with '##' has its '##'
    markers removed and is glued onto the preceding token; every other token
    starts a new space-separated word.

    Args:
        text (str): Space-separated WordPiece tokens, e.g. "play ##ing ball".

    Returns:
        str: The re-assembled text, e.g. "playing ball".  The result has no
        leading space (the previous implementation always emitted one before
        the first word).
    """
    pieces = []
    for token in text.split():
        if token.startswith('##'):
            # Continuation piece: append directly to the previous word.
            pieces.append(token.replace('##', ''))
        else:
            # New word: separate it from whatever came before.
            pieces.append(' ' + token)

    # Drop the separator that was added in front of the very first word.
    return ''.join(pieces).lstrip()

def tokens_to_sentence(text):
    """Tokenize `text` with the module-level `tokenizer` and re-join the tokens.

    The text is wrapped in "[CLS]" / "[SEP]", tokenized, mapped to vocabulary
    ids and back to token strings; the two marker tokens are then dropped and
    the remaining tokens are joined with single spaces.

    Args:
        text: Input text; coerced with `str()`.

    Returns:
        str: Space-separated tokens without "[CLS]" and "[SEP]".
    """
    special_tokens = ('[CLS]', '[SEP]')

    # Tokenize the marked-up text, then round-trip through vocabulary ids.
    wordpieces = tokenizer.tokenize("[CLS] " + str(text) + " [SEP]")
    token_ids = tokenizer.convert_tokens_to_ids(wordpieces)

    kept = []
    for piece in tokenizer.convert_ids_to_tokens(token_ids):
        if piece in special_tokens:
            continue  # markers are not part of the sentence
        kept.append(piece)

    return ' '.join(kept)


# Now, you can use the model to predict the events from a new input text
def predict_events(input_text):
    """Run the full pipeline on one text: extract, embed, classify, clean.

    Args:
        input_text: Raw text to analyse.

    Returns:
        tuple: `(cleaned_text, prediction)` where `cleaned_text` is the
        re-assembled token string of the extracted event sentences and
        `prediction` is whatever `classifier.predict` returns for the single
        embedding passed to it.
    """
    # Keep only the sentences that mention an event, date or time.
    event_text = extract_event_sentences(input_text)

    # Feature vector for the classifier.
    features = calculate_embeddings(event_text)

    # Tokenized form of the same sentences, still containing '##' pieces.
    token_string = tokens_to_sentence(event_text)

    # The classifier expects a batch, hence the one-element list.
    label = classifier.predict([features])

    return clean_text(token_string), label

# Load the data
# (alternative input kept for reference: pd.read_excel('demo_data.xlsx'))
df = pd.read_csv('new_demo.csv')

# Run the pipeline once per row of the 'news_text' column.  The
# (cleaned_text, prediction) pairs are collected in a list first: unpacking
# `zip(*...)` directly raises ValueError when the input has no rows.
results = [predict_events(news_text) for news_text in df['news_text']]
df['extracted_events'] = [cleaned for cleaned, _ in results]
df['predictions'] = [prediction for _, prediction in results]

# Write the DataFrame
df.to_excel('final_output.xlsx', index=False)