## Event Extraction - Solution 3: BERT, spaCy, SVM

### Model Training

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from datetime import datetime
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import cross_val_score
from joblib import dump
import spacy
import pickle
# spaCy pipeline, used below for sentence splitting and named-entity labels.
nlp = spacy.load('en_core_web_sm')

# WordPiece tokenizer (vocabulary) that matches the BERT checkpoint below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pre-trained BERT weights. output_hidden_states=True makes the forward pass
# return the hidden states of all layers, which calculate_embeddings reads.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Put the model in "evaluation" mode: it is only used for feed-forward inference.
model.eval()

def extract_event_sentences(text):
    """Return the sentences of ``text`` that mention an event, date or time.

    ``text`` is parsed with the module-level spaCy pipeline ``nlp``. A sentence
    is kept when at least one of its named entities is labelled ``EVENT``,
    ``DATE`` or ``TIME``. Kept sentences are stripped and joined with a single
    space; the result is an empty string when no sentence qualifies.
    """
    wanted_labels = ('EVENT', 'DATE', 'TIME')
    kept = []
    for sentence in nlp(text).sents:
        for entity in sentence.ents:
            if entity.label_ in wanted_labels:
                kept.append(sentence.text.strip())
                break  # one matching entity is enough for this sentence
    return ' '.join(kept)

# Function to calculate the embeddings
def calculate_embeddings(text):
    """Embed ``text`` as one fixed-size vector using the module-level BERT ``model``.

    The text is wrapped in ``[CLS]``/``[SEP]``, WordPiece-tokenized, truncated or
    zero-padded to ``MAX_LEN`` ids and run through BERT. For every position the
    hidden states of the last four layers are summed, and those sums are then
    averaged over all ``MAX_LEN`` positions (padding positions included).

    Args:
        text: Input text; it is converted with ``str()`` before tokenizing.

    Returns:
        numpy.ndarray: 1-D vector with one value per hidden unit
        (768 for ``bert-base-uncased``).
    """
    # Fixed sequence length fed to BERT.
    MAX_LEN = 512

    # Add the special tokens, split into word pieces and map them to vocabulary ids.
    marked_text = "[CLS] " + str(text) + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # pad_sequences returns a 2-D numpy array; [0] selects its single row.
    # NOTE(review): truncating="post" drops the trailing [SEP] for texts longer
    # than MAX_LEN tokens.
    indexed_tokens = pad_sequences([indexed_tokens], maxlen=MAX_LEN, dtype="long",
                                   value=0, truncating="post", padding="post")[0]

    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(indexed_tokens)

    # Convert the numpy row directly and add the batch dimension on the tensor
    # side. Wrapping the array in a Python list (torch.tensor([array])) is the
    # line the warning in the recorded notebook output points at.
    tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
    segments_tensors = torch.tensor([segments_ids])

    # NOTE(review): in current `transformers` releases the second positional
    # parameter of BertModel.forward appears to be `attention_mask`, not
    # `token_type_ids`, so this all-ones tensor would make the padding positions
    # fully attended. Left unchanged on purpose: changing it alters the features
    # an already-trained classifier expects. Confirm and retrain before fixing.
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

    # Because the model was loaded with output_hidden_states=True, the third
    # item of the output holds the hidden states from all layers.
    hidden_states = outputs[2]

    # Sum the last four layers position by position -> [1, MAX_LEN, hidden],
    # drop the batch dimension, then average over positions -> [hidden].
    # Same arithmetic as looping over tokens, up to floating-point rounding.
    last_four_sum = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)
    sentence_embedding = last_four_sum.squeeze(0).mean(dim=0)

    return sentence_embedding.numpy()

def clean_text(text):
    """Merge WordPiece sub-tokens back into whole, space-separated words.

    ``text`` is split on whitespace. A token that starts with ``##`` is treated
    as the continuation of the previous word and glued to it (its ``##`` markers
    are removed); every other token starts a new word.

    Args:
        text (str): Space-separated WordPiece tokens, e.g. ``"em ##bed ##ding"``.

    Returns:
        str: The re-assembled text without a leading space; ``""`` for empty or
        whitespace-only input.
    """
    pieces = []
    for token in text.split():
        if token.startswith('##'):
            pieces.append(token.replace('##', ''))
        else:
            pieces.append(' ' + token)

    # Every word-initial token was prefixed with a space, so the joined string
    # would otherwise begin with a stray blank.
    return ''.join(pieces).lstrip()

def tokens_to_sentence(text):
    """Return ``text`` as a space-separated string of BERT WordPiece tokens.

    The text is wrapped in ``[CLS]``/``[SEP]``, tokenized with the module-level
    ``tokenizer``, mapped to vocabulary ids and back to tokens, and the
    ``[CLS]``/``[SEP]`` tokens are then removed. Sub-word continuations keep
    their ``##`` prefix; ``clean_text`` can be used to merge them afterwards.
    """
    special_tokens = ('[CLS]', '[SEP]')

    word_pieces = tokenizer.tokenize("[CLS] " + str(text) + " [SEP]")

    # Round-trip through the vocabulary ids and back to token strings.
    token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    recovered = tokenizer.convert_ids_to_tokens(token_ids)

    return ' '.join(piece for piece in recovered if piece not in special_tokens)
# Load your data
df = pd.read_csv('news_cleaned_no_spaces.csv', encoding='latin1')
df['news_text'] = df['news_text'].astype(str)

# Keep only the sentences that mention an event, date or time
df['output_text'] = df['news_text'].apply(extract_event_sentences)

# Calculate one embedding vector per article from its event sentences
df['output'] = df['output_text'].apply(calculate_embeddings)

# Split the data into training and testing sets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    df['output'].tolist(), df['gold_truth'].tolist(), test_size=0.2, random_state=30)

# ------- TRAIN CLASSIFIER ------------
# Convert list of arrays into a 2D array
train_sentences_array = np.vstack(train_sentences)
test_sentences_array = np.vstack(test_sentences)

# Train a Support Vector Machine model
classifier = SVC(random_state=42)
classifier.fit(train_sentences_array, train_labels)

# Make predictions on the test set
test_predictions = classifier.predict(test_sentences_array)

# Calculate the accuracy
accuracy = accuracy_score(test_labels, test_predictions)
print(f'Accuracy: {accuracy}')

# zero_division=0 scores a class that is never predicted as 0, which is what the
# default does too, but without emitting the undefined-metric warning.
# NOTE(review): in the recorded run precision equals accuracy squared, which is
# what a classifier that always predicts the majority class produces. Check the
# confusion matrix before trusting these numbers.
precision = precision_score(test_labels, test_predictions, average='weighted', zero_division=0)
print(f'Precision: {precision}')

# Calculate the recall
recall = recall_score(test_labels, test_predictions, average='weighted', zero_division=0)
print(f'Recall: {recall}')

# Calculate the F1 score
f1 = f1_score(test_labels, test_predictions, average='weighted', zero_division=0)
print(f'F1 Score: {f1}')

# 5-fold cross-validation on the training split only
scores = cross_val_score(classifier, train_sentences_array, train_labels, cv=5)
print("Cross-Validation Scores: ", scores)
print("Average Score: ", scores.mean())

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Re-tokenize the extracted event text into space-separated WordPiece tokens
df['output_sentence'] = df['output_text'].apply(tokens_to_sentence)

# Merge the '##' sub-word pieces back into whole words
df['cleaned_text'] = df['output_sentence'].apply(clean_text)

# Save to CSV
# NOTE(review): the 'output' column holds numpy arrays, which to_csv writes as
# their string representation. Consider dropping it if the CSV is read back.
df.to_csv(f'predicted_sentences_bert{timestamp}.csv', index=False)

# Save the model
dump(classifier, 'bert_model.joblib')

  tokens_tensor = torch.tensor([indexed_tokens])


Accuracy: 0.9953900709219858


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.990801393290076
Recall: 0.9953900709219858
F1 Score: 0.9930904315187539




Cross-Validation Scores:  [0.99667553 0.9964539  0.99667479 0.99667479 0.99667479]
Average Score:  0.9966307634922357


['bert_model.joblib']

In [17]:
import sys
print(sys.executable)

/Users/ivan/anaconda3/bin/python


In [None]:
!{sys.executable} -m spacy download en_core_web_sm

#### Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from datetime import datetime
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import cross_val_score
from joblib import dump
import spacy
import pickle

In [4]:
# Load the model
# The training cell persists the classifier with joblib.dump(...) to
# 'bert_model.joblib', so it is read back with joblib.load from that same file.
# (Nothing in this notebook writes 'bert_model.pkl'.)
# Like pickle, joblib.load can execute arbitrary code: only load files you trust.
from joblib import load

classifier = load('bert_model.joblib')


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [10]:
# Create text box for input
import ipywidgets as widgets

input_text = widgets.Textarea(
    value='',
    placeholder='Enter your text here:',
    description='Input:',
    disabled=False,  # the trait is `disabled`; `disable` is not a Textarea trait
)

# Display the text box
display(input_text)

Textarea(value='', description='Input:', placeholder='Enter your text here:')

In [None]:
import pandas as pd
import spacy
import torch
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
import pickle

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights). output_hidden_states=True makes the model
# return the hidden states of all layers, which calculate_embeddings reads.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Load the trained model
# The training cell persists the classifier with joblib.dump(...) to
# 'bert_model.joblib', so it is read back with joblib.load from that same file.
# (Nothing in this notebook writes 'bert_model.pkl'.)
# Like pickle, joblib.load can execute arbitrary code: only load files you trust.
from joblib import load

classifier = load('bert_model.joblib')

# Define the functions: extract_event_sentences, calculate_embeddings, clean_text, tokens_to_sentence
def extract_event_sentences(text):
    """Return the sentences of ``text`` that contain an EVENT, DATE or TIME entity.

    Sentences and entities come from the module-level spaCy pipeline ``nlp``.
    Matching sentences are stripped and joined with one space, in document
    order; an empty string is returned when none match.
    """
    wanted_labels = {'EVENT', 'DATE', 'TIME'}

    def mentions_event(sentence):
        # True as soon as one entity of the sentence carries a wanted label.
        return not wanted_labels.isdisjoint(entity.label_ for entity in sentence.ents)

    matching = filter(mentions_event, nlp(text).sents)
    return ' '.join(sentence.text.strip() for sentence in matching)

# Function to calculate the embeddings
def calculate_embeddings(text):
    """Embed ``text`` as one fixed-size vector using the module-level BERT ``model``.

    The text is wrapped in ``[CLS]``/``[SEP]``, WordPiece-tokenized, truncated or
    zero-padded to ``MAX_LEN`` ids and run through BERT. For every position the
    hidden states of the last four layers are summed, and those sums are then
    averaged over all ``MAX_LEN`` positions (padding positions included).

    Args:
        text: Input text; it is converted with ``str()`` before tokenizing.

    Returns:
        numpy.ndarray: 1-D vector with one value per hidden unit
        (768 for ``bert-base-uncased``).
    """
    # Fixed sequence length fed to BERT.
    MAX_LEN = 512

    # Add the special tokens, split into word pieces and map them to vocabulary ids.
    marked_text = "[CLS] " + str(text) + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # pad_sequences returns a 2-D numpy array; [0] selects its single row.
    # NOTE(review): truncating="post" drops the trailing [SEP] for texts longer
    # than MAX_LEN tokens.
    indexed_tokens = pad_sequences([indexed_tokens], maxlen=MAX_LEN, dtype="long",
                                   value=0, truncating="post", padding="post")[0]

    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(indexed_tokens)

    # Convert the numpy row directly and add the batch dimension on the tensor
    # side, instead of wrapping the array in a Python list first
    # (torch.tensor([array])), which PyTorch handles element by element.
    tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
    segments_tensors = torch.tensor([segments_ids])

    # NOTE(review): in current `transformers` releases the second positional
    # parameter of BertModel.forward appears to be `attention_mask`, not
    # `token_type_ids`, so this all-ones tensor would make the padding positions
    # fully attended. Left unchanged on purpose: changing it alters the features
    # the already-trained classifier expects. Confirm and retrain before fixing.
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

    # Because the model was loaded with output_hidden_states=True, the third
    # item of the output holds the hidden states from all layers.
    hidden_states = outputs[2]

    # Sum the last four layers position by position -> [1, MAX_LEN, hidden],
    # drop the batch dimension, then average over positions -> [hidden].
    # Same arithmetic as looping over tokens, up to floating-point rounding.
    last_four_sum = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)
    sentence_embedding = last_four_sum.squeeze(0).mean(dim=0)

    return sentence_embedding.numpy()

def clean_text(text):
    """Merge WordPiece sub-tokens back into whole, space-separated words.

    ``text`` is split on whitespace. A token that starts with ``##`` is treated
    as the continuation of the previous word and glued to it (its ``##`` markers
    are removed); every other token starts a new word.

    Args:
        text (str): Space-separated WordPiece tokens, e.g. ``"em ##bed ##ding"``.

    Returns:
        str: The re-assembled text without a leading space; ``""`` for empty or
        whitespace-only input.
    """
    pieces = []
    for token in text.split():
        if token.startswith('##'):
            pieces.append(token.replace('##', ''))
        else:
            pieces.append(' ' + token)

    # Every word-initial token was prefixed with a space, so the joined string
    # would otherwise begin with a stray blank.
    return ''.join(pieces).lstrip()

def tokens_to_sentence(text):
    """Tokenize ``text`` with BERT and return the tokens joined by spaces.

    ``[CLS]`` and ``[SEP]`` are added before tokenizing with the module-level
    ``tokenizer`` and filtered out again after the tokens have been converted to
    vocabulary ids and back. ``##`` continuation markers are left in place.
    """
    marked = "[CLS] " + str(text) + " [SEP]"

    # tokens -> ids -> tokens
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(marked))
    round_tripped = tokenizer.convert_ids_to_tokens(ids)

    visible = []
    for piece in round_tripped:
        if piece == '[CLS]' or piece == '[SEP]':
            continue
        visible.append(piece)

    return ' '.join(visible)


# Now, you can use the model to predict the events from a new input text
def predict_events(input_text):
    """Extract the event sentences of an article and classify them.

    Args:
        input_text (str): Raw article text.

    Returns:
        tuple: ``(cleaned_text, prediction)``. ``cleaned_text`` is the extracted
        event text after BERT tokenization and sub-word merging; ``prediction``
        is what ``classifier.predict`` returns for the single embedding vector.
    """
    # Sentences that mention an event, date or time, joined into one string.
    event_text = extract_event_sentences(input_text)

    # Classifier input: one embedding vector, wrapped in a list to form a batch of one.
    features = calculate_embeddings(event_text)
    prediction = classifier.predict([features])

    # Human-readable form of the same event text.
    cleaned_text = clean_text(tokens_to_sentence(event_text))

    return cleaned_text, prediction

# Run the predictor on whatever is currently typed into the text box
article_text = input_text.value
cleaned_text, prediction = predict_events(article_text)

print("News Article:\n", article_text)
print("\n-------------------------------------------------------------------------------------------------------------------")
print("Event sentences:\n", cleaned_text)

## Sentiment Analysis

In [9]:
import pandas as pd
import spacy
import yfinance
import re

In [13]:

## Feature extraction functions----------------------------------

def get_company_name(ticker):
    """
    Get the full company name for a given stock ticker.

    The lookup is best-effort: any failure while querying Yahoo Finance or
    reading the 'longName' field yields an empty string instead of raising.

    Args:
    - ticker (str): The stock ticker to look up.

    Returns:
    - str: The full company name for the stock ticker, or '' if it could not
      be determined.
    """
    try:
        company = yfinance.Ticker(ticker).info['longName']
    except Exception:
        # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
        # SystemExit still propagate and a long batch of lookups can be stopped.
        return ''
    # Guard against a missing/None name so callers always receive a string.
    return company or ''


def compiled_list_of_keywords(ticker, company_name):
    """
    Generate a list of keywords to search for in news articles based on the stock ticker and company name.

    The list contains the ticker, the company name as given, and each word of
    the company name after all non-letter characters have been removed.

    Args:
    - ticker (str): The stock ticker to generate keywords for.
    - company_name (str): The full company name to generate keywords for.
      May be empty (e.g. when the name lookup failed); it is then skipped.

    Returns:
    - list: A list of keywords to search for in news articles.
    """
    keywords = [ticker]

    # An empty name must not become a keyword: '' is a substring of every
    # sentence, so it would mark the whole article as relevant.
    if company_name and company_name.strip():
        keywords.append(company_name)
        # Split the company name into individual words, letters only.
        letters_only = re.sub(r'[^a-zA-Z\s]', '', company_name)
        keywords.extend(letters_only.split())

    return keywords


def find_relevant_sentences(text, keywords):
    """
    Extract sentences from the provided text that contain any of the specified keywords.

    Matching is a case-insensitive substring test. Sentence boundaries come from
    the global `nlp` object, which must be a spaCy pipeline (its result needs
    `.sents`).

    Args:
    - text (str): The text to search within.
    - keywords (list): A list of keywords to search for. Empty or blank
      keywords are ignored.

    Returns:
    - list: A list of sentences from the text that contain any of the keywords.
    """
    # Lowercase once for case-insensitive matching. Blank keywords are dropped
    # because '' is a substring of every sentence.
    needles = [keyword.lower() for keyword in keywords if keyword and keyword.strip()]

    relevant_sentences = []
    for sent in nlp(text).sents:
        haystack = sent.text.lower()  # lowered once per sentence, not once per keyword
        if any(needle in haystack for needle in needles):
            relevant_sentences.append(sent.text)

    return relevant_sentences

def process_keywords(keywords):
    """
    Process the list of keywords to remove any duplicates and convert to lowercase.

    A keyword is dropped when, after lowercasing, it
    - contains any non-alphabetic character (digits, spaces, punctuation), or
    - contains the substring "inc" or "corporation".
    The first occurrence of each remaining keyword is kept, in input order.

    Args:
    - keywords (list): A list of keywords to process.

    Returns:
    - list: The processed list of keywords (a new list; the input is not modified).
    """
    lowered = (keyword.lower() for keyword in keywords)

    # Build a filtered list instead of calling list.remove() while iterating:
    # removing during iteration skips the element that follows each removal.
    kept = [
        word for word in lowered
        if word.isalpha() and "inc" not in word and "corporation" not in word
    ]

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(kept))



## Model Processing functions----------------------------------

def aggregate_article_sentiment(sentiments, index):
    """
    Aggregates sentiment scores from individual sentences to determine the overall article sentiment.

    The scores are summed per label and the label with the largest total wins.
    Entries whose label is not 'positive', 'neutral' or 'negative' are ignored.

    Parameters:
    - sentiments (list of dicts): Each dict contains 'label' and 'score' for a sentence.
    - index: Row identifier, used only in the printed progress message.

    Returns:
    - str: The overall sentiment of the article ('positive', 'neutral', 'negative').
      'neutral' is returned when there is no usable evidence (empty list or no
      recognised labels).
    """
    total_scores = {'positive': 0, 'neutral': 0, 'negative': 0}

    # Sum up the scores for each sentiment
    for sentiment in sentiments:
        label = sentiment['label']
        if label in total_scores:
            total_scores[label] += sentiment['score']

    if sum(total_scores.values()) > 0:
        # Normalising the totals would not change which label is largest, so
        # the raw sums are compared directly.
        overall_sentiment = max(total_scores, key=total_scores.get)
    else:
        # With all totals at zero, max() would return the first key
        # ('positive') purely because of dict order.
        overall_sentiment = 'neutral'

    print(f"Overall Sentiment for row {index}: {overall_sentiment}")

    return overall_sentiment


def chunk_sentence(sentences, chunk_size):
    """
    Split a sequence into consecutive slices of at most `chunk_size` items.

    Works on anything that supports `len()` and slicing: a list yields
    sub-lists, a string yields sub-strings of up to `chunk_size` characters.

    Parameters:
    - sentences (sequence): The sequence to split.
    - chunk_size (int): The maximum length of each slice.

    Returns:
    - list: The slices in order; the last one may be shorter. Empty input
      gives an empty list.
    """
    starts = range(0, len(sentences), chunk_size)
    return [sentences[start:start + chunk_size] for start in starts]


def processed_relevant_sentences(relevant_sentences, chunk_size=512):
    """
    Split every relevant sentence into pieces of at most `chunk_size` characters.

    Each sentence string is sliced with `chunk_sentence`, so the limit counts
    characters, not words or model tokens. Sentences shorter than the limit
    pass through unchanged.

    Parameters:
    - relevant_sentences (list): A list of relevant sentences (strings) to process.
    - chunk_size (int): Maximum number of characters per piece. Defaults to 512,
      the value that was previously hard-coded.

    Returns:
    - list: A flat list of all pieces, in the original sentence order.
    """
    result = []
    for sentence in relevant_sentences:
        result.extend(chunk_sentence(sentence, chunk_size))
    return result




# Initiate FinBERT Model

In [None]:
import sys
!{sys.executable} -m spacy download en_core_web_md

In [21]:
## Model Processing - GETTING SENTIMENT LABELS

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# spaCy pipeline used for sentence tokenization
nlp = spacy.load('en_core_web_md')

# FinancialBERT sentiment checkpoint: tokenizer first, then the 3-label classifier.
# Note that `tokenizer` and `model` are rebound here to the sentiment model.
tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
model = BertForSequenceClassification.from_pretrained(
    "ahmedrachid/FinancialBERT-Sentiment-Analysis",
    num_labels=3,
)



Downloading config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

# Example of a news article, containing long texts


In [23]:
# We want to find the sentiment of this article about Cisco Systems, Inc. (CSCO)
# This is a bullish article.
news_article = '''
Kyndryl Holdings Inc KD expanded its technology partnership with Cisco Systems Inc CSCO to offer cyber resilience services. 
Per this deal, KD will combine its cyber resilience framework with Cisco's extensive network software portfolio, hardware, and equipment to aid customers in efficiently addressing cyber incidents. 
Customers can maximize the return on their security investment through the adoption of more efficient and integrated security solutions. 
"As customers consume more cloud-based applications, it's more important than ever to have the right tools to help them integrate a cyber resilient framework into their IT strategy and business operations. 
Our collaboration with Cisco will enable Kyndryl to support our customers' zero trust journeys with dynamic and tailored solutions while integrating existing security controls," said Michelle Weston, Vice President of Global Offerings for Security and Resiliency. 
Earlier this month, KD reported Q1 FY24 revenues of $4.2 billion and a quarterly net loss of $(141) million. The company also raised its fiscal 2024 adjusted EBITDA margin outlook to approximately 14%, up from its prior projection of 12% - 13%. 
Price Action: KD shares are trading flat at $15.99 on the last check Thursday.
'''



In [24]:
# Create the list of keywords used to filter the article: the ticker, the
# company name, and each individual word of the company name.
# In this example both arguments are hard-coded; get_company_name(ticker) can be
# used to look the company name up from Yahoo Finance instead.
ticker_companyName = compiled_list_of_keywords("CSCO", "Cisco Systems")
print(ticker_companyName)


['CSCO', 'Cisco Systems', 'Cisco', 'Systems']


### Filter the article to only contain sentences relevant to the ticker.
### Doing so ensures that we measure the sentiment of the article with respect to the stock, instead of the general sentiment of the article.

In [25]:
# Extract only the relevant sentences from the news article based on the keywords.
# A sentence is kept when it contains any keyword as a case-insensitive substring.
relevant_sentences = find_relevant_sentences(news_article, ticker_companyName)
print(relevant_sentences)


['\nKyndryl Holdings Inc KD expanded its technology partnership with Cisco Systems Inc CSCO to offer cyber resilience services. \n', "Per this deal, KD will combine its cyber resilience framework with Cisco's extensive network software portfolio, hardware, and equipment to aid customers in efficiently addressing cyber incidents. \n", 'Our collaboration with Cisco will enable Kyndryl to support our customers\' zero trust journeys with dynamic and tailored solutions while integrating existing security controls," said Michelle Weston, Vice President of Global Offerings for Security and Resiliency. \n']


In [26]:
# Limitation of FinBERT: the model only accepts inputs of limited length.
# Split each relevant sentence into pieces of at most 512 characters
# (processed_relevant_sentences slices the sentence strings, so the limit counts
# characters, not words). The sentences here are all shorter, so they pass
# through unchanged.
processed_sentences = processed_relevant_sentences(relevant_sentences)
print(processed_sentences)

['\nKyndryl Holdings Inc KD expanded its technology partnership with Cisco Systems Inc CSCO to offer cyber resilience services. \n', "Per this deal, KD will combine its cyber resilience framework with Cisco's extensive network software portfolio, hardware, and equipment to aid customers in efficiently addressing cyber incidents. \n", 'Our collaboration with Cisco will enable Kyndryl to support our customers\' zero trust journeys with dynamic and tailored solutions while integrating existing security controls," said Michelle Weston, Vice President of Global Offerings for Security and Resiliency. \n']


In [27]:
# Get the sentiment of each chunk of sentences

# The pipeline gets its own name: binding it to `nlp` would overwrite the spaCy
# pipeline that find_relevant_sentences relies on, so re-running the earlier
# cells would fail.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Score the length-limited chunks produced in the previous cell rather than the
# unchunked sentences.
sentence_sentiments = sentiment_pipeline(processed_sentences)
print(sentence_sentiments)

[{'label': 'positive', 'score': 0.9998239874839783}, {'label': 'positive', 'score': 0.9998443126678467}, {'label': 'positive', 'score': 0.9997465014457703}]


In [28]:
# Aggregate the per-sentence sentiment scores into one label for the article.
# The second argument (0) is only used as the row number in the printed message.
overall_sentiment = aggregate_article_sentiment(sentence_sentiments, 0)

Overall Sentiment for row 0: positive
