In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from datetime import datetime
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import cross_val_score
from joblib import dump
import spacy
import pickle

In [12]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

def extract_event_sentences(text):
    doc = nlp(text)
    event_sentences = [sent.text.strip() for sent in doc.sents if any(ent.label_ in ['EVENT', 'DATE', 'TIME'] for ent in sent.ents)]
    return ' '.join(event_sentences)

# Function to calculate the embeddings
def calculate_embeddings(text):
    # Add the special tokens.
    marked_text = "[CLS] " + str(text) + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Define the maximum length of sequences
    MAX_LEN = 512

    # Truncate and pad the input sequences so that they all have the same length
    indexed_tokens = pad_sequences([indexed_tokens], maxlen=MAX_LEN, dtype="long", 
                            value=0, truncating="post", padding="post")[0]

    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Run the text through BERT, and collect all of the hidden states produced from all 12 layers.
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on how it's  configured in the `from_pretrained` call earlier. In this case, becase we set `output_hidden_states = True`, the third item will be the hidden states from all layers.
    hidden_states = outputs[2]

    # Concatenate the tensors for all layers. We use `stack` here to create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # Stores the token vectors, with shape [22 x 768]
    token_vecs_sum = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec = torch.sum(token[-4:], dim=0)
        
        # Use `sum_vec` to represent `token`.
        token_vecs_sum.append(sum_vec)

    # Calculate the average embedding.
    sentence_embedding = torch.mean(torch.stack(token_vecs_sum), dim=0)

    return sentence_embedding.numpy()

def clean_text(text):
    # Split the text by space
    tokens = text.split()

    # Remove '##' and join subwords
    clean_tokens = [token.replace('##', '') if token.startswith('##') else ' ' + token for token in tokens]

    # Join tokens into a string to form the cleaned text
    cleaned_text = ''.join(clean_tokens)

    return cleaned_text

def tokens_to_sentence(text):
    # Add the special tokens.
    marked_text = "[CLS] " + str(text) + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert token IDs back to tokens
    tokens = tokenizer.convert_ids_to_tokens(indexed_tokens)

    # Remove [CLS] and [SEP] tokens
    tokens = [token for token in tokens if token not in ['[CLS]', '[SEP]']]

    # Join tokens into a string to form the sentence
    sentence = ' '.join(tokens)

    return sentence
# Load your data
df = pd.read_csv('news_cleaned_no_spaces.csv', encoding='latin1')
df['news_text'] = df['news_text'].astype(str)

# Extract event sentences
df['output_text'] = df['news_text'].apply(extract_event_sentences)

# Calculate the embeddings for each sentence
df['output'] = df['output_text'].apply(calculate_embeddings)

# Split the data into training and testing sets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(df['output'].tolist(), df['gold_truth'].tolist(), test_size=0.2, random_state=30)

# ------- TRAIN CLASSIFIER ------------
# Convert list of arrays into a 2D array
train_sentences_array = np.vstack(train_sentences)
test_sentences_array = np.vstack(test_sentences)

# Train a Support Vector Machine model
classifier = SVC(random_state=42)
classifier.fit(train_sentences_array, train_labels)

# Make predictions on the test set
test_predictions = classifier.predict(test_sentences_array)

# Calculate the accuracy
accuracy = accuracy_score(test_labels, test_predictions)
print(f'Accuracy: {accuracy}')

# Calculate the precision
precision = precision_score(test_labels, test_predictions, average='weighted')
print(f'Precision: {precision}')

# Calculate the recall
recall = recall_score(test_labels, test_predictions, average='weighted')
print(f'Recall: {recall}')

# Calculate the F1 score
f1 = f1_score(test_labels, test_predictions, average='weighted')
print(f'F1 Score: {f1}')

scores = cross_val_score(classifier, train_sentences_array, train_labels, cv=5)
print("Cross-Validation Scores: ", scores)
print("Average Score: ", scores.mean())

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Convert token IDs back to sentences and store in 'output_sentence' column
df['output_sentence'] = df['output_text'].apply(tokens_to_sentence)

# Clean text
df['cleaned_text'] = df['output_sentence'].apply(clean_text)

# Save to CSV
df.to_csv(f'predicted_sentences_bert{timestamp}.csv', index=False)

# Save the model
dump(classifier, 'bert_model.joblib') 

  tokens_tensor = torch.tensor([indexed_tokens])


Accuracy: 0.9953900709219858


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.990801393290076
Recall: 0.9953900709219858
F1 Score: 0.9930904315187539




Cross-Validation Scores:  [0.99667553 0.9964539  0.99667479 0.99667479 0.99667479]
Average Score:  0.9966307634922357


['bert_model.joblib']

In [None]:
import pickle

# Save the model
with open('bert_model.pkl', 'wb') as f:
    pickle.dump(classifier, f)

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
import sys
print(sys.executable)

/usr/local/bin/python3


In [8]:
!{sys.executable} -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from datetime import datetime
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import cross_val_score
from joblib import dump
import spacy
import pickle
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

def extract_event_sentences(text):
    doc = nlp(text)
    event_sentences = [sent.text.strip() for sent in doc.sents if any(ent.label_ in ['EVENT', 'DATE', 'TIME'] for ent in sent.ents)]
    return ' '.join(event_sentences)

# Function to calculate the embeddings
def calculate_embeddings(text):
    # Add the special tokens.
    marked_text = "[CLS] " + str(text) + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Define the maximum length of sequences
    MAX_LEN = 512

    # Truncate and pad the input sequences so that they all have the same length
    indexed_tokens = pad_sequences([indexed_tokens], maxlen=MAX_LEN, dtype="long", 
                            value=0, truncating="post", padding="post")[0]

    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Run the text through BERT, and collect all of the hidden states produced from all 12 layers.
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on how it's  configured in the `from_pretrained` call earlier. In this case, becase we set `output_hidden_states = True`, the third item will be the hidden states from all layers.
    hidden_states = outputs[2]

    # Concatenate the tensors for all layers. We use `stack` here to create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # Stores the token vectors, with shape [22 x 768]
    token_vecs_sum = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec = torch.sum(token[-4:], dim=0)
        
        # Use `sum_vec` to represent `token`.
        token_vecs_sum.append(sum_vec)

    # Calculate the average embedding.
    sentence_embedding = torch.mean(torch.stack(token_vecs_sum), dim=0)

    return sentence_embedding.numpy()

def clean_text(text):
    # Split the text by space
    tokens = text.split()

    # Remove '##' and join subwords
    clean_tokens = [token.replace('##', '') if token.startswith('##') else ' ' + token for token in tokens]

    # Join tokens into a string to form the cleaned text
    cleaned_text = ''.join(clean_tokens)

    return cleaned_text

def tokens_to_sentence(text):
    # Add the special tokens.
    marked_text = "[CLS] " + str(text) + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert token IDs back to tokens
    tokens = tokenizer.convert_ids_to_tokens(indexed_tokens)

    # Remove [CLS] and [SEP] tokens
    tokens = [token for token in tokens if token not in ['[CLS]', '[SEP]']]

    # Join tokens into a string to form the sentence
    sentence = ' '.join(tokens)

    return sentence
# Load your data
df = pd.read_csv('news_cleaned_no_spaces.csv', encoding='latin1')
df['news_text'] = df['news_text'].astype(str)

# Extract event sentences
df['output_text'] = df['news_text'].apply(extract_event_sentences)

# Calculate the embeddings for each sentence
df['output'] = df['output_text'].apply(calculate_embeddings)

# Split the data into training and testing sets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(df['output'].tolist(), df['gold_truth'].tolist(), test_size=0.2, random_state=30)

# ------- TRAIN CLASSIFIER ------------
# Convert list of arrays into a 2D array
train_sentences_array = np.vstack(train_sentences)
test_sentences_array = np.vstack(test_sentences)

# Train a Support Vector Machine model
classifier = SVC(random_state=42)
classifier.fit(train_sentences_array, train_labels)

# Make predictions on the test set
test_predictions = classifier.predict(test_sentences_array)

# Calculate the accuracy
accuracy = accuracy_score(test_labels, test_predictions)
print(f'Accuracy: {accuracy}')

# Calculate the precision
precision = precision_score(test_labels, test_predictions, average='weighted')
print(f'Precision: {precision}')

# Calculate the recall
recall = recall_score(test_labels, test_predictions, average='weighted')
print(f'Recall: {recall}')

# Calculate the F1 score
f1 = f1_score(test_labels, test_predictions, average='weighted')
print(f'F1 Score: {f1}')

scores = cross_val_score(classifier, train_sentences_array, train_labels, cv=5)
print("Cross-Validation Scores: ", scores)
print("Average Score: ", scores.mean())

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Convert token IDs back to sentences and store in 'output_sentence' column
df['output_sentence'] = df['output_text'].apply(tokens_to_sentence)

# Clean text
df['cleaned_text'] = df['output_sentence'].apply(clean_text)

# Save to CSV
df.to_csv(f'predicted_sentences_bert{timestamp}.csv', index=False)

# Save the model
with open('bert_model.pkl', 'wb') as f:
    pickle.dump(classifier, f)

  from .autonotebook import tqdm as notebook_tqdm
  tokens_tensor = torch.tensor([indexed_tokens])


Accuracy: 0.9953900709219858


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.990801393290076
Recall: 0.9953900709219858
F1 Score: 0.9930904315187539




Cross-Validation Scores:  [0.99667553 0.9964539  0.99667479 0.99667479 0.99667479]
Average Score:  0.9966307634922357


In [2]:
# Load the model
with open('bert_model.pkl', 'rb') as f:
    classifier = pickle.load(f)


In [2]:
# Assume we have a new input text
input_text = input("Enter your text here:")

In [12]:
import ipywidgets as widgets
# Create text box for input
input_text = widgets.Textarea(
    value='',
    placeholder='Enter your text here:',
    description='Input:',
    disable=False
)

# Display the text box
display(input_text)

Textarea(value='', description='Input:', placeholder='Enter your text here:')

Technology companies are known for strong revenue growth fueled by their innovations, but that doesn't always translate to the bottom line. A number of tech companies are not profitable, but profits are a must in order to pay dividends. Otherwise, it should raise questions over the affordability of that dividend. For example, data storage provider Seagate Technology (STX -1.19%) delivered an attractive yield of 3.7% at the time of this writing. But look past that juicy yield at the company's financials, and it's not a pretty picture. In its fiscal first quarter, ended Sept. 29, Seagate paid out dividends of $145 million but suffered a net loss of $184 million. The company also generated free cash flow (FCF) of $57 million. FCF provides insight into the cash available for a company to invest in its business, pay debt obligations, repurchase shares, and hand out dividends. With no profit and a dividend payout more than double its FCF, Seagate can't sustain a payout if its financials don'

In [25]:
import pandas as pd
import spacy
import torch
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
import pickle

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Load the trained model
with open('bert_model.pkl', 'rb') as f:
    classifier = pickle.load(f)

# Define the functions: extract_event_sentences, calculate_embeddings, clean_text, tokens_to_sentence
def extract_event_sentences(text):
    doc = nlp(text)
    event_sentences = [sent.text.strip() for sent in doc.sents if any(ent.label_ in ['EVENT', 'DATE', 'TIME'] for ent in sent.ents)]
    return ' '.join(event_sentences)

# Function to calculate the embeddings
def calculate_embeddings(text):
    # Add the special tokens.
    marked_text = "[CLS] " + str(text) + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Define the maximum length of sequences
    MAX_LEN = 512

    # Truncate and pad the input sequences so that they all have the same length
    indexed_tokens = pad_sequences([indexed_tokens], maxlen=MAX_LEN, dtype="long", 
                            value=0, truncating="post", padding="post")[0]

    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Run the text through BERT, and collect all of the hidden states produced from all 12 layers.
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on how it's  configured in the `from_pretrained` call earlier. In this case, becase we set `output_hidden_states = True`, the third item will be the hidden states from all layers.
    hidden_states = outputs[2]

    # Concatenate the tensors for all layers. We use `stack` here to create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)

    # Stores the token vectors, with shape [22 x 768]
    token_vecs_sum = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec = torch.sum(token[-4:], dim=0)
        
        # Use `sum_vec` to represent `token`.
        token_vecs_sum.append(sum_vec)

    # Calculate the average embedding.
    sentence_embedding = torch.mean(torch.stack(token_vecs_sum), dim=0)

    return sentence_embedding.numpy()

def clean_text(text):
    # Split the text by space
    tokens = text.split()

    # Remove '##' and join subwords
    clean_tokens = [token.replace('##', '') if token.startswith('##') else ' ' + token for token in tokens]

    # Join tokens into a string to form the cleaned text
    cleaned_text = ''.join(clean_tokens)

    return cleaned_text

def tokens_to_sentence(text):
    # Add the special tokens.
    marked_text = "[CLS] " + str(text) + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert token IDs back to tokens
    tokens = tokenizer.convert_ids_to_tokens(indexed_tokens)

    # Remove [CLS] and [SEP] tokens
    tokens = [token for token in tokens if token not in ['[CLS]', '[SEP]']]

    # Join tokens into a string to form the sentence
    sentence = ' '.join(tokens)

    return sentence


# Now, you can use the model to predict the events from a new input text
def predict_events(input_text):
    # Extract event sentences
    event_sentences = extract_event_sentences(input_text)

    # Calculate the embeddings for the event sentences
    embeddings = calculate_embeddings(event_sentences)

    # Convert token IDs back to sentences
    output_sentence = tokens_to_sentence(event_sentences)

    # Use the trained model to predict the events
    prediction = classifier.predict([embeddings])

    # Clean the text
    cleaned_text = clean_text(output_sentence)
    return cleaned_text, prediction

# Test the function with a new input text
print(input_text.value)
cleaned_text, prediction = predict_events(input_text.value)
print("Event sentences: ",cleaned_text)

Technology companies are known for strong revenue growth fueled by their innovations, but that doesn't always translate to the bottom line. A number of tech companies are not profitable, but profits are a must in order to pay dividends. Otherwise, it should raise questions over the affordability of that dividend. For example, data storage provider Seagate Technology (STX -1.19%) delivered an attractive yield of 3.7% at the time of this writing. But look past that juicy yield at the company's financials, and it's not a pretty picture. In its fiscal first quarter, ended Sept. 29, Seagate paid out dividends of $145 million but suffered a net loss of $184 million. The company also generated free cash flow (FCF) of $57 million. FCF provides insight into the cash available for a company to invest in its business, pay debt obligations, repurchase shares, and hand out dividends. With no profit and a dividend payout more than double its FCF, Seagate can't sustain a payout if its financials don'

In [24]:
pd.set_option('display.max_colwidth', 50)