## Utils

In [1]:
import json
import os

In [2]:
import torch

In [3]:
from sklearn.metrics import precision_recall_fscore_support

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
def generate_event_information(model, tokenizer, prompt, max_input_length=500, max_new_tokens=50):
    # Tokenize the input prompt, truncating to the maximum input length
    inputs = tokenizer(prompt, return_tensors="pt",
                       truncation=True, max_length=max_input_length)

    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    outputs = model.generate(
        inputs['input_ids'], max_new_tokens=max_new_tokens)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text

In [6]:
def get_model_response(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(**inputs, max_length=1024)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [7]:
cache_dir='/dcs/large/u5579267/.huggingface'

## LlaMa 3 8B Quantized

In [8]:
# New Method

In [9]:
import re
import json
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

def extract_company_name(sentence):
    """Extracts the company name from the sentence based on common patterns."""
    # Regular expression to find company names
    pattern = r"\b[A-Z][a-zA-Z]*\s(?:Inc|Corp|Ltd|LLC|Group|Company|PLC|Corporation|Incorporated|N\.A\.)\b"
    match = re.search(pattern, sentence)
    if match:
        return match.group()
    return None

2024-07-17 12:29:13.707441: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-17 12:29:19.387029: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /local/java/cuda-11.6.0/lib64/:/local/java/cudnn-linux-x86_64-8.5.0.96_cuda11-archive/lib/:/local/java/cuda-12.2/lib64/:/local/java/cudnn-linux-x86_64-8.9.4.25_cuda12-archive/lib/:/local/java/cuda-12.2/extras/CUPTI/lib64/
2024-07-17 12:29:19.388218: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [10]:
def extract_events_zero_shot(model, tokenizer, sentence, event_types):
    """Performs zero-shot event extraction and company name extraction.

    Args:
        model_name (str): Name of the Hugging Face model (e.g., "google/flan-t5-xl").
        sentence (str): The sentence to extract events from.
        event_types (list): List of possible event types.

    Returns:
        dict: A dictionary containing the extracted event type and company name.
    """
    torch.cuda.empty_cache()
    model.to(device)

    extractor = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=0)
    candidate_labels = event_types + ["Company"]

    result = extractor(sentence, candidate_labels)

    # Find the event type with the highest score
    event_type = max(result["labels"], key=lambda label: result["scores"][result["labels"].index(label)])
    if event_type not in event_types:
        event_type = "Other/None"  # If no valid event type found

    company_name = extract_company_name(sentence)
    if not company_name:
        company_name = "Unknown"  # "Unknown" if no company name is found

    return {"event_type": event_type, "company": company_name}


In [None]:
!export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

In [11]:
sentence = "CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program."
event_types = [
    "Acquisition (A)", "Clinical Trial (CT)", "Regular Dividend (RD)",
    "Dividend Cut (DC)", "Dividend Increase (DI)", "Guidance Increase (GI)",
    "New Contract (NC)", "Reverse Stock Split (RSS)", "Special Dividend (SD)",
    "Stock Repurchase (SR)", "Stock Split (SS)", "Other/None (O)"
]

In [12]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
llama_tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
llama_model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)

extracted_info = extract_events_zero_shot(llama_model, llama_tokenizer, sentence, event_types)
print(json.dumps(extracted_info, indent=2))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB (GPU 0; 10.75 GiB total capacity; 10.46 GiB already allocated; 18.50 MiB free; 10.46 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [13]:
import json

In [14]:
data_path = os.path.join(os.getcwd(), "EDT_dataset", "Event_detection", "train.json")

# Load the JSON object from the file
with open(data_path, "r") as file:
    data = json.load(file)

# Print the first few entries
for i in range(3):
    print(data[-i])
    print()

{'sentence': ['JTI report warns of a \'Gathering Storm\' in the black market English English Intelligence shows criminals are ready for post-Covid boom GENEVA , Sept . 11 , 2020 / / JTI ( Japan Tobacco International ) has published a report , independently verified by Intrinsic Insight Ltd. , entitled \'The Gathering Storm\' , on how the illegal tobacco trade are operating during the Covid-19 global pandemic and preparing to reap the rewards in the economic aftermath that will follow . Law enforcement agencies around the world have welcomed the report , which is based on 63 field studies , conducted across 50 countries including Russia , Canada , Malaysia , and the Philippines where tobacco smugglers currently have a strong presence . JTI intelligence found that the global public health crisis and financial downturn has created the conditions for a \'perfect storm\' where organized criminal groups will further exploit public demand for cheap goods , and capitalize on dwindling buying p

In [15]:
def process_sentence(model_name, sentence):
        event_types = [
            "Acquisition (A)", "Clinical Trial (CT)", "Regular Dividend (RD)",
            "Dividend Cut (DC)", "Dividend Increase (DI)", "Guidance Increase (GI)",
            "New Contract (NC)", "Reverse Stock Split (RSS)", "Special Dividend (SD)",
            "Stock Repurchase (SR)", "Stock Split (SS)", "Other/None (O)"
        ]

        extracted_events = extract_events_zero_shot(model_name, sentence, event_types)

        return extracted_events

In [16]:
data_copy = list(reversed(data))
data_copy = data_copy[:10]
results = []

for item in data:
    sentence = item["sentence"][0]
    extracted_events = process_sentence(model_name, sentence)
    results.append({"sentence": sentence, "extracted_events": extracted_events, "actual_events": item["events"]})

for result in results:
    print(result)
    print()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-xl and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-xl and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


KeyboardInterrupt: 

In [None]:
from enum import Enum

class EventType(Enum):
    A = "Acquisition (A)"
    CT = "Clinical Trial (CT)"
    RD = "Regular Dividend (RD)"
    DC = "Dividend Cut (DC)"
    DI = "Dividend Increase (DI)"
    GI = "Guidance Increase (GI)"
    NC = "New Contract (NC)"
    RSS = "Reverse Stock Split (RSS)"
    SD = "Special Dividend (SD)"
    SR = "Stock Repurchase (SR)"
    SS = "Stock Split (SS)"
    O = "Other/None (O)"

In [None]:
def evaluate_events(results):
    y_true = []
    y_pred = []

    for result in results:
        actual_events = result["actual_events"]
        extracted_event = result["extracted_events"]

        if not actual_events:
            actual_events = [EventType.O.value]

        for actual, extract in zip(actual_events, extracted_event.values()):
            print(f"Event type: {actual}")
            actual_event_enum = next(
                (e for e in EventType if e.value == actual), EventType.O)
            y_true.append(actual_event_enum.value)
            print(f"y true : {y_true}")

            extracted_event_enum = next(
                (e for e in EventType if e.value == extracted_event["event_type"]), EventType.O)

            y_pred.append(extracted_event_enum.value)
            print(f"y pred : {y_pred}")

        print("-" * 50)

    # Calculate Exact Match (EM)
    exact_matches = sum(1 for yt, yp in zip(y_true, y_pred) if yt == yp)
    em_score = exact_matches / len(y_true)

    # Calculate F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted')

    print(f"Evaluation metrics - Exact Match (EM): {em_score}, F1 Score: {f1}")
    return {"exact_match": em_score, "f1": f1}

In [None]:
evaluate_events(results)

## Zero Shot Learning
### 1. Schema Prompt

In [62]:
model_name = "google/flan-t5-base"

if "flan-t5" in model_name:
    tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=True)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
elif "bart" in model_name:
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
else:
    raise ValueError(f"Model {model_name} not supported")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

zero_shot_classifier = pipeline("zero-shot-classification", model=model_name, device=device)

print("Initialized EventExtraction model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


Initialized EventExtraction model


In [63]:
from enum import Enum


class EventType(Enum):
    A = "Acquisition (A)"
    CT = "Clinical Trial (CT)"
    RD = "Regular Dividend (RD)"
    DC = "Dividend Cut (DC)"
    DI = "Dividend Increase (DI)"
    GI = "Guidance Increase (GI)"
    NC = "New Contract (NC)"
    RSS = "Reverse Stock Split (RSS)"
    SD = "Special Dividend (SD)"
    SR = "Stock Repurchase (SR)"
    SS = "Stock Split (SS)"
    O = "Other/None (O)"

In [64]:
def get_schema_prompt(sentence):
    return f"""
        Extract event information from the following sentence and return events in JSON format as this: {{"event_type": event_type, "company": company_name}}.

        Event_type:
        - Acquisition (A)
        - Clinical Trial (CT)
        - Regular Dividend (RD)
        - Dividend Cut (DC)
        - Dividend Increase (DI)
        - Guidance Increase (GI)
        - New Contract (NC)
        - Reverse Stock Split (RSS)
        - Special Dividend (SD)
        - Stock Repurchase (SR)
        - Stock Split (SS)
        - Other/None (O)

        Argument type:
        - company

        Sentence: "{sentence}"

        Output:
        """

In [67]:
def process_sentence(sentence):
        print(f"Processing sentence: {sentence}")
        prompt = get_schema_prompt(sentence)

        # Use the zero-shot-classification pipeline
        response = get_model_response(prompt)
        
        print(response)
        
#         # Extract and format the response
#         try:
#             extracted_events = json.loads(response)
#         except json.JSONDecodeError:
#             logging.error(f"Invalid JSON response: {response}")
#             extracted_events = {"event_type": "Other/None", "company": "Unknown"}

#         logging.info(f"Extracted events: {extracted_events}")
#         return extracted_events

In [68]:
sentence = "CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program."

extracted_events = process_sentence(sentence)
print(f"Extracted Events : {extracted_events}")

Processing sentence: CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program.
Model response: "event_type": event_type, "company": "

In [61]:
def get_few_shot_examples():
        """Returns a few-shot learning prompt with examples."""
        examples = """
        Example 1:
        Sentence: "ABC Corp. Acquires XYZ Ltd."
        Output: {"event_type": "Acquisition (A)", "company": "ABC Corp."}

        Example 2:
        Sentence: "DEF Inc. Declares Regular Dividend of $0.50 per share."
        Output: {"event_type": "Regular Dividend (RD)", "company": "DEF Inc."}

        Example 3:
        Sentence: "GHI Ltd. Cuts Dividend to $0.20 per share."
        Output: {"event_type": "Dividend Cut (DC)", "company": "GHI Ltd."}
        """
        return examples


### 2. Code Prompt

In [None]:
prompt="""
Event = {
    "event_type": str #options: [Acquisition (A), Clinical Trial (CT), Regular Dividend (RD), Dividend Cut (DC), Dividend Increase (DI), Guidance Increase (GI), New Contract (NC), Reverse Stock Split (RSS), Special Dividend (SD), Stock Repurchase (SR), Stock Split (SS), Other/None (O)]
    "company": str
}

events: List[Event] = extract events in the sentence: "CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program."
print(json.dumps(events))
"""

codePrompt_response = get_model_response(flan_model_test, flan_tokenizer_test, prompt)

In [None]:
prompt = """
# Task: Extract Events from Text
# Input:
text = "CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program."

Instructions:
1. dataclass `Event` to represents extracted events:

from dataclasses import dataclass
from typing import List

@dataclass
class Event:
    event_type: str  # Choose from: [A, CT, RD, DC, DI, GI, NC, RSS, SD, SR, SS, O]
    company: str

2. Function `extract_events` takes the input text and returns a list of Event objects:

def extract_events(text: str) -> List[Event]:
    # - Identify financial/corporate events in the text.
    # - Classify events based on the provided types.
    # - Extract the primary company associated with each event.
    return events

3. Extract events from the provided text and print the result as JSON:

events = extract_events(text)
import json
print(json.dumps(events))
"""

codePrompt_response = get_model_response(flan_model_test, flan_tokenizer_test, prompt)

In [None]:
print(codePrompt_response)

### 3. Explanation Prompt

In [None]:
explanation_prompt="""
Task: Event Extraction

Instructions:

1. Identify any financial or corporate events within the sentence.
2. Classify each identified event using the following types:
   - Acquisition (A): A company purchases another company or a significant portion of it.
   - Clinical Trial (CT): A company conducts a research study to test new medical treatments or drugs.
   - Regular Dividend (RD): A company distributes a portion of its earnings to shareholders regularly.
   - Dividend Cut (DC): A company reduces the amount of dividend it pays out to shareholders.
   - Dividend Increase (DI): A company increases the amount of dividend it pays out to shareholders.
   - Guidance Increase (GI): A company raises its future earnings or revenue forecast.
   - New Contract (NC): A company secures a new agreement for providing goods or services.
   - Reverse Stock Split (RSS): A company reduces the number of its outstanding shares to increase the share price.
   - Special Dividend (SD): A company makes a one-time distribution of additional earnings to shareholders.
   - Stock Repurchase (SR): A company buys back its own shares from the marketplace.
   - Stock Split (SS): A company increases the number of its outstanding shares by dividing its current shares.
   - Other/None (O): Events that do not fit into any of the specified categories.
3. Extract the primary corporate entity (company name) directly associated with each event. Use contextual clues and focus on entities performing financial actions. Prioritise the company name. A company associated is the primary corporate entity directly involved in the financial or corporate event being reported.

4. Output a JSON array of dictionaries, each containing:
    - "event_type": [Use the exact event classification code from the provided list, e.g., "RD" for Regular Dividend]
    - "company": [Identify the primary company performing the financial action, often the publicly traded parent company]

5. Ensure the output is valid JSON. Double-check for proper formatting, brackets, commas, and quotation marks.

## Important: The final output should be a JSON array, not an explanation of the instructions.


Example 1:
Input: "Company A announces a regular dividend and a new stock repurchase program."
Output: [{"event_type": "RD", "company": "Company A"}, {"event_type": "SR", "company": "Company A"}]

Example 2:
Input: "MegaCorp (parent company of Subsidiary B) declares a quarterly cash dividend payable to shareholders of record."
Output: [{"event_type": "RD", "company": "MegaCorp"}]

Example 3:
Input: "BigPharma Inc., the parent company of BioTech Labs, announced a successful Phase III clinical trial."
Output: [{"event_type": "CT", "company": "BigPharma Inc."}]

Extract the event from the following sentence:
Input: "CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program."
"""

explanation_response = get_model_response(flan_model_test, flan_tokenizer_test, explanation_prompt)

In [None]:
print(explanation_response)

### 4. Pipeline Prompt

In [None]:
pipeline_prompt="""

Stage 1: 
Extract financial or corporate events in the sentence, as well as the primary corporate entity (company) involved in the event. Return the output in JSON format as this: [{"event_type": event type, "company": "company name"}].
Event type options: Acquisition (A), Clinical Trial (CT), Regular Dividend (RD), Dividend Cut (DC), Dividend Increase (DI), Guidance Increase (GI), New Contract (NC), Reverse Stock Split (RSS), Special Dividend (SD), Stock Repurchase (SR), Stock Split (SS), Other/None (O).

Instructions:
1. Clearly identify the main events described in the sentence.
2. Classify each event using the provided list of event types.
3. Extract the company name directly associated with each event, prioritizing the main corporate entity mentioned in the sentence.
4. Ensure the output is a valid JSON array with square brackets.

Sentence: "CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program."

Output (Stage 1): 

Stage 2: Answer the question related to the given sentence and given event information.

Instructions:
1. Analyze the extracted events from Stage 1.
2. Iterate over each event, and for each:
    - Determine the corresponding question from the list below based on the event type.
    - Find the answer to the question within the provided sentence.
    - Extract the exact span of text that answers the question.
    - If no answer is found, return "N/A".
3. Output the answers as a JSON array, in the format: `[{"event_type": event type, "question": question, "answer": answer}]`

Questions for each event type:
Acquisition (A): What company was acquired?
Clinical Trial (CT): What was the result of the clinical trial?
Regular Dividend (RD): What is the amount of the regular dividend?
Dividend Cut (DC): By how much was the dividend cut?
Dividend Increase (DI): By how much was the dividend increased?
Guidance Increase (GI): What is the new guidance value?
New Contract (NC): What is the nature of the new contract?
Reverse Stock Split (RSS): What is the ratio of the reverse stock split?
Special Dividend (SD): What is the amount of the special dividend?
Stock Repurchase (SR): How many shares were repurchased?
Stock Split (SS): What is the ratio of the stock split?

Sentence: "CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program."
Complete Stage 1 and then Stage 2 based on Stage 1's output.
"""

pipeline_response = get_model_response(flan_model_test, flan_tokenizer_test, pipeline_prompt)

In [None]:
print(pipeline_response)

## GPT

In [7]:
from huggingface_hub import login
from transformers import AutoModel, AutoTokenizer

# Set your Hugging Face token
token = os.environ['HUGGINGFACE_TOKEN']

# Log in to Hugging Face
login(token=token)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /dcs/pg23/u5579267/.cache/huggingface/token
Login successful


In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the model and tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [15]:
prompt = """
Extract event information from the following sentences and return just the most matching event type. Event Types: Acquisition (A), Clinical Trial (CT), Regular Dividend (RD), Dividend Cut (DC), Dividend Increase (DI), Guidance Increase (GI), New Contract (NC), Reverse Stock Split (RSS), Special Dividend (SD), Stock Repurchase (SR), Stock Split (SS). If there is no event, use "O".

Sentence: "JTI report warns of a \'Gathering Storm\' in the black market English English Intelligence shows criminals are ready for post-Covid boom GENEVA , Sept . 11 , 2020 / / JTI ( Japan Tobacco International ) has published a report , independently verified by Intrinsic Insight Ltd. , entitled \'The Gathering Storm\' , on how the illegal tobacco trade are operating during the Covid-19 global pandemic and preparing to reap the rewards in the economic aftermath that will follow . Law enforcement agencies around the world have welcomed the report , which is based on 63 field studies , conducted across 50 countries including Russia , Canada , Malaysia , and the Philippines where tobacco smugglers currently have a strong presence . JTI intelligence found that the global public health crisis and financial downturn has created the conditions for a \'perfect storm\' where organized criminal groups will further exploit public demand for cheap goods , and capitalize on dwindling buying power in the impending global recession , particularly in countries with high tax regimes . The report has provided JTI with a global picture of four emerging trends , consistent with Euromonitor and Europol intelligence: 1 . Evidence shows that criminal groups are biding their time in readiness for an anticipated boom in illegal tobacco sales; 2 . After initial disruption to the illegal supply chain in Western European markets , organizedcriminals quickly exploited the inconsistent approach to travel and lockdownrulesand found alternative routes from production to distribution , leading to significant seizures of illegal factories or their components in countries such as the Czech Republic , Greece , Ireland , Belgium , and Spain; 3 . Changed law enforcement priorities and border restrictions have been mixed in limiting supply and the availability of illegal tobacco: whilst governments and authorities in Far East Asia were quicker to impose restrictions , those in the West failed to act with such precision; 4 . Technology has been increasingly deployed throughout the pandemic to enable sales of illegal tobacco to continue where strict lockdowns were put in place by governments throughout Eastern Europe , the Middle East , Africa and Asia Pacific , where WhatsApp and Facebook have provided quick and easy methods of communication between the consumer and criminals . Furthermore , the International Chamber of Commerce predicts that global counterfeit trade will reach $4 trillion by 2022 , primarily fueled by e-commerce[1] . According to the World Bank , the global trade in illegal tobacco is already worth an estimated $40-50 billion each year to the criminal groups who produce , manufacture , smuggle , distribute and sell tobacco products on which there is no tax duty paid . The loss of revenue to law-abiding retailers is also significantly felt , as is the impact on consumers who are lured into buying sub-standard products . "To some consumers illegal tobacco is a victimless crime , which is why we need to inform them not only of the hidden dangers they are consuming , but the wider social consequences of buying from criminal groups"
"""

response = get_model_response(gpt2_model, gpt2_tokenizer, prompt)
print(response)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Extract event information from the following sentences and return just the most matching event type. Event Types: Acquisition (A), Clinical Trial (CT), Regular Dividend (RD), Dividend Cut (DC), Dividend Increase (DI), Guidance Increase (GI), New Contract (NC), Reverse Stock Split (RSS), Special Dividend (SD), Stock Repurchase (SR), Stock Split (SS). If there is no event, use "O".

Sentence: "JTI report warns of a 'Gathering Storm' in the black market English English Intelligence shows criminals are ready for post-Covid boom GENEVA, Sept. 11, 2020 / / JTI ( Japan Tobacco International ) has published a report, independently verified by Intrinsic Insight Ltd., entitled 'The Gathering Storm', on how the illegal tobacco trade are operating during the Covid-19 global pandemic and preparing to reap the rewards in the economic aftermath that will follow. Law enforcement agencies around the world have welcomed the report, which is based on 63 field studies, conducted across 50 countries inclu

## Llama 3B

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

llama_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")
llama_model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b_v2")

In [None]:
prompt="""
Task: Event Extraction

Instructions:

1. Identify any financial or corporate events within the sentence.
2. Classify each identified event using the following types:
   - Acquisition (A)
   - Clinical Trial (CT)
   - Regular Dividend (RD)
   - Dividend Cut (DC)
   - Dividend Increase (DI)
   - Guidance Increase (GI)
   - New Contract (NC)
   - Reverse Stock Split (RSS)
   - Special Dividend (SD)
   - Stock Repurchase (SR)
   - Stock Split (SS)
   - Other/None (O)
3. Extract the primary corporate entity (company name) directly associated with each event. Use contextual clues and focus on entities performing financial actions. Prioritise the company name.

4. Output a JSON array of dictionaries, each containing:
    - "event_type": [Use the exact event classification code from the provided list, e.g., "RD" for Regular Dividend]
    - "company": [Identify the primary company performing the financial action, often the publicly traded parent company]

5. Ensure the output is valid JSON. Double-check for proper formatting, brackets, commas, and quotation marks.

Example 1:
Input: "Company A announces a regular dividend and a new stock repurchase program."
Output: [{"event_type": "RD", "company": "Company A"}, {"event_type": "SR", "company": "Company A"}]

Example 2:
Input: "MegaCorp (parent company of Subsidiary B) declares a quarterly cash dividend payable to shareholders of record."
Output: [{"event_type": "RD", "company": "MegaCorp"}]

Example 3:
Input: "BigPharma Inc., the parent company of BioTech Labs, announced a successful Phase III clinical trial."
Output: [{"event_type": "CT", "company": "BigPharma Inc."}]

Extract the event from the following sentence:
Input: "CBTX Inc. Declares Quarterly Dividend and Suspends Repurchase Program. HOUSTON, March 18, 2020 ( ) CBTX, Inc., the bank holding company for CommunityBank of Texas N.A., today announced that its Board of Directors declared a quarterly cash dividend in the amount of $0.10 per share of common stock. The dividend will be payable on April 15, 2020 to shareholders of record as of the close of business on April 1, 2020. In addition, CBTX, Inc. today announced that it has temporarily suspended its share repurchase program in light of the challenges presented by the COVID-19 pandemic and surrounding events. CBTX, Inc. believes that it remains strong and well-capitalized, and the Company may reinstate the share repurchase program in the future. The Company repurchased 240,445 shares of common stock during the first quarter of 2020 for an aggregate purchase price of approximately $5.4 million under its repurchase program."
"""

response = get_model_response(llama_model, llama_tokenizer, prompt)

In [None]:
print(response)

## FinGPT

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

config = PeftConfig.from_pretrained("FinGPT/fingpt-forecaster_dow30_llama2-7b_lora")
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = PeftModel.from_pretrained(base_model, "FinGPT/fingpt-forecaster_dow30_llama2-7b_lora")