In [1]:
# ============================== #
# 1. Install dependencies (Kaggle does not come with these preinstalled)
# ============================== #
# -q keeps pip's output quiet; -U upgrades every listed package to the newest
# available release, so no versions are pinned by this cell.
!pip install -q -U transformers accelerate peft bitsandbytes datasets scikit-learn tqdm

In [2]:
# ============================== #
# 2. Import Libraries
# ============================== #
import warnings
warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
from tqdm import tqdm
import torch

In [3]:
# First, install or update bitsandbytes with specific version
!pip install -U bitsandbytes==0.41.1

# Make sure CUDA is available (if you're using GPU)
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# ============================== #
# 3. Set Model and Tokenizer
# ============================== #
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# Verify bitsandbytes installation
import bitsandbytes as bnb
# Use pip to check the version instead of accessing __version__ attribute
import subprocess
bnb_version = subprocess.check_output(['pip', 'show', 'bitsandbytes']).decode('utf-8')
print(f"bitsandbytes info: {bnb_version.split('Version: ')[1].split('\\n')[0]}")

base_model = "NousResearch/Llama-2-13b-hf"
peft_model = "oliverwang15/FinGPT_v33_Llama2_13B_Sentiment_Instruction_LoRA_FT_8bit"

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Required for padding

# Configure BitsAndBytes for 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False
)

# Load the model with proper error handling
try:
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.float16
    )
    
    model = PeftModel.from_pretrained(model, peft_model)
    model.eval()
    #print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")

Collecting bitsandbytes==0.41.1
  Using cached bitsandbytes-0.41.1-py3-none-any.whl.metadata (9.8 kB)
Using cached bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
Installing collected packages: bitsandbytes
  Attempting uninstall: bitsandbytes
    Found existing installation: bitsandbytes 0.42.0
    Uninstalling bitsandbytes-0.42.0:
      Successfully uninstalled bitsandbytes-0.42.0
Successfully installed bitsandbytes-0.41.1
CUDA available: False
'NoneType' object has no attribute 'cadam32bit_grad_fp32'
bitsandbytes info: 0.41.1
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/TimDettmers/bitsandbytes
Author: Tim Dettmers
Author-email: dettmers@cs.washington.edu
License: MIT
Location: /opt/anaconda3/lib/python3.12/site-packages
Requires: 
Required-by: 

Error loading model: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`


In [4]:
# ============================== #
# 4. Helper Functions
# ============================== #
# Integer class id -> sentiment name. test_fpb applies this mapping to the
# dataset's label column (renamed to "output") before building prompts.
label_map = {0: "negative", 1: "neutral", 2: "positive"}

def format_example(example: dict) -> dict:
    """Turn one record into a prompt ("context") and its expected answer ("target").

    The record must provide "instruction" and "output"; "input" is optional and
    is only included in the prompt when it is present and truthy. The prompt
    always ends with "Answer: " so the model's completion follows that marker.
    """
    pieces = [f"Instruction: {example['instruction']}\n"]
    if example.get("input"):
        pieces.append(f"Input: {example['input']}\n")
    pieces.append("Answer: ")
    return {"context": "".join(pieces), "target": example["output"]}

def change_target(x):
    """Collapse free-form text to one of 'positive', 'negative' or 'neutral'.

    Matching is case-insensitive and substring-based. 'positive' is checked
    before 'negative', and text containing neither word falls back to 'neutral'.
    """
    lowered = x.lower()
    # Order matters: the first label found in the text wins.
    for label in ("positive", "negative"):
        if label in lowered:
            return label
    return "neutral"

# *Here I tried few-shot prompting, which got 60% acc*

In [5]:
# def prompt_fun_few_shot(row):
#     # Define few-shot examples (ideally 2–3)
#     few_shot_prompt = (
#         "Determine the sentiment of the following financial news headlines. "
#         "Respond with one word: Negative, Neutral, or Positive.\n\n"
#         "Example 1:\n"
#         "Input: Company X's shares dropped after a disappointing earnings report.\n"
#         "Answer: Negative\n\n"
#         "Example 2:\n"
#         "Input: The central bank maintained current interest rates.\n"
#         "Answer: Neutral\n\n"
#         "Example 3:\n"
#         "Input: Tech firm Y reported record profits this quarter.\n"
#         "Answer: Positive\n\n"
#         "Now analyze the following news:\n"
#     )

#     # Append the current input
#     input_text = f"Input: {row['input']}\nAnswer: "
#     return few_shot_prompt + input_text


# *Here I tried zero-shot prompting with a direct instruction, which got 84% acc*

In [6]:
#def better_zero_shot_prompt(row):
  #  return (
     #   "Analyze the following financial news and determine the sentiment it expresses. "
     #   "Choose only one: Negative, Neutral, or Positive. "
     #   "Respond with only one word and no explanation.\n"
      #  f"Input: {row['input']}\n"
       # "Answer:"
    #)


# *Here I used chain-of-thought prompting, which got 86% acc*

In [35]:
def chain_of_thought_prompt(row):
    """Build the chain-of-thought instruction text for one news item.

    Args:
        row: Mapping-like record (dict or DataFrame row) with an "input" entry
            holding the news text.

    Returns:
        A prompt string made of the task description, the news text on its own
        "Input:" line, and a trailing "Reasoning: " cue.
    """
    return (
        # The trailing newline keeps "Input:" on its own line; without it the
        # prompt read "...or Positive.Input: ...".
        "You are a financial sentiment expert. think step-by-step to determine if the following news is: Negative, Neutral, or Positive.\n"
        f"Input: {row['input']}\n"
        "Reasoning: "
    )


# *Here I tried zero-shot CoT*

In [8]:
# def zero_shot_chain_of_thought_prompt(row):
#     return (
#         "You are a financial sentiment expert. Analyze the financial news carefully and think step-by-step to determine the sentiment expressed. "
#         "Consider whether the language indicates optimism, pessimism, or neutrality in the financial context. "
#         "Finally, classify the sentiment using one word only: Negative, Neutral, or Positive.\n\n"
#         f"Input: {row['input']}\n"
#         "Reasoning: "
#     )


# *Automatic Chain of Thought Prompt Version:*

In [9]:
# def automatic_chain_of_thought_prompt(row):
#     return (
#         "You are a financial sentiment expert. Analyze the financial news carefully and think step-by-step to determine the sentiment expressed. "
#         "Consider whether the language indicates optimism, pessimism, or neutrality in the financial context. "
#         "Finally, classify the sentiment using one word only: Negative, Neutral, or Positive.\n\n"
        
#         "Example:\n"
#         "Input: The company's quarterly earnings surpassed expectations, boosting investor confidence.\n"
#         "Reasoning: The news talks about exceeding expectations and increased investor confidence, which suggests optimism.\n"
#         "Answer: Positive\n\n"

#         f"Input: {row['input']}\n"
#         "Reasoning: "
#     )


# *Meta Prompting Version:*

In [10]:
# def meta_prompting_chain_of_thought(row):
#     return (
#         "You are a financial sentiment expert. Your task is not just to classify sentiment, but to design the best reasoning strategy to do so.\n"
#         "First, decide what reasoning steps would help determine if the news expresses optimism, pessimism, or neutrality.\n"
#         "Then, apply your own strategy to analyze the input.\n"
#         "Finally, classify the sentiment using one word only: Negative, Neutral, or Positive.\n\n"
        
#         f"Input: {row['input']}\n"
#         "Step-by-step reasoning strategy: \n"
#         "Applied reasoning: \n"
#         "Answer: "
#     )


# *Generated Knowledge Prompting*

In [11]:
# def generated_knowledge_prompt(row):
#     return (
#         "You are a financial sentiment expert. Your task is to first generate relevant background knowledge to help interpret the following financial statement excerpt. "
#         "Then, using that knowledge, analyze the excerpt and determine the sentiment expressed (optimism, pessimism, or neutrality). "
#         "Finally, classify the sentiment using one word only: Negative, Neutral, or Positive.\n\n"
        
#         f"Input: {row['input']}\n"
        
#         "Generated Knowledge: \n"
#         "Step-by-step Analysis: \n"
#         "Final Answer: "
#     )


# *Prompt Chaining*

In [12]:
# def prompt_chain(row):
#     return (
#         "You are a financial sentiment expert. Follow the steps below:\n\n"
#         f"Input: {row['input']}\n\n"
#         "Step 1 - Extract the key financial facts or events mentioned in the statement.\n"
#         "Step 2 - Analyze the likely impact of these facts on the company's financial outlook.\n"
#         "Step 3 - Classify the sentiment using one word only: Negative, Neutral, or Positive.\n"
#         "Answer: "
#     )


# *Tree-of-Thought Prompting*

In [13]:
# def tree_of_thought_prompt(row):
#     return (
#         "You are a financial sentiment expert. Think through multiple reasoning paths to determine whether the following financial news expresses a Negative, Neutral, or Positive sentiment.\n\n"
#         f"Input: {row['input']}\n\n"
#         "Path 1 - Consider the impact on company revenue and profitability:\n"
#         "...\n\n"
#         "Path 2 - Consider investor and market reactions to similar events:\n"
#         "...\n\n"
#         "Path 3 - Consider short-term vs long-term financial implications:\n"
#         "...\n\n"
#         "After evaluating all paths, synthesize your findings and give a final sentiment decision in one word (Negative, Neutral, or Positive).\n"
#         "Answer: "
#     )


# *Hybrid Prompt*

In [14]:
# def hybrid_cot_selfconsistency_prompt(row):
#     return (
#         "You are a financial sentiment expert. Carefully evaluate the sentiment in the following news article.\n"
#         "Think in multiple ways and explain your reasoning step-by-step.\n\n"
#         f"Input: {row['input']}\n\n"
#         "Path 1 - Analyze how the event affects profitability and earnings:\n"
#         "...\n\n"
#         "Path 2 - Consider the market and investor reaction to similar events:\n"
#         "...\n\n"
#         "Path 3 - Evaluate the long-term vs short-term financial implications:\n"
#         "...\n\n"
#         "Synthesize your thoughts across the paths. Then, classify the sentiment with one word only: Negative, Neutral, or Positive.\n"
#         "Answer: "
#     )


# *Refined zero-shot CoT*

In [15]:
# def best_prompt(row):
#     return (
#         "You are a financial sentiment expert. Analyze the financial news below and determine the sentiment.\n"
#         "Think step-by-step and explain your reasoning. Then give your final sentiment in one word: Negative, Neutral, or Positive.\n\n"
#         f"News: {row['input']}\n\n"
#         "Reasoning:"
#     )


# *Here is the standard prompting, which got 87–88% acc*

In [5]:
# ============================== #
# 5. Main Evaluation Function
# ============================== #
def test_fpb(model, tokenizer, batch_size=4, prompt_fun=None):
    """Evaluate a causal LM on a held-out split of Financial PhraseBank.

    Loads the "sentences_50agree" configuration, takes the 'test' part of a
    seeded train/test split, builds one prompt per sentence, generates in
    batches, and prints accuracy and F1 scores.

    Relies on the notebook-level names `label_map`, `format_example` and
    `change_target`.

    Args:
        model: Model exposing `parameters()` and `generate(...)`.
        tokenizer: Tokenizer matching `model`; must have a pad token set.
        batch_size: Number of prompts per generation call (must be >= 1).
        prompt_fun: Optional callable taking a DataFrame row and returning the
            instruction text for that row. When None, a fixed sentiment
            question is used for every row.

    Returns:
        The evaluation DataFrame, including the columns "context", "target",
        "out_text", "new_target" and "new_out".

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    data = load_dataset("financial_phrasebank", "sentences_50agree")["train"]
    data = data.train_test_split(seed=42)['test']
    df = data.to_pandas()
    df.columns = ["input", "output"]
    df["output"] = df["output"].apply(lambda x: label_map[x])

    if prompt_fun is None:
        df["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
    else:
        df["instruction"] = df.apply(prompt_fun, axis=1)

    # format_example wraps the instruction and always terminates the prompt
    # with "Answer: ", which the extraction step below relies on.
    df[["context", "target"]] = df.apply(format_example, axis=1, result_type="expand")

    print(f"\nPrompt Example:\n{df['context'].iloc[0]}\n")

    context = df['context'].tolist()

    print(f"Evaluating {len(context)} samples with batch size {batch_size}...")

    # Send inputs to wherever the model's weights live instead of assuming CUDA.
    device = next(model.parameters()).device

    out_text_list = []
    # Stepping by batch_size yields exactly ceil(n / batch_size) batches, so no
    # empty trailing batch is scheduled when n is an exact multiple.
    for start in tqdm(range(0, len(context), batch_size)):
        batch = context[start:start + batch_size]
        tokens = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        tokens = {k: v.to(device) for k, v in tokens.items()}
        with torch.no_grad():
            # NOTE: max_length counts prompt tokens too, so a prompt truncated
            # to 512 tokens leaves no room for generated text.
            res = model.generate(**tokens, max_length=512, pad_token_id=tokenizer.eos_token_id)
        decoded = tokenizer.batch_decode(res, skip_special_tokens=True)
        # The decoded text still contains the prompt; keep only what follows
        # the last "Answer: " marker.
        out_text_list.extend(text.split("Answer: ")[-1].strip() for text in decoded)
        if device.type == "cuda":
            torch.cuda.empty_cache()

    df["out_text"] = out_text_list
    df["new_target"] = df["target"].apply(change_target)
    df["new_out"] = df["out_text"].apply(change_target)

    acc = accuracy_score(df["new_target"], df["new_out"])
    f1_macro = f1_score(df["new_target"], df["new_out"], average="macro")
    f1_micro = f1_score(df["new_target"], df["new_out"], average="micro")
    f1_weighted = f1_score(df["new_target"], df["new_out"], average="weighted")

    print("\n✅ Evaluation Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Macro: {f1_macro:.4f}")
    print(f"F1 Micro: {f1_micro:.4f}")
    print(f"F1 Weighted: {f1_weighted:.4f}")

    return df

In [6]:
# ============================== #
# 6. Run Evaluation
# ============================== #
# Needs `model` and `tokenizer` from the "Set Model and Tokenizer" cell; if the
# model was never loaded, the call below fails with NameError.
# Baseline run with the default instruction prompt (kept for reference):
#results_df = test_fpb(model, tokenizer, batch_size=4)

# Evaluate with the chain-of-thought prompt. The returned frame holds the
# per-sample "new_target" / "new_out" labels used by the confusion-matrix cell.
instructions_df = test_fpb(model, tokenizer, batch_size=8, prompt_fun=chain_of_thought_prompt)

NameError: name 'model' is not defined

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Fixed label order, so matrix rows/columns are always negative, neutral, positive
class_labels = ["negative", "neutral", "positive"]

# Confusion matrix for the prompt-function run
# (for the baseline run, use results_df["new_target"] / results_df["new_out"] instead)
y_true = instructions_df["new_target"]
y_pred = instructions_df["new_out"]
cm = confusion_matrix(y_true, y_pred, labels=class_labels)

# Draw the matrix on an explicit square axes with integer cell counts
fig, ax = plt.subplots(figsize=(6, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues', ax=ax, values_format='d')

plt.title("Confusion Matrix - FinGPT Sentiment Classification")
plt.grid(False)
plt.show()
