In [None]:
!pip install -U trl transformers accelerate bitsandbytes peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, peft
  Attempting uninstall: peft
    Found existing installation: peft 0.17.1
    Uninstalling peft-0.17.1:
      Successfully uninstalled peft-0.17.1
Successfully installed bitsandbytes-0.48.2 peft-0.18.0


In [1]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from huggingface_hub import login
from peft import LoraConfig, PeftModel, get_peft_model
from rich.console import Console
from rich.table import Table
from rich.text import Text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    # TrainingArguments is no longer used
)
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from torch.amp import autocast

# Clear previous models and cache to free up GPU memory
if 'model' in locals() and model is not None:
    del model
if 'tokenizer' in locals() and tokenizer is not None:
    del tokenizer
torch.cuda.empty_cache()

# --- Make sure TRL is installed ---
try:
    from trl import SFTTrainer, SFTConfig # <-- Import SFTConfig
except ImportError:
    print("TRL not found. Installing...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "trl"])
    from trl import SFTTrainer, SFTConfig

# --- Make sure PEFT, Accelerate, and BitsAndBytes are installed ---
try:
    import peft
except ImportError:
    print("PEFT not found. Installing...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "peft"])

try:
    import accelerate
except ImportError:
    print("Accelerate not found. Installing...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "accelerate"])

try:
    import bitsandbytes
except ImportError:
    print("bitsandbytes not found. Installing...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-qU", "bitsandbytes"]) # Use -qU to ensure upgrade


import re
import numpy as np

# --- Configuration ---

# *** CRITICAL STEP ***
# Llama 3.1 is a gated model. You MUST paste your Hugging Face
# read-access token here or the download will fail.
HF_TOKEN = "TOKEN" # <--- PASTE YOUR HF TOKEN HERE

MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
NEW_MODEL_NAME = "llama-3.1-8b-financial-predictor"
DATA_PATH = "Dataset.csv"
PRICE_HISTORY_DAYS = 15
NEWS_HISTORY_DAYS = 7

# --- Rich Console Setup ---
console = Console()
def print_rich(text, style="bold green"):
    """Prints rich formatted text to the console."""
    console.print(Text(text, style=style))

def create_finetuning_dataset(data_path: str, test_size: float = 0.1) -> (Dataset, Dataset, pd.DataFrame):
    """
    Loads, processes, and formats the dataset for fine-tuning.

    *** NEW: Now includes VADER sentiment analysis. ***
    """
    print_rich(f"Loading data from {data_path}...", style="cyan")
    try:
        df = pd.read_csv(data_path)
    except FileNotFoundError:
        print_rich(f"Error: The file {data_path} was not found.", style="bold red")
        return None, None, None
    except Exception as e:
        print_rich(f"Error loading data: {e}", style="bold red")
        return None, None, None

    # --- Basic Cleaning ---
    df = df.dropna(subset=['prices'])
    df['news'] = df['news'].fillna("No specific news reported.")
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(by=['ticker', 'date'])

    # ---
    # !!! --- NEW: VADER Sentiment Analysis --- !!!
    # ---
    print_rich("Initializing VADER for sentiment analysis...", style="yellow")
    try:
        # Check if VADER is already downloaded
        nltk.data.find('sentiment/vader_lexicon.zip')
    except LookupError:
        # Download VADER if it's not found
        print_rich("VADER lexicon not found. Downloading...", style="yellow")
        nltk.download('vader_lexicon')

    sid = SentimentIntensityAnalyzer()

    # Calculate sentiment for each news item. We use the 'compound' score.
    df['sentiment'] = df['news'].apply(lambda x: sid.polarity_scores(x)['compound'])
    print_rich("Sentiment scores calculated for all news items.", style="cyan")
    # --- End of new section ---

    # --- Feature Engineering (Label & Context) ---
    def get_average_price(price_str):
        try:
            prices = [float(p) for p in str(price_str).split(',') if p.strip()]
            if not prices:
                return np.nan
            return sum(prices) / len(prices)
        except (ValueError, IndexError, ZeroDivisionError):
            return np.nan

    print_rich("Calculating average price for each day...", style="yellow")
    df['price_t_avg'] = df['prices'].apply(get_average_price)
    df['price_t_avg'] = pd.to_numeric(df['price_t_avg'], errors='coerce')
    df = df.dropna(subset=['price_t_avg'])
    df['price_t_avg'] = df['price_t_avg'].astype(float)

    df['price_t1_avg'] = df.groupby('ticker')['price_t_avg'].shift(-1)
    df['pct_change'] = df.groupby('ticker')['price_t_avg'].pct_change()

    print_rich(f"Building {PRICE_HISTORY_DAYS}-day price *percent change* history...", style="yellow")
    price_history_list = []
    for ticker, g in df.groupby("ticker"):
        pct_changes = g["pct_change"].values
        history_for_ticker = []
        for i in range(len(pct_changes)):
            start = max(0, i - PRICE_HISTORY_DAYS)
            window = pct_changes[start:i]
            if len(window) < 5:
                history_for_ticker.append(np.nan)
                continue
            history_for_ticker.append(
                ", ".join(f"{p*100:+.2f}%" for p in window if pd.notna(p))
            )
        price_history_list.extend(history_for_ticker)
    df["price_history"] = price_history_list

    print_rich(f"Building {NEWS_HISTORY_DAYS}-day news history...", style="yellow")
    news_history_list = []
    for ticker, g in df.groupby("ticker"):
        news_items = g["news"].astype(str).values
        history_for_ticker = []
        for i in range(len(news_items)):
            start = max(0, i - NEWS_HISTORY_DAYS + 1)
            window = news_items[start:i+1]
            history_for_ticker.append(" | ".join(window))
        news_history_list.extend(history_for_ticker)
    df["news_history"] = news_history_list

    # ---
    # !!! --- NEW: Create Sentiment History Feature --- !!!
    # ---
    print_rich(f"Building {NEWS_HISTORY_DAYS}-day sentiment history...", style="yellow")
    sentiment_history_list = []
    for ticker, g in df.groupby("ticker"):
        sentiments = g["sentiment"].values # Use the new sentiment column
        history_for_ticker = []
        for i in range(len(sentiments)):
            start = max(0, i - NEWS_HISTORY_DAYS + 1)
            window = sentiments[start:i+1]
            # Create a string of sentiment scores
            history_for_ticker.append(
                ", ".join(f"{s:+.2f}" for s in window if pd.notna(s))
            )
        sentiment_history_list.extend(history_for_ticker)
    df["sentiment_history"] = sentiment_history_list
    # --- End of new section ---

    # Add the new column to the dropna list
    df = df.dropna(subset=['price_t1_avg', 'price_history', 'news_history', 'sentiment_history'])
    df = df[df['price_history'].str.len() > 0]
    df['label'] = np.where(df['price_t1_avg'] > df['price_t_avg'], 'UP', 'DOWN')

    print_rich(f"Processed {len(df)} datapoints with full historical context.", style="cyan")

    # --- Balance the ENTIRE dataset BEFORE splitting ---
    print_rich("Balancing the dataset...", style="bold yellow")

    label_counts = df['label'].value_counts()
    min_label_count = label_counts.min()
    print_rich(f"Label counts before balancing: UP={label_counts.get('UP', 0)}, DOWN={label_counts.get('DOWN', 0)}", style="yellow")
    print_rich(f"Undersampling to {min_label_count} samples per class.", style="yellow")

    df_up = df[df['label'] == 'UP'].sample(min_label_count, random_state=42)
    df_down = df[df['label'] == 'DOWN'].sample(min_label_count, random_state=42)
    df_balanced = pd.concat([df_up, df_down])
    df_balanced = df_balanced.sample(frac=1, random_state=42)

    print_rich(f"Balanced dataset size: {len(df_balanced)}", style="cyan")

    # NOW split into train/test
    # Replace the train_test_split line with:
    train_df, test_df = train_test_split(
        df_balanced,
        test_size=test_size,
        shuffle=True,
        stratify=df_balanced['label'],  # <-- Add this
        random_state=42
    )

    print_rich("\n--- Dataset Statistics ---", style="bold cyan")
    print_rich(f"Training set: {train_df['label'].value_counts().to_dict()}", style="yellow")
    print_rich(f"Test set: {test_df['label'].value_counts().to_dict()}", style="yellow")

    train_ds = Dataset.from_pandas(train_df)
    test_ds = Dataset.from_pandas(test_df)

    return train_ds, test_ds, test_df

#
# --- !!! ENTIRE FUNCTION CORRECTED !!! ---
#
def train_model(train_ds: Dataset, test_ds: Dataset): # <-- Added test_ds
    """
    Loads a base model, configures it for 4-bit QLoRA, and runs the fine-tuning.
    """

    if not HF_TOKEN:
        print_rich("HF_TOKEN is not set. Llama 3.1 is a gated model.", style="bold red")
        print_rich("Please paste your Hugging Face token into the HF_TOKEN variable.", style="bold red")
        return None, None

    print_rich("Logging into Hugging Face Hub...", style="yellow")
    try:
        login(token=HF_TOKEN)
        print_rich("Login successful.", style="green")
    except Exception as e:
        print_rich(f"Login failed: {e}. Check your HF_TOKEN.", style="bold red")
        return None, None

    # --- Model Loading Configuration ---
    print_rich("Loading base model with 4-bit QLoRA to save memory...", style="yellow")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print_rich("Set pad_token to eos_token for Llama 3.1 tokenizer.", style="yellow")
    tokenizer.padding_side = "right"

    # --- PEFT (LoRA) Configuration ---
    lora_config = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )
    model = get_peft_model(model, lora_config)
    print_rich(f"PEFT model configured with r={lora_config.r} and lora_alpha={lora_config.lora_alpha}.", style="cyan")

    # ---
    # !!! --- API FIX: Use SFTConfig instead of TrainingArguments --- !!!
    # ---
    print_rich("Setting up SFTConfig...", style="yellow")
    sft_config = SFTConfig(
        # All the training args
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=8,  # <-- Increase from 4 to 8 or even 16
        gradient_accumulation_steps=4,  # <-- Reduce from 8 to 4
        gradient_checkpointing=True,     # <-- Enable this for memory
        learning_rate=2e-5,
        weight_decay=0.01,
        optim="paged_adamw_32bit",
        logging_steps=25,
        max_steps=-1,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        report_to="none",
        bf16=True,

        # Add evaluation
        eval_strategy="epoch", # <-- Added this

        # --- SFT-specific args that were missing ---
        dataset_text_field="text",   # <-- Added this
    )

    # --- Create the prompt formatting function ---
    def format_chat_template(row):
        system_prompt = "You are a financial analyst. Analyze the provided data and predict the next day's stock price direction. Consider BOTH possibilities (UP or DOWN) equally before deciding."

        user_prompt = f"""Analyze the following financial data for ticker {row['ticker']} and predict the stock price direction for the next trading day.

        ### Ticker:
        {row['ticker']}

        ### Price History (Daily % Change, last {PRICE_HISTORY_DAYS} days):
        {row['price_history']}

        ### Recent News (last {NEWS_HISTORY_DAYS} days):
        {row['news_history']}

        ### Recent Sentiment (last {NEWS_HISTORY_DAYS} days, VADER score):
        {row['sentiment_history']}

        Important: Carefully evaluate whether the indicators suggest UP or DOWN movement. Do not default to one answer.

        Answer with EXACTLY ONE WORD: UP or DOWN"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": f"{row['label']}"}
        ]

        # This function now has access to the `tokenizer`
        return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}

    print_rich("Applying chat template formatting to datasets...", style="cyan")
    train_ds_formatted = train_ds.map(format_chat_template, num_proc=4)
    test_ds_formatted = test_ds.map(format_chat_template, num_proc=4) # <-- Format test set

    # ---
    # !!! --- API FIX: Updated SFTTrainer call --- !!!
    # ---
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_ds_formatted,
        eval_dataset=test_ds_formatted,  # <-- Added eval dataset
        peft_config=lora_config,
        args=sft_config,                 # <-- Pass the SFTConfig object
        # NO tokenizer=tokenizer (this was the error)
    )

    # --- Start Training ---
    print_rich("--- Starting Model Training ---", style="bold magenta")
    trainer.train()
    print_rich("--- Model Training Complete ---", style="bold magenta")

    # --- Save the Model ---
    print_rich(f"Saving fine-tuned model to {NEW_MODEL_NAME}", style="cyan")
    trainer.model.save_pretrained(NEW_MODEL_NAME)
    tokenizer.save_pretrained(NEW_MODEL_NAME)

    return model, tokenizer

def evaluate_model(test_df: pd.DataFrame, model, tokenizer):
    """
    Evaluates the fine-tuned model on the hold-out test set.
    """
    print_rich("\n--- Starting Model Evaluation ---", style="bold magenta")

    # ADD THIS at the start
    print_rich(f"Test set distribution: {test_df['label'].value_counts().to_dict()}", style="cyan")
    print_rich(f"Test set percentages: {test_df['label'].value_counts(normalize=True).to_dict()}", style="cyan")


    predictions = []
    ground_truth = []

    model.eval()
    system_prompt = "You are a financial analyst. Analyze the provided data and predict the next day's stock price direction (UP or DOWN)."

    for _, row in test_df.iterrows():

        # ---
        # !!! --- CRITICAL FIX: Added sentiment_history to the prompt --- !!!
        # ---
        user_prompt = f"""Analyze the following financial data for ticker {row['ticker']} and predict the stock price direction for the next trading day.

        ### Ticker:
        {row['ticker']}

        ### Price History (Daily % Change, last {PRICE_HISTORY_DAYS} days):
        {row['price_history']}

        ### Recent News (last {NEWS_HISTORY_DAYS} days):
        {row['news_history']}

        ### Recent Sentiment (last {NEWS_HISTORY_DAYS} days, VADER score):
        {row['sentiment_history']}

        Based on this data, will the price move UP or DOWN? Answer strictly as: UP or DOWN."""
        # --- End of fix ---

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            with autocast(device_type='cuda', dtype=torch.bfloat16):
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=3, # Keep small
                    pad_token_id=tokenizer.eos_token_id,
                    do_sample=False
                )

        response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip().upper()

        if response_text.startswith("UP"):
            pred = "UP"
        elif response_text.startswith("DOWN"):
            pred = "DOWN"
        else:
            pred = "UNKNOWN"

        predictions.append(pred)
        ground_truth.append(row['label'])

        if (len(predictions) > 0) and (len(predictions) % 50 == 0):
            print(f"Evaluated {len(predictions)} / {len(test_df)} examples...")

    # --- Display Results ---
    print_rich("\n--- Evaluation Results ---", style="bold magenta")

    accuracy = accuracy_score(ground_truth, predictions)
    print_rich(f"Overall Accuracy: {accuracy * 100:.2f}%")

    cm = confusion_matrix(ground_truth, predictions, labels=["UP", "DOWN", "UNKNOWN"])
    table = Table(title="Confusion Matrix")
    table.add_column("Actual", justify="right", style="cyan")
    table.add_column("Pred UP", justify="right", style="green")
    table.add_column("Pred DOWN", justify="right", style="red")
    table.add_column("Pred UNKNOWN", justify="right", style="yellow")

    labels = ["UP", "DOWN", "UNKNOWN"]
    for i, label in enumerate(labels):
        if label == "UNKNOWN" and sum(cm[i]) == 0 and "UNKNOWN" not in ground_truth: continue
        table.add_row(
            label,
            str(cm[i, 0]),
            str(cm[i, 1]),
            str(cm[i, 2])
        )
    console.print(table)

    print_rich("\nClassification Report:", style="bold white")
    report_labels = ["UP", "DOWN"]
    if "UNKNOWN" in predictions or "UNKNOWN" in ground_truth:
        report_labels.append("UNKNOWN")

    print(classification_report(ground_truth, predictions, labels=report_labels, zero_division=0))


# --- Main execution ---
if __name__ == "__main__":
    train_ds, test_ds, test_dataframe = create_finetuning_dataset(DATA_PATH)

    if train_ds is None or test_dataframe is None:
        print_rich("Dataset creation failed. Exiting.", style="bold red")
    else:
        #
        # --- !!! API FIX: Pass both train_ds and test_ds --- !!!
        #
        model, tokenizer = train_model(train_ds, test_ds) # <-- Pass both

        if model and tokenizer and test_dataframe is not None:
            evaluate_model(test_dataframe, model, tokenizer)
        else:
            print_rich("Training failed. Skipping evaluation.", style="bold red")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/7050 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/784 [00:00<?, ? examples/s]



Adding EOS to train dataset:   0%|          | 0/7050 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/7050 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/7050 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/784 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/784 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/784 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009}.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.4014,0.637417,0.556616,3210054.0,0.847874


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.4014,0.637417,0.556616,3210054.0,0.847874
2,0.371,0.452393,0.400632,6420108.0,0.88702
3,0.2272,0.282642,0.281348,9630162.0,0.927547
4,0.1407,0.190524,0.21926,12840216.0,0.950249
5,0.1222,0.17781,0.19042,16050270.0,0.953404


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Evaluated 50 / 784 examples...
Evaluated 100 / 784 examples...
Evaluated 150 / 784 examples...
Evaluated 200 / 784 examples...
Evaluated 250 / 784 examples...
Evaluated 300 / 784 examples...
Evaluated 350 / 784 examples...
Evaluated 400 / 784 examples...
Evaluated 450 / 784 examples...
Evaluated 500 / 784 examples...
Evaluated 550 / 784 examples...
Evaluated 600 / 784 examples...
Evaluated 650 / 784 examples...
Evaluated 700 / 784 examples...
Evaluated 750 / 784 examples...


              precision    recall  f1-score   support

          UP       0.54      0.71      0.62       392
        DOWN       0.58      0.39      0.47       392

    accuracy                           0.55       784
   macro avg       0.56      0.55      0.54       784
weighted avg       0.56      0.55      0.54       784

