# Finance LLM Integration Notebook

This notebook guides you through generating finance datasets, fine-tuning language models, integrating Milvus for retrieval-augmented generation (RAG), evaluating with ROUGE and LastMile AutoEval, and building a simple web UI.

In [None]:
# Install required packages (run once)
!pip install transformers datasets peft sentence-transformers pymilvus torch fastapi uvicorn[standard] reactpy lastmile jupyterlab

## Part 1: Dataset Generation

In [None]:
import csv

def _write_csv_rows(path, header, rows):
    """Write ``header`` followed by ``rows`` to ``path`` as a UTF-8 CSV file."""
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def generate_finance_datasets(output_dir="."):
    """Write the three small demo finance datasets as CSV files.

    Files created (existing files with the same name are overwritten):

    * ``finance_basic_finetune.csv`` - columns ``instruction``, ``response``
    * ``finance_peft_finetune.csv`` - column ``text``
    * ``finance_rlhf_preferences.csv`` - columns ``prompt``,
      ``chosen_response``, ``rejected_response``

    Args:
        output_dir: Directory to write the files into. Defaults to the
            current working directory; it is created if it does not exist.
    """
    # Local import: only ``csv`` is imported at cell level.
    from pathlib import Path

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    basic_data = [
        ("What is a stock?", "A stock represents ownership in a company and a claim on part of its assets and earnings."),
        ("Explain compound interest.", "Compound interest is interest calculated on the initial principal and also on accumulated interest."),
        ("What is a bond?", "A bond is a fixed income instrument representing a loan made by an investor to a borrower."),
        ("How does inflation affect investments?", "Inflation reduces the purchasing power of money, impacting investment returns."),
        ("What is diversification?", "Diversification is an investment strategy to reduce risk by allocating investments across various assets."),
    ]
    _write_csv_rows(
        target_dir / "finance_basic_finetune.csv",
        ["instruction", "response"],
        basic_data,
    )

    # One single-column row per free-text passage.
    peft_data = [
        ["Stocks are traded on exchanges such as NYSE and NASDAQ."],
        ["The Federal Reserve controls monetary policy in the US."],
        ["Diversification helps reduce unsystematic risk in portfolios."],
        ["ETFs are investment funds traded on stock exchanges."],
        ["Credit ratings assess the creditworthiness of borrowers."],
    ]
    _write_csv_rows(target_dir / "finance_peft_finetune.csv", ["text"], peft_data)

    # (prompt, preferred answer, rejected answer) triples.
    preference_data = [
        ("What is a stock?",
         "A stock represents ownership in a company and a claim on its assets and earnings.",
         "A stock is a type of bond."),
        ("Explain compound interest.",
         "Compound interest is interest on principal and accumulated interest.",
         "Compound interest is interest only on the principal."),
        ("What is diversification?",
         "Diversification spreads investments to reduce risk.",
         "Diversification means investing all in one asset."),
    ]
    _write_csv_rows(
        target_dir / "finance_rlhf_preferences.csv",
        ["prompt", "chosen_response", "rejected_response"],
        preference_data,
    )

# Write the demo CSV files so that later cells can read them.
generate_finance_datasets()
print("Finance datasets generated.")

## Part 2: Dataset Loading Functions

In [None]:
import pandas as pd
from datasets import Dataset

def load_text_dataset_from_csv(csv_path, text_columns, delimiter="\n"):
    """Load a CSV and merge selected columns into a single ``text`` column.

    Args:
        csv_path: Path of the CSV file to read.
        text_columns: Column name, or sequence of column names, whose values
            are joined (in the given order) to build each ``text`` entry.
        delimiter: String placed between the joined column values.

    Returns:
        A ``datasets.Dataset`` with one ``text`` column and one row per CSV row.

    Raises:
        KeyError: If any requested column is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Accept a bare column name as well as a list of names.
    columns = [text_columns] if isinstance(text_columns, str) else list(text_columns)
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in {csv_path}: {missing}")

    # Missing cells become empty strings rather than the literal text "nan".
    # Building the list row by row also works for a header-only CSV.
    texts = [
        delimiter.join("" if pd.isna(value) else str(value) for value in row)
        for row in df[columns].itertuples(index=False, name=None)
    ]
    return Dataset.from_pandas(pd.DataFrame({'text': texts}))

def load_text_dataset_from_txt(txt_path):
    """Build a single-column ``text`` dataset from a UTF-8 text file.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Every remaining line becomes one record.
    """
    records = []
    with open(txt_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                records.append({'text': stripped})
    return Dataset.from_list(records)

## Part 3: Basic Fine-tuning Example

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import torch

def basic_fine_tuning(model_name, dataset_path, output_dir='./basic_finetune_results'):
    """Fine-tune a causal language model on an instruction/response CSV.

    Each CSV row is rendered as ``"Human: <instruction>\\nAssistant: <response>"``,
    tokenized (truncated to 512 tokens) and used for one epoch of
    next-token-prediction training. The final model and tokenizer are written
    to ``output_dir``.

    Args:
        model_name: Model identifier or local path accepted by ``from_pretrained``.
        dataset_path: CSV file with ``instruction`` and ``response`` columns.
        output_dir: Directory for checkpoints and the final saved model.
    """
    # Local import: the cell-level import does not bring in the collator.
    from transformers import DataCollatorForLanguageModeling

    dataset = load_text_dataset_from_csv(dataset_path, ['instruction', 'response'], delimiter="\nAssistant: ")
    dataset = dataset.map(lambda x: {'text': f"Human: {x['text']}"})

    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Some causal-LM tokenizers ship without a pad token; batching needs one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=512)

    # Drop the raw string column so only tensor-convertible fields reach the collator.
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
    )

    # The evaluation-strategy argument is omitted: its default is already "no",
    # and the keyword was renamed between transformers releases.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,  # Increase for real training
        per_device_train_batch_size=2,
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        learning_rate=5e-5,
        weight_decay=0.01,
        # NOTE: on the tiny demo CSV the run has fewer than 100 steps, so the
        # learning rate never leaves warmup; lower this for very small datasets.
        warmup_steps=100,
        fp16=torch.cuda.is_available(),
    )

    # mlm=False makes the collator pad each batch and copy input_ids into
    # labels, which the Trainer needs to compute a causal-LM loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    trainer.train()

    # Periodic checkpoints only fire every `save_steps`; save the final
    # weights and tokenizer explicitly so the message below is true.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Basic fine-tuning done. Model saved at {output_dir}")

# Example call (the Llama 3 weights are gated on Hugging Face; request access and log in first):
# basic_fine_tuning("meta-llama/Meta-Llama-3-8B", "finance_basic_finetune.csv")

## Part 4: Milvus RAG Setup and Inference

In [None]:
from pymilvus import MilvusClient
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

class MilvusRAG:
    """Minimal document store for retrieval-augmented generation on Milvus.

    Documents are embedded with a SentenceTransformer model and stored in a
    Milvus collection as rows holding an ``id``, the embedding ``vector`` and
    the original ``text``. Because the text is stored with the vector, search
    results remain usable after a kernel restart; ``document_store`` is kept
    as an in-memory mirror of what this instance inserted.
    """

    def __init__(self, db_path="milvus_rag_db.db", collection_name="rag_collection",
                 embedding_dim=None, embedding_model="all-MiniLM-L6-v2"):
        """Open the database and ensure the collection exists.

        Args:
            db_path: Path or URI handed to ``MilvusClient``.
            collection_name: Collection to use; created if it does not exist.
            embedding_dim: Vector dimension. ``None`` (default) uses the
                dimension reported by the embedding model.
            embedding_model: SentenceTransformer model name or path.

        Raises:
            ValueError: If ``embedding_dim`` is given and differs from the
                dimension the embedding model produces.
        """
        # Load the embedder once instead of on every insert/search call.
        self.embedder = SentenceTransformer(embedding_model)
        model_dim = self.embedder.get_sentence_embedding_dimension()
        if embedding_dim is not None and embedding_dim != model_dim:
            raise ValueError(
                f"embedding_dim={embedding_dim} does not match the {model_dim}-dimensional "
                f"vectors produced by '{embedding_model}'"
            )

        self.client = MilvusClient(db_path)
        self.collection_name = collection_name
        self.embedding_dim = model_dim
        if not self.client.has_collection(collection_name):
            self.client.create_collection(collection_name, dimension=self.embedding_dim)
        self.document_store = {}

    def insert_documents(self, documents):
        """Embed ``documents`` and insert them into the collection.

        Args:
            documents: Iterable of document strings.

        Returns:
            The list of integer ids assigned to the inserted documents, in
            input order (empty when ``documents`` is empty).
        """
        import uuid

        documents = list(documents)
        if not documents:
            return []

        embeddings = self.embedder.encode(documents, convert_to_numpy=True).tolist()
        # The client expects one dict per row. Random 63-bit ids fit a signed
        # int64 key and avoid collisions with rows from earlier sessions.
        rows = [
            {"id": uuid.uuid4().int >> 65, "vector": vector, "text": text}
            for vector, text in zip(embeddings, documents)
        ]
        self.client.insert(collection_name=self.collection_name, data=rows)
        for row in rows:
            self.document_store[row["id"]] = row["text"]
        self.client.flush(self.collection_name)
        return [row["id"] for row in rows]

    def search(self, query, top_k=3):
        """Return the texts of the ``top_k`` documents most similar to ``query``.

        A hit whose text is found neither in the search result nor in
        ``document_store`` yields an empty string.
        """
        query_embedding = self.embedder.encode([query], convert_to_numpy=True).tolist()
        results = self.client.search(
            collection_name=self.collection_name,
            data=query_embedding,
            limit=top_k,
            output_fields=["text"],
        )
        if not results:
            return []

        texts = []
        for hit in results[0]:
            entity = hit.get("entity") or {}
            # Prefer the text stored in Milvus; fall back to the in-memory
            # mirror for rows inserted without a text field.
            text = entity.get("text")
            if text is None:
                text = self.document_store.get(hit["id"], "")
            texts.append(text)
        return texts

def rag_generate(query, rag_client, llm_model, tokenizer, top_k=3, max_length=200):
    """Answer ``query`` with a causal LM, using retrieved documents as context.

    Args:
        query: The user's question.
        rag_client: Object with a ``search(query, top_k=...)`` method that
            returns a list of document strings.
        llm_model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``llm_model``.
        top_k: Number of documents to retrieve.
        max_length: Maximum number of tokens to generate for the answer.
            The prompt does not count against this budget, so a long
            retrieved context cannot crowd out the answer.

    Returns:
        The generated answer text only, with the prompt removed and
        surrounding whitespace stripped.
    """
    retrieved_docs = rag_client.search(query, top_k=top_k)
    # Skip empty hits so they do not add blank lines to the context.
    context = "\n".join(doc for doc in retrieved_docs if doc)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt")
    # Put the inputs on the model's device; otherwise a GPU model fails on CPU tensors.
    device = getattr(llm_model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    prompt_token_count = inputs["input_ids"].shape[-1]
    outputs = llm_model.generate(**inputs, max_new_tokens=max_length)

    # A causal LM returns prompt + continuation; keep only the continuation.
    answer_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

## Part 5: ROUGE Evaluation

In [None]:
from datasets import load_metric

def compute_rouge(predictions, references):
    """Compute ROUGE for ``predictions`` against ``references``.

    The scores are printed and then returned unchanged.
    """
    # NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
    # favour of the separate `evaluate` package - confirm the pinned version
    # still provides it before relying on this cell.
    metric = load_metric("rouge")
    scores = metric.compute(references=references, predictions=predictions)
    print("ROUGE scores:", scores)
    return scores

## Part 6: LastMile AutoEval Integration

In [None]:
try:
    from lastmile.lib.auto_eval import AutoEval, BuiltinMetrics
    LASTMILE_AVAILABLE = True
except ImportError:
    LASTMILE_AVAILABLE = False

def setup_lastmile(api_token):
    """Create a LastMile ``AutoEval`` client authenticated with ``api_token``.

    Raises:
        ImportError: If the ``lastmile`` SDK could not be imported.
    """
    if LASTMILE_AVAILABLE:
        autoeval_client = AutoEval(api_token=api_token)
        print("Connected to LastMile AutoEval")
        return autoeval_client
    raise ImportError("Install lastmile SDK: pip install lastmile")

def run_lastmile_builtin_eval(client, dataset_path):
    """Upload a dataset file and score it with built-in LastMile metrics.

    Args:
        client: ``AutoEval`` client, e.g. the one returned by ``setup_lastmile``.
        dataset_path: Path of the dataset file to upload.

    Returns:
        The evaluation result returned by the client; its ``head()`` is printed.

    Raises:
        ImportError: If the ``lastmile`` SDK could not be imported.
    """
    # Without the SDK, `BuiltinMetrics` is undefined; fail with a clear message
    # instead of a NameError.
    if not LASTMILE_AVAILABLE:
        raise ImportError("Install lastmile SDK: pip install lastmile")

    dataset_id = client.upload_dataset(file_path=dataset_path, name="Eval Dataset")
    # An uploaded dataset is scored by id via `evaluate_dataset`;
    # `evaluate_data` takes an in-memory DataFrame instead.
    results = client.evaluate_dataset(
        dataset_id=dataset_id,
        metrics=[BuiltinMetrics.FAITHFULNESS, BuiltinMetrics.RELEVANCE],
    )
    print("LastMile built-in evaluation results:\n", results.head())
    return results

## Part 7: Simple Web UI with ReactPy (runs inside Jupyter)

In [None]:
from reactpy import component, html, run
from reactpy.backend.fastapi import configure
from fastapi import FastAPI

# FastAPI application that the ReactPy component is mounted on below.
app = FastAPI()

# Module-level placeholders, all initialised to None.
# NOTE(review): no cell visible here assigns these or registers the
# `/generate` route that ChatApp posts to - confirm that happens elsewhere.
rag_client = model = tokenizer = None

@component
def ChatApp(api_url="http://localhost:8000/generate"):
    """Single-page chat form that posts a question and shows the answer.

    Args:
        api_url: Endpoint that accepts ``{"query": <str>}`` as JSON and
            responds with a JSON object containing an ``answer`` field.
    """
    from reactpy import event, use_state

    query, set_query = use_state("")
    answer, set_answer = use_state("")
    loading, set_loading = use_state(False)

    # ReactPy passes event data as a plain dict, so the browser's default
    # form submission is suppressed through the decorator rather than a
    # method call on the event.
    @event(prevent_default=True)
    async def on_submit(_event):
        import httpx

        if not query.strip():
            return  # Nothing to ask.

        set_loading(True)
        try:
            # Generation can take far longer than httpx's 5 s default timeout.
            async with httpx.AsyncClient(timeout=120.0) as client:
                response = await client.post(api_url, json={"query": query})
                response.raise_for_status()
                data = response.json()
            set_answer(data.get("answer", "No answer"))
        except (httpx.HTTPError, ValueError) as exc:
            # ValueError covers a response body that is not valid JSON.
            set_answer(f"Request failed: {exc}")
        finally:
            # Always re-enable the button, even when the request fails.
            set_loading(False)

    return html.section(
        html.h1("Finance Q&A Chatbot"),
        html.form(
            # Attributes, including event handlers, go in the leading dict.
            {"on_submit": on_submit},
            html.textarea(
                {"rows": 4, "cols": 60, "value": query, "on_change": lambda e: set_query(e["target"]["value"]), "placeholder": "Ask a finance question..."}
            ),
            html.br(),
            html.button({"type": "submit", "disabled": loading}, "Ask" if not loading else "Thinking..."),
        ),
        html.h2("Answer:"),
        html.p(answer),
    )

# Attach the ChatApp component to the FastAPI application created above.
configure(app, ChatApp)

# To run the UI inside this notebook, uncomment below:
# run(ChatApp)