<a href="https://colab.research.google.com/github/kairamilanifitria/PurpleBox-Intern/blob/main/03_11_agent_pdf1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# conversational agent

In [None]:
!pip install supabase

In [3]:
import os
import ast
import json
import torch
import uuid
import numpy as np
import re
import openai
from supabase import create_client, Client
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from collections import deque


SUPABASE_URL = "____________"
SUPABASE_KEY = "________________"

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load Embedding Model
tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# OpenAI API Key
OPENAI_API_KEY = "__________________"
openai.api_key = OPENAI_API_KEY

# Chat history storage (FIFO queue for memory management)
import numpy as np
import ast
import re
from scipy.spatial.distance import cosine
from collections import deque

# Chat history storage (FIFO queue for memory management)
chat_history = deque(maxlen=10)  # Keeps the last 10 interactions

def get_embedding(text):
    """Generates an embedding vector from input text."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().tolist()

def query_supabase(user_query):
    """Retrieves both text and table chunks based on query, using improved embeddings and keyword matching."""

    #### Step 1: Get Query Embedding ####
    query_embedding = np.array(get_embedding(user_query), dtype=np.float32).flatten()

    #### Step 2: Retrieve Text Chunks (Vector Search) ####
    response_text = supabase.table("documents").select("chunk_id, content, embedding, type, metadata").execute()
    text_results = []

    for record in response_text.data:
        chunk_embedding = record["embedding"]

        # Convert stored string embeddings to list if needed
        if isinstance(chunk_embedding, str):
            chunk_embedding = ast.literal_eval(chunk_embedding)

        chunk_embedding = np.array(chunk_embedding, dtype=np.float32).flatten()

        if chunk_embedding.shape == query_embedding.shape:
            similarity = 1 - cosine(query_embedding, chunk_embedding)
            text_results.append((record["chunk_id"], "text", record["content"], similarity))

    #### Step 3: Retrieve Table Chunks (Description + Embedding Match) ####
    response_tables = supabase.table("tables").select("chunk_id, table_data, description, embedding, metadata").execute()
    table_results = []

    for record in response_tables.data:
        table_data = record["table_data"]
        metadata = record.get("metadata", {})
        table_description = record.get("description", "")  # Use generated description
        table_embedding = record.get("embedding", None)

        # Ensure metadata fields are strings
        table_title = str(metadata.get("table_title", ""))
        section = str(metadata.get("section", ""))

        # Extract table number from the query (if any)
        table_number_match = re.search(r'table (\d+)', user_query, re.IGNORECASE)
        specified_table_number = table_number_match.group(1) if table_number_match else None

        # Step 3.1: Keyword Matching for Table Title, Section & Description
        keyword_match_score = 0
        if re.search(rf"\b{re.escape(user_query)}\b", table_title, re.IGNORECASE):
            keyword_match_score += 0.5  # Higher weight for title match
        if re.search(rf"\b{re.escape(user_query)}\b", section, re.IGNORECASE):
            keyword_match_score += 0.3  # Lower weight for section match
        if re.search(rf"\b{re.escape(user_query)}\b", table_description, re.IGNORECASE):
            keyword_match_score += 0.7  # Highest weight for description match

        # Prioritize the exact table number if mentioned
        if specified_table_number and specified_table_number in table_title.lower():
            keyword_match_score += 1.0  # Give a strong boost to matching table numbers

        # Step 3.2: Compute Embedding Similarity
        if table_embedding:
            if isinstance(table_embedding, str):
                table_embedding = ast.literal_eval(table_embedding)  # Convert string to list
            table_embedding = np.array(table_embedding, dtype=np.float32).flatten()

            if table_embedding.shape == query_embedding.shape:
                similarity = 1 - cosine(query_embedding, table_embedding)
                final_score = (0.7 * similarity) + (1.3 * keyword_match_score)  # Boost keyword matching
                table_results.append((record["chunk_id"], "table", table_description, final_score))

    #### Step 4: Merge & Sort Results ####
    all_results = text_results + table_results
    all_results.sort(key=lambda x: x[3], reverse=True)  # Sort by final similarity score

    return all_results[:5]  # Return top 5 results

# Function to call OpenAI LLM with chat history
def call_openai_llm(user_query, retrieved_chunks, chat_history=[]):
    """Send the query along with retrieved context and chat history to OpenAI API."""

    # Prepare context from retrieved chunks
    context_text = "\n\n".join([f"Chunk {i+1}: {chunk[2]}" for i, chunk in enumerate(retrieved_chunks)])

    # Construct messages for conversational memory
    messages = [
        {"role": "system", "content": "You are an intelligent assistant. Use the following retrieved information to answer the user's query."},
    ]

    # Append chat history
    messages.extend(chat_history)

    # Append current query with retrieved context
    messages.append({"role": "user", "content": f"Context:\n{context_text}\n\nUser's Question: {user_query}"})

    # Call OpenAI's Chat API with the new format
    client = openai.OpenAI(api_key=openai.api_key)  # Ensure you are using the new client-based API
    response = client.chat.completions.create(
        model="gpt-4-turbo",  # You can change this to another OpenAI model
        messages=messages,
        temperature=0.7
    )

    answer = response.choices[0].message.content  # Adjusted based on the new API response format

    # Append response to chat history
    chat_history.append({"role": "user", "content": user_query})
    chat_history.append({"role": "assistant", "content": answer})

    return answer, chat_history


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: {'classifier.weight', 'classifier.bias'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Example usage
user_query = "is there any statistical or tables data?"
retrieved_chunks = query_supabase(user_query)
chat_history = []  # Store conversation history

if retrieved_chunks:
    response, chat_history = call_openai_llm(user_query, retrieved_chunks, chat_history)
    print("\n🔹 Chatbot Response:\n", response)
else:
    print("No relevant information found.")



🔹 Chatbot Response:
 Yes, there are statistical tables and data mentioned in the provided chunks:

1. In Chunk 1, there is mention of "Table 3: Classification Report for Best Volatility Direction Model" associated with the 'overall daily average title' sentiment score for the third data input.
2. In Chunk 2, there is mention of "Table 2: Classification Report for Best Volatility Direction Model" associated with the 'overall daily average' sentiment score for the second data input.
3. In Chunk 5, there is a reference to a table under Appendix A titled "Ticker News Weight Table," which includes data fields such as Close Price, Capital, Capital Weight %, and News Weight %.


In [11]:
new_query = "what about table 1?"

response, chat_history = call_openai_llm(new_query, retrieved_chunks, chat_history)

print("\n🔹 Chatbot Response:\n", response)


🔹 Chatbot Response:
 Table 1 is mentioned in Chunk 5. It presents the Classification Report for Best Volatility Direction Model based on the 'daily weighted average' sentiment score. This table is part of the results generated from the application of a simple Linear Discriminant Analysis (LDA) algorithm, which was used to forecast results with volatility direction as the predicted target, utilizing sentiment scores as the training parameters. This table is likely to contain performance metrics such as precision, recall, F1-score, and support for different classes, similar to the details provided in Tables 2 and 3.


In [12]:
new_query = "what are metrics used in all data table?"

response, chat_history = call_openai_llm(new_query, retrieved_chunks, chat_history)

print("\n🔹 Chatbot Response:\n", response)


🔹 Chatbot Response:
 The metrics used in the classification reports for the Best Volatility Direction Model across the tables mentioned are:

1. **Precision**: Measures the accuracy of positive predictions. It is the ratio of correctly predicted positive observations to the total predicted positives.
2. **Recall** (also known as Sensitivity or True Positive Rate): Measures the ability of a model to identify all relevant instances. It is the ratio of correctly predicted positive observations to all observations in the actual class.
3. **F1-Score**: The weighted average of Precision and Recall. This score takes both false positives and false negatives into account, providing a balance between Precision and Recall in uneven class distribution scenarios.
4. **Support**: The number of actual occurrences of the class in the specified dataset.

These metrics are typical for classification reports and help in assessing the performance of the classification model across different classes.


In [5]:
# Example usage
user_query = "explain all table data provided in the document?"
retrieved_chunks = query_supabase(user_query)
chat_history = []  # Store conversation history

if retrieved_chunks:
    response, chat_history = call_openai_llm(user_query, retrieved_chunks, chat_history)
    print("\n🔹 Chatbot Response:\n", response)
else:
    print("No relevant information found.")



🔹 Chatbot Response:
 The document appears to include several tables related to sentiment analysis and volatility prediction models, as well as adjustments in data related to stock market indices. Here’s an explanation of each table mentioned:

1. **Table 3: Classification Report for Best Volatility Direction Model (Chunk 1)** - This table likely presents the results of a classification model that predicts the direction of market volatility based on the 'overall daily average title sentiment score'. A classification report typically includes measures such as accuracy, precision, recall, and F1-score for each class predicted by the model.

2. **Table 2: Classification Report for Best Volatility Direction Model (Chunk 2)** - Similar to Table 3, this table presents the results of a classification model. However, it uses the 'overall daily average sentiment score' as input. It also likely includes metrics to evaluate the performance of the model in predicting the direction of market volati

In [6]:
# Example usage
user_query = "explain all statistical data provided in the document"
retrieved_chunks = query_supabase(user_query)
chat_history = []  # Store conversation history

if retrieved_chunks:
    response, chat_history = call_openai_llm(user_query, retrieved_chunks, chat_history)
    print("\n🔹 Chatbot Response:\n", response)
else:
    print("No relevant information found.")



🔹 Chatbot Response:
 The statistical data and methodologies described in the document are centered around the analysis of sentiment scores derived from financial data and their impact on volatility direction models in a financial setting. Here’s a breakdown based on the provided chunks:

### Chunk 1 & Chunk 2: Data and LDA Model Results
- **Data Input**: These chunks mention different data inputs related to sentiment scores:
  - The 'overall daily average title' sentiment score.
  - The 'overall daily average' sentiment score.
- **Model Results**: Both chunks reference a "Classification Report for Best Volatility Direction Model". This likely includes statistical metrics (such as accuracy, precision, recall, F1-score) used to evaluate how well the model predicts the direction of volatility based on the sentiment scores.

### Chunk 3: Machine Learning Procedures
- **Adjusted Sentiment Scores**: This mentions that columns in a dataset represent adjusted sentiment scores, which are tweak

In [16]:
# Example usage
user_query = "what does Deveikyte state in his paper?"
retrieved_chunks = query_supabase(user_query)
chat_history = []  # Store conversation history

if retrieved_chunks:
    response, chat_history = call_openai_llm(user_query, retrieved_chunks, chat_history)
    print("\n🔹 Chatbot Response:\n", response)
else:
    print("No relevant information found.")



🔹 Chatbot Response:
 Deveikyte et al. in their paper applied Latent Dirichlet Allocation to forecast market prices and volatility. They proposed a thorough approach to compute the sentiment score and forecast the directions of the next day's market return and volatility specifically for FTSE100 stocks.


In [18]:
# Example usage
user_query = "where does the Primary data obtained?"
retrieved_chunks = query_supabase(user_query)
chat_history = []  # Store conversation history

if retrieved_chunks:
    response, chat_history = call_openai_llm(user_query, retrieved_chunks, chat_history)
    print("\n🔹 Chatbot Response:\n", response)
else:
    print("No relevant information found.")



🔹 Chatbot Response:
 The primary data mentioned in the provided chunks is primarily sourced from news coverage related to the semiconductor sector, specifically focusing on tickers from the iShares Semiconductor ETF (SOXX). The sentiment scores are derived from aggregated news data, which is then processed and adjusted by individual tickers' weights within the overall SOXX sector. Additionally, certain data elements such as the tickers' weights are adjusted by removing entries that do not represent actual companies, as seen in Chunk 4 with the removal of futures indexes. This data is then used to compute various sentiment scores, which are fed into a Linear Discriminant Analysis model to predict market volatility directions.
