In [1]:
import pickle

# Load the pickle file
with open('Vectordatabase/financial_chunks_metadata_sentences.pkl', 'rb') as file:
    text_chunks = pickle.load(file)

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Initialize SentenceTransformer model
text_embedder = SentenceTransformer('all-MiniLM-L6-v2')


def retrieve_similar(query, k=1):
    # Get query embedding
    query_embedding = text_embedder.encode([query])
    # Search in FAISS index
    distances, indices = index.search(query_embedding.astype(np.float32), k)
    return indices, distances

def generate_response(query):
    indices, distances = retrieve_similar(query)
    context = "\n".join([text_chunks[idx]['raw_text'] for _, idx in enumerate(indices[0])])

    return context

  from tqdm.autonotebook import tqdm, trange


In [2]:
import pandas as pd

# Function to convert text file to DataFrame
def text_to_dataframe(file_path):
    questions = []
    answers = []
    
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()
        entries = content.split('\n\n')  # Split on double newline for each Q&A
        
        for entry in entries:
            if entry.strip():  # Ensure the entry is not empty
                lines = entry.split('\n')
                if len(lines) >= 2:  # Ensure there are at least 2 lines (Q and A)
                    question = lines[0].replace('Q: ', '').strip()
                    answer = lines[1].replace('A: ', '').strip()
                    questions.append(question)
                    answers.append(answer)

    # Create a DataFrame
    df = pd.DataFrame({'Question': questions, 'Answer': answers})
    return df

In [13]:
# For document in Data that's not PTLO 2023 Q4 10K
import os


folder_path = "../Data"

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        filename_without_spaces_and_extension = filename.replace(" ", "").replace(".pdf", "")
        index = faiss.read_index('Vectordatabase/financial_docs_text_index_sentences_' + filename_without_spaces_and_extension + '.faiss')
        with open('Vectordatabase/financial_chunks_metadata_sentences_' + filename_without_spaces_and_extension + '.pkl', 'rb') as file:
            text_chunks = pickle.load(file)
        file_path = "../Data/"
        file_path = file_path + filename.replace(".pdf", "")
        file_path = file_path + ".txt"
        df = text_to_dataframe(file_path)
        df['RAG_context'] = df['Question'].apply(generate_response)
        df.to_csv("LLM_" + filename_without_spaces_and_extension + "_test.csv")

        

In [15]:
def generate_response(query, context):
    
    input_text = f"question: {query} context: {context}. Please limit the response to 5 sentences, and use visual language to explain the concept. No images, and no newlines just a short one sentence overall summary, then the next few building up a story to support the summary."


    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": input_text,
            }
        ],
        model="llama3-8b-8192",
    )

    response = chat_completion.choices[0].message.content

    return response

In [16]:
import time
from groq import Groq

client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

folder_path = "Test_Files"

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    df = pd.read_csv(folder_path + "/" + filename)
    # Loop through the DataFrame, processing 30 rows every minute
    for start_idx in range(0, len(df), 30):
        end_idx = min(start_idx + 30, len(df))  # Ensure we don't go out of bounds
        
        # Apply the function to the current chunk using both columns
        df.loc[start_idx:end_idx-1, 'RAG_5_sent_visual'] = df.loc[start_idx:end_idx-1].apply(
            lambda row: generate_response(row['Question'], row['RAG_context']), axis=1
        )
        
        # Print progress for tracking
        print(f"Processed rows {start_idx} to {end_idx - 1}")
        
        # Sleep for 60 seconds after processing each chunk
        if end_idx < len(df):  # No need to sleep after the last chunk
            print("Sleeping for 1 minute...")
            time.sleep(60)
    df.to_csv("LLM_Test_Files/" + filename)

Processed rows 0 to 29
Sleeping for 1 minute...
Processed rows 30 to 51
Processed rows 0 to 29
Sleeping for 1 minute...
Processed rows 30 to 44
Processed rows 0 to 29
Sleeping for 1 minute...
Processed rows 30 to 52
Processed rows 0 to 29
Sleeping for 1 minute...
Processed rows 30 to 59
Sleeping for 1 minute...
Processed rows 60 to 61
