In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, PyPDFDirectoryLoader, PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
import openai
import pandas as pd
import requests
import json
import datetime
import shutil
import os
from datasets import load_dataset
import time

In [2]:
dataset = load_dataset("PatronusAI/financebench")

In [3]:
df = pd.DataFrame(dataset['train'])

In [4]:
OPENAI_KEY = os.getenv('OPENAI_KEY')
openai.api_key = OPENAI_KEY

In [5]:
OPENAI_KEY

'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'

In [6]:
df.head(2)

Unnamed: 0,financebench_id,doc_name,doc_link,doc_period,question_type,question,answer,evidence_text,page_number
0,financebench_id_03029,3M_2018_10K,https://investors.3m.com/financials/sec-filing...,2018,metrics-generated,What is the FY2018 capital expenditure amount ...,$1577.00,Table of Contents \n3M Company and Subsidiarie...,60
1,financebench_id_04672,3M_2018_10K,https://investors.3m.com/financials/sec-filing...,2018,metrics-generated,Assume that you are a public equities analyst....,$8.70,Table of Contents \n3M Company and Subsidiarie...,58


In [7]:
def text_to_openai_json(data,filename):
    """
    Converts a given dataset into a JSON Lines (JSONL) file suitable for OpenAI's GPT-3.5 turbo model.
    
    Args:
        data (DataFrame or similar data structure): Input data containing text and labels.

    The function processes the input data row by row, constructing conversations for each row with a system message, user message, and an assistant message. It then writes the generated conversation data to a JSONL file.
 
    """
    # Initialize an empty list to store conversation data
    message_list = []

    # Iterate through the rows in the input data
    for _, row in data.iterrows():
        # Create a system message as an initial instruction
        system_message = {
            "role": "system",
            "content":  f"You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information." 
        }

        # Append the system message to the conversation
        message_list.append({"messages": [system_message]})

        # Create a user message based on the 'text' column from the data
        user_message = {
            "role": "user",
            "content": f"{row['evidence_text']} based on {row['question']}  "
        }

        # Append the user message to the conversation
        message_list[-1]["messages"].append(user_message)

        # Create an assistant message based on the 'coarse_label' column from the data
        assistant_message = {
            "role": 'assistant',
            "content": row['answer']
        }

        # Append the assistant message to the conversation
        message_list[-1]["messages"].append(assistant_message)

    # Write the conversation data to a JSON Lines (JSONL) file
    with open(filename, "w") as json_file:
        for message in message_list:
            # Serialize the conversation data to JSON and write it to the file
            json.dump(message, json_file)
            json_file.write("\n")

In [8]:
df.head(2)

Unnamed: 0,financebench_id,doc_name,doc_link,doc_period,question_type,question,answer,evidence_text,page_number
0,financebench_id_03029,3M_2018_10K,https://investors.3m.com/financials/sec-filing...,2018,metrics-generated,What is the FY2018 capital expenditure amount ...,$1577.00,Table of Contents \n3M Company and Subsidiarie...,60
1,financebench_id_04672,3M_2018_10K,https://investors.3m.com/financials/sec-filing...,2018,metrics-generated,Assume that you are a public equities analyst....,$8.70,Table of Contents \n3M Company and Subsidiarie...,58


In [9]:
def fine_tune_model(model_id,num_label,pandas_df):
    df = pandas_df.iloc[:num_label]
    filename = f'ft_increment_{num_label}.jsonl'
    text_to_openai_json(df, filename)
    loader = openai.File.create(file=open(filename, "rb"), purpose='fine-tune')
    fine_tuning_job = openai.FineTuningJob.create(training_file=loader.id, model="gpt-3.5-turbo-1106")
    return fine_tuning_job.id

In [10]:
def wait_for_fine_tuning(job_id):
    while True:
        response = openai.FineTuningJob.retrieve(job_id)
        print(response["fine_tuned_model"])
        if response["fine_tuned_model"]:
            print(response["fine_tuned_model"])
            return response["fine_tuned_model"]
        time.sleep(30)

In [11]:
system_content = "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information." 


In [12]:
df['question']

0      What is the FY2018 capital expenditure amount ...
1      Assume that you are a public equities analyst....
2      Is 3M a capital-intensive business based on FY...
3      What drove operating margin change as of FY202...
4      If we exclude the impact of M&A, which segment...
                             ...                        
145    Is Verizon a capital intensive business based ...
146    Has Verizon increased its debt on balance shee...
147    What is FY2018 days payable outstanding (DPO) ...
148    Based on the information provided primarily in...
149    What is the FY2018 - FY2020 3 year average una...
Name: question, Length: 150, dtype: object

In [13]:
def generate_10K_responses(data,model_id):
    syntheses = []
    system_content = "You are a factual chatbot that answers questions about 10-K documents. You only answer with answers you find in the text, no outside information." 
    for idx, row in data.iterrows():
        completion = openai.ChatCompletion.create(
            model= model_id ,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": f"{row['question']} based on {row['evidence_text']}" }
            ])
        
        #print(f'text: {row}')
        print(completion.choices[0].message.content)
        syntheses.append(completion.choices[0].message.content)
    syntheses_df = pd.DataFrame({'evidence_text': data['evidence_text'], 'answer' : data['answer'], 'syntheses' : syntheses })
    
    return syntheses_df


In [19]:
count = 0
label_count = [] 
model_ids = []


In [20]:
for i in range(5):
    count += 10
    label_count.append(count)
    ft_id = fine_tune_model(model_id = 'gpt-3.5-turbo-1106', num_label=count, pandas_df=df)
    if wait_for_fine_tuning(ft_id) is not None:
        model_ids.append(wait_for_fine_tuning(ft_id))
        syntheses_df = generate_10K_responses(data = df, model_id = wait_for_fine_tuning(ft_id))
        syntheses_df.to_csv(f'syntheses_df{count}.csv',index=False)
       

None
None
None
None
None
None
None
None
None
None
None
ft:gpt-3.5-turbo-1106:liangzhang::8bYkDRZK
ft:gpt-3.5-turbo-1106:liangzhang::8bYkDRZK
ft:gpt-3.5-turbo-1106:liangzhang::8bYkDRZK
ft:gpt-3.5-turbo-1106:liangzhang::8bYkDRZK
ft:gpt-3.5-turbo-1106:liangzhang::8bYkDRZK
ft:gpt-3.5-turbo-1106:liangzhang::8bYkDRZK
$1577.00
$8.70
No, 3M is a technology-driven company with a significant manufacturing and marketing presence. It is less capital-intensive than traditional chemical companies.
The decrease in operating margin for 3M in FY2022 was primarily due to the impact of special items, including:
-Decreased by 2.2%, primarily due to the impact of special items
The Consumer segment shrunk by 0.9% organically.
No. The quick ratio is not relevant to measure liquidity.
The 1.500% Notes due 2026 (Trading Symbol: MMM26) and
1.750% Notes due 2030 (Trading Symbol: MMM30) are registered to trade on the New York Stock Exchange.
Yes, 3M has a good track record of consistently increasing its dividend 