In [30]:
import lamini
import pandas as pd
from lamini.index.lamini_index import LaminiIndex

from contract_poc.utils import read_pdf, build_prompts_from_dataframe
from contract_poc.qa_pipeline import QuestionAnswerPipeline, load_qa_prompts, save_answers
from contract_poc.answer_pipeline import AnswerPipeline

In [31]:
# Parse the Uber 2021 PDF with the project's read_pdf helper.
# NOTE(review): a later cell looks entries up by "Page <n>" keys, so the result is
# presumably a page-keyed mapping — confirm in contract_poc.utils.
uber_text = read_pdf("data/uber_2021.pdf")

In [32]:
# Parse the Lyft 2021 PDF with the project's read_pdf helper.
# NOTE(review): a later cell looks entries up by "Page <n>" keys, so the result is
# presumably a page-keyed mapping — confirm in contract_poc.utils.
lyft_text = read_pdf("data/lyft_2021.pdf")

In [33]:
# Number of entries read_pdf returned for the Uber PDF (307 in the recorded run).
len(uber_text) 

307

In [34]:
# Number of entries read_pdf returned for the Lyft PDF (238 in the recorded run).
len(lyft_text)

238

# Build Index for PDFs

In [6]:
# Every Uber entry followed by every Lyft entry, each wrapped in its own
# single-element list, in the iteration order of the two containers.
index_pages = [[uber_text[key]] for key in uber_text]
index_pages.extend([lyft_text[key]] for key in lyft_text)

In [7]:
# Total number of single-entry batches (545 in the recorded run = 307 Uber + 238 Lyft).
len(index_pages)

545

In [8]:
# Build a LaminiIndex from index_pages (one single-element batch per entry).
# The recorded run took roughly 33 minutes for 545 batches.
# NOTE(review): that run printed "Error in adding embeddings to index." twice, so
# some batches may be missing from the index — verify before relying on search results.
llm_index = LaminiIndex.build_index(index_pages)

Building index with 545 batches


 87%|███████████████████████████████████▋     | 474/545 [13:25<36:36, 30.94s/it]

Error in adding embeddings to index.


 98%|███████████████████████████████████████▏| 534/545 [32:30<58:29, 319.03s/it]

Error in adding embeddings to index.


100%|█████████████████████████████████████████| 545/545 [32:46<00:00,  3.61s/it]


In [13]:
# Persist the built index under "model_index"; the next cell reloads it from the same path.
llm_index.save_index("model_index")

In [1]:
# Reload the index previously saved under "model_index".
# The import cell must run first: the recorded run (execution count 1, apparently a
# fresh kernel) failed with NameError because LaminiIndex had not been imported.
llm_index = LaminiIndex.load_index("model_index")

NameError: name 'LaminiIndex' is not defined

In [38]:
# Embed the question-generation instruction; the next cell uses it as the search query.
embed = llm_index.get_embeddings("Ask three separate questions around a fact, table, or number within this text:")

In [40]:
# Search the underlying index for 2 results; the first return value is discarded and
# only the indices are kept.
# NOTE(review): assumes a FAISS-style (distances, indices) return — confirm.
_, indices = llm_index.index.search(embed, 2)

In [43]:
# Display the indices returned by the search.
indices

array([[308, 360]])

In [44]:
# Map each index in the first row of the search result back to its stored split.
[llm_index.splits[i] for i in indices[0]]

2

# QA Pipeline

In [6]:
# Question/answer pipeline configured with the question-generation system prompt
# (the trailing space in the prompt is intentional and preserved).
qa_pipeline = QuestionAnswerPipeline(
    question_system_prompt="Ask three separate questions around a fact, table, or number within this text: ",
)

In [8]:
# Run the QA pipeline over every Uber entry and save the results to uber_generated_qa.jsonl.
# NOTE(review): the recorded run ended with a CancelledError after an APIError
# ("Request Timeout"), so the saved file may be incomplete — re-run before using it.
answers = qa_pipeline.call(load_qa_prompts([uber_text[page] for page in uber_text]))
await save_answers(answers, path="uber_generated_qa.jsonl", print_outputs=False)


Saving answers: 0 answers [00:00, ? answers/s][A
Saving answers: 1 answers [00:24, 24.67s/ answers][A
Saving answers: 6 answers [00:25,  3.15s/ answers][A
Saving answers: 11 answers [00:25,  1.44s/ answers][A
Saving answers: 15 answers [00:38,  1.44s/ answers][A
Saving answers: 16 answers [01:13,  5.03s/ answers][A
Saving answers: 21 answers [01:16,  3.34s/ answers][A
Saving answers: 26 answers [01:25,  2.80s/ answers][A
Saving answers: 31 answers [01:33,  2.35s/ answers][A
Saving answers: 36 answers [02:02,  3.50s/ answers][A
Saving answers: 41 answers [02:03,  2.43s/ answers][A
Saving answers: 46 answers [02:23,  2.94s/ answers][A
Saving answers: 51 answers [02:24,  2.11s/ answers][A
Saving answers: 56 answers [02:25,  1.49s/ answers][A
Saving answers: 61 answers [02:52,  2.71s/ answers][A
Saving answers: 66 answers [02:53,  1.92s/ answers][A
Saving answers: 71 answers [03:14,  2.62s/ answers][A
Saving answers: 76 answers [03:15,  1.89s/ answers][A
Saving answers: 

CancelledError: 

Error in process_generation_batch, type: <class 'lamini.error.error.APIError'>, message: Request Timeout: The server did not respond in time.
Traceback (most recent call last):
  File "/Users/powerml/Library/Caches/pypoetry/virtualenvs/apple-legal-contracts-vyiYZUr_-py3.12/lib/python3.12/site-packages/aiohttp/client_reqrep.py", line 1058, in start
    message, payload = await protocol.read()  # type: ignore[union-attr]
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/powerml/Library/Caches/pypoetry/virtualenvs/apple-legal-contracts-vyiYZUr_-py3.12/lib/python3.12/site-packages/aiohttp/streams.py", line 643, in read
    await self._waiter
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/powerml/Library/Caches/pypoetry/virtualenvs/apple-legal-contracts-vyiYZUr_-py3.12/lib/python3.12/site-packages/lamini/api/rest_requests.py", line 140, in make_async_web_request
    async wi

In [7]:
# Run the QA pipeline over every Lyft entry and save the results to lyft_generated_qa.jsonl.
# Note: this rebinds the notebook-level `answers` name.
answers = qa_pipeline.call(load_qa_prompts([lyft_text[page] for page in lyft_text]))
await save_answers(answers, path="lyft_generated_qa.jsonl", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

# Overfitting to eval set

In [25]:
# Lamini clients for the two base models (Llama 3.1 8B Instruct and Llama 3.2 3B Instruct).
# NOTE(review): base_llm_one is not referenced by any other visible cell; a later cell
# creates its own `base_llm` for the same 8B model.
base_llm_one = lamini.Lamini(model_name = "meta-llama/Llama-3.1-8B-Instruct")
base_llm_two = lamini.Lamini(model_name = "meta-llama/Llama-3.2-3B-Instruct")

In [10]:
# Load the Uber/Lyft evaluation set (questions, answer text and source page references) from CSV.
eval_df = pd.read_csv("tune_sets/Apple PoC - Uber_Lyft Eval Set.csv")

In [45]:
# Preview the first rows of the evaluation set.
eval_df.head()

Unnamed: 0,Question,Direct Text with Answer,Text Source,Page Content
0,What comprises the Lyft transportation network?,Our transportation network is comprised of:\n•...,Lyft - 7,Our transportation network is comprised of:•\n...
1,What is Lyft's growth strategy?,Our Growth Strategy\nTransportation represents...,Lyft - 9,We have invested in a patent program to identi...
2,Who is Lyft's main ridesharing competitors?,Our main ridesharing competitors in the United...,Lyft - 10,entrants in the market that may be well-establ...
3,Who is Lyft's main competitors in consumer veh...,"Enterprise, Hertz and Avis Budget Group as wel...",Lyft - 10,entrants in the market that may be well-establ...
4,What is the latest information on the workforc...,"As of December 31, 2021, we had 4,453 employee...",Lyft - 12,"adversely affect our business”, “Risk Factors—..."


In [46]:
# (rows, columns) of the evaluation set — (20, 4) in the recorded run.
eval_df.shape

(20, 4)

In [12]:
# One {"input": question, "output": answer text} record per evaluation row,
# in the shape passed to the tune() calls in later cells.
eval_set = [
    dict(input=record["Question"], output=record["Direct Text with Answer"])
    for _, record in eval_df.iterrows()
]

In [13]:
# Inspect the first tuning record to sanity-check the input/output mapping.
eval_set[0]

{'input': 'What comprises the Lyft transportation network?',
 'output': 'Our transportation network is comprised of:\n• Ridesharing Marketplace. Our core offering since 2012 connects drivers with riders who need to get somewhere. The scale of our network enables us to predict demand and proactively incentivize drivers to be available for rides in the right place at the right time. This allows us to optimize earning opportunities for drivers and offer convenient rides for riders, creating sustainable value to both sides of our marketplace. Our ridesharing marketplace connects drivers with riders in cities across the United States and in select cities in Canada.\n• Express Drive. Our flexible car rentals program for drivers who want to drive using our platform but do not have access to a vehicle that meets our requirements. Through our Express Drive program, drivers can enter into short-term rental agreements for vehicles that may be used to provide ridesharing services on the Lyft Platf

In [None]:
# Submit a 100-step tuning job for the Llama 3.1 8B Instruct model on the evaluation
# set itself (deliberate overfitting, per the section heading).
# This cell has no recorded execution count, i.e. no saved run.
base_llm = lamini.Lamini(model_name = "meta-llama/Llama-3.1-8B-Instruct")
base_llm.tune(eval_set, finetune_args={"max_steps":100, "learning_rate":0.0001})

In [21]:
# 100-step tuning job on the 3B model (base_llm_two); the recorded run submitted job 13261.
base_llm_two.tune(eval_set, finetune_args={"max_steps":100, "learning_rate":0.0001})

Data pairs uploaded to local.

Your dataset id is: d045bf11384d38141b0de24127c28e004e786112381c0c35d6d8132bcf40a381 . Consider using this in the future to train using the same data. 
Eg: llm.train(data_or_dataset_id='d045bf11384d38141b0de24127c28e004e786112381c0c35d6d8132bcf40a381')
Tuning job submitted! Check status of job 13261 here: https://api.lamini.ai/train/13261


{'job_id': 13261,
 'status': 'CREATED',
 'dataset_id': 'd045bf11384d38141b0de24127c28e004e786112381c0c35d6d8132bcf40a381'}

The model above answered only 16 of the 20 questions correctly, so a longer job is tried next. 250 steps was chosen because it has seen some success in other factual QA fine-tuning jobs; 500 steps is the next iteration if this one doesn't overfit.

In [26]:
# Longer 250-step tuning job on the 3B model; the recorded run submitted job 13264.
base_llm_two.tune(eval_set, finetune_args={"max_steps":250, "learning_rate":0.0001})

Data pairs uploaded to local.

Your dataset id is: ba03d48ea14451864d9ed30316c049192e394a3b4fd153d612953860fd44d1dd . Consider using this in the future to train using the same data. 
Eg: llm.train(data_or_dataset_id='ba03d48ea14451864d9ed30316c049192e394a3b4fd153d612953860fd44d1dd')
Tuning job submitted! Check status of job 13264 here: https://api.lamini.ai/train/13264


{'job_id': 13264,
 'status': 'CREATED',
 'dataset_id': 'ba03d48ea14451864d9ed30316c049192e394a3b4fd153d612953860fd44d1dd'}

In [28]:
# 500-step tuning job on the 3B model; the recorded run submitted job 13265.
base_llm_two.tune(eval_set, finetune_args={"max_steps":500, "learning_rate":0.0001})

Data pairs uploaded to local.

Your dataset id is: 0104c14b59bbb6204be8a8b519c72bd32a0d9455741aff015e7895cc247ff3b5 . Consider using this in the future to train using the same data. 
Eg: llm.train(data_or_dataset_id='0104c14b59bbb6204be8a8b519c72bd32a0d9455741aff015e7895cc247ff3b5')
Tuning job submitted! Check status of job 13265 here: https://api.lamini.ai/train/13265


{'job_id': 13265,
 'status': 'CREATED',
 'dataset_id': '0104c14b59bbb6204be8a8b519c72bd32a0d9455741aff015e7895cc247ff3b5'}

In [20]:
from lamini.generation.generation_node import GenerationNode
from lamini.generation.base_prompt_object import PromptObject
from lamini.generation.generation_pipeline import GenerationPipeline

class GenPipeline(GenerationPipeline):
    """Single-node generation pipeline that requests a string answer from one model."""

    def __init__(self, model_name):
        """Create the pipeline with one GenerationNode bound to ``model_name``."""
        super().__init__()
        self.generation_node = GenerationNode(model_name=model_name)

    def forward(self, x):
        """Send ``x`` through the generation node with an ``{"answer": "str"}`` output type."""
        return self.generation_node(x, output_type={"answer": "str"})

In [21]:
def simple_prompt_generator(
    df: pd.DataFrame,
    input_col: str = "Question",
    output_col: str = "Direct Text with Answer",
    content_col: str = "Page Content",
    company_col: str = "Text Source"
):
    """Yield one PromptObject per row of ``df``.

    The prompt is the raw value of ``input_col``. The attached data carries that
    same value under "question" and the value of ``output_col`` under
    "expected_output". ``content_col`` and ``company_col`` are accepted but are
    not used by this generator.
    """
    for _, record in df.iterrows():
        question = record[input_col]
        payload = {
            "question": question,
            "expected_output": record[output_col],
        }
        yield PromptObject(prompt=question, data=payload)

In [23]:
# Run the evaluation questions through the model identified by this id and save the
# responses to overfit_response.jsonl.
# NOTE(review): presumably the model produced by one of the tuning jobs above — the
# notebook does not record which job maps to which model id; confirm.
generation_pipeline = GenPipeline(model_name = "0abba8b3b71782bf9bbd48101e7def385e9663f9c76bd293f94184c8e8840ac0")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
await save_answers(answers, path="overfit_response.jsonl", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

In [27]:
# Same evaluation run against a second model id; responses go to overfit_v2_response.jsonl.
# NOTE(review): presumably the model from a later tuning job — the id-to-job mapping
# is not recorded in the notebook; confirm.
generation_pipeline = GenPipeline(model_name = "42c5c00ff85f93b27ef2e88b4f0a13c0c54a60360ff176230b05a869cadda30c")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
await save_answers(answers, path="overfit_v2_response.jsonl", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

In [29]:
# Same evaluation run against a third model id; responses go to overfit_v3_response.jsonl.
# NOTE(review): presumably the model from a later tuning job — the id-to-job mapping
# is not recorded in the notebook; confirm.
generation_pipeline = GenPipeline(model_name = "d9341ae5cfc60d5ed9e173954387404b6419ee201aa2baacb79fe4a6cef532a1")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
await save_answers(answers, path="overfit_v3_response.jsonl", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

# Base Llama 3.2 responses

In [22]:
# Baseline: run the same evaluation questions through the base Llama 3.2 3B Instruct
# model (question only, no page content) and save to base_llama_3_2_response.jsonl.
generation_pipeline = GenPipeline(model_name = "meta-llama/Llama-3.2-3B-Instruct")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
await save_answers(answers, path="base_llama_3_2_response.jsonl", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

# Direct RAG (No index search, direct page as input)

In [14]:
def get_page(row):
    """Return the page text referenced by a row's "Text Source" value.

    "Text Source" is expected to look like "<Company> - <page number>" (for
    example "Lyft - 7"). The company selects the notebook-level ``uber_text``
    or ``lyft_text`` mapping, and the page is looked up under the key
    "Page <page number>".

    Args:
        row: Any object indexable by "Text Source" (a DataFrame row when used
            with ``DataFrame.apply(..., axis=1)``).

    Returns:
        The text stored for the referenced page.

    Raises:
        ValueError: If the company is neither "Uber" nor "Lyft", or if the page
            key is missing from the selected mapping.
    """
    # uber_text / lyft_text are only read here, so no `global` declaration is needed.
    parts = row["Text Source"].split(" - ")
    company = parts[0].strip()

    if company == "Uber":
        text = uber_text
    elif company == "Lyft":
        text = lyft_text
    else:
        # The original message referenced an undefined `idx`, which raised
        # NameError instead of this ValueError. Rows produced by
        # DataFrame.apply(axis=1) carry their index label in `.name`.
        row_label = getattr(row, "name", None)
        raise ValueError(f"Company could not be found {company} on row {row_label}")

    page_index = f"Page {parts[1].strip()}"
    if page_index not in text:
        raise ValueError(f"{page_index} was not found within {company} text")
    return text[page_index]

# Fill the "Page Content" column with the page text resolved for each row.
eval_df["Page Content"] = eval_df.apply(get_page, axis=1)
    


In [15]:
# Display the resolved page text per evaluation row.
eval_df["Page Content"]

0     Our transportation network is comprised of:•\n...
1     We have invested in a patent program to identi...
2     entrants in the market that may be well-establ...
3     entrants in the market that may be well-establ...
4     adversely affect our business”, “Risk Factors—...
5     Because we stand at a pivotal moment in the fi...
6     Risks Related to Operational FactorsOur limite...
7     Moreover, we could be required or otherwise fi...
8     Moreover, we could be required or otherwise fi...
9     Cash FlowsThe following table summar\nizes our...
10    PART IITEM 1. BUSINESS\nOverview\nUber\n Techn...
11    Financial and Operational HighlightsYear Ended...
12    •Interest income, which consists primarily of ...
13    Gross Bookings. We  define Gross Bookings as t...
14     The following table presents a reconciliation...
15    the Premium and the assumed liabilities (inclu...
16    Contract Balances and Remaining Performance Ob...
17    Note 6 - Leases    Our\n leases primarily 

In [16]:
# Answer pipeline backed by the base Llama 3.2 3B Instruct model.
answer_pipeline = AnswerPipeline(model_name = "meta-llama/Llama-3.2-3B-Instruct")

In [18]:
# Build prompts from the evaluation frame, run them through the answer pipeline and
# save the responses to direct_rag_response.jsonl.
# NOTE(review): per the section heading the prompts presumably include each row's
# "Page Content" — confirm in contract_poc.utils.build_prompts_from_dataframe.
answers = answer_pipeline.call(build_prompts_from_dataframe(eval_df))
await save_answers(answers, path="direct_rag_response.jsonl", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]