# Imports

In [1]:
import os
import jsonlines
from tqdm.notebook import tqdm

import lamini
import pandas as pd

from contract_poc.utils import read_pdf, build_prompts_from_dataframe, load_format_pages
from contract_poc.qa_pipeline import QuestionAnswerPipeline, load_qa_prompts, save_answers
from contract_poc.answer_pipeline import AnswerPipeline
from contract_poc.rag_pipeline import RAGPipeline
from contract_poc.gen_pipeline import GenPipeline, simple_prompt_generator, save_answers_to_csv
from contract_poc.summary_pipeline import SummaryPipeline

# Data Wrangling

In [2]:
# Parse the Uber 2021 filing with read_pdf and report how many pages came back.
uber_text = read_pdf("data/uber_2021.pdf")
uber_page_count = len(uber_text)
print(f"Uber number of pages: {uber_page_count}")

Uber number of pages: 307


In [3]:
# Parse the Lyft 2021 filing with read_pdf and report how many pages came back.
lyft_text = read_pdf("data/lyft_2021.pdf")
lyft_page_count = len(lyft_text)
print(f"Lyft number of pages: {lyft_page_count}")

Lyft number of pages: 238


In [4]:
# Load the gold evaluation set (one JSON object per line) into a list of records.
# The file is opened in its own `with` clause so the handle is closed when the block
# exits; a Reader wrapped around an already-open file object does not close it.
eval_set_path = os.path.join("eval_sets", "gold-test-set.jsonlines")
with open(eval_set_path, "rb") as eval_file, jsonlines.Reader(eval_file) as reader:
    eval_data = list(reader)

In [5]:
# Peek at the first four gold records (each is a dict with "question" and "answer").
eval_data[:4]

[{'question': 'What countries does Uber operate in?',
  'answer': 'Our technology is available in approximately 72 countries around the world, principally in the United States (“U.S.”) and Canada, Latin America, Europe, the Middle East, Africa, and Asia (excluding China and Southeast Asia).'},
 {'question': 'What countries does Lyft operate in?',
  'answer': 'United states and select cities in Canada'},
 {'question': 'Do Uber and Lyft operate in Mexico?',
  'answer': 'Uber operates in Mexico including ride sharing and delivery services. Lyft only operates in the United States and select cities in Canada.'},
 {'question': "Think step by step. First, consider the largest spanish speaking north american country that lyft operates in. Second describe any recent regulation that impacts Lyft's business in that market.",
  'answer': 'Lyft only operates in canada and the united states.'}]

In [6]:
# Tabulate the eval records; later cells pass eval_df to simple_prompt_generator.
eval_df = pd.DataFrame(eval_data)
# Display the first rows as a sanity check.
eval_df.head()

Unnamed: 0,question,answer
0,What countries does Uber operate in?,Our technology is available in approximately 7...
1,What countries does Lyft operate in?,United states and select cities in Canada
2,Do Uber and Lyft operate in Mexico?,Uber operates in Mexico including ride sharing...
3,"Think step by step. First, consider the larges...",Lyft only operates in canada and the united st...
4,"Think step by step. First, consider the larges...","Uber operates in mexico. Since April 2019, Mex..."


# Eval Criteria

* Accurate information - Facts can be sourced to a ground truth
* Correct context - Extrapolation of the model is still in the context of the question
* Just enough information - No run-on thoughts; just answer the question

Accurate
* Response is factually accurate and doesn't extend the answer beyond the scope of the question

Mixed
* Response contains the correct information, but either the context extended beyond the question or additional facts are added that have nothing to do with the question

Incorrect
* At least one fact is inaccurate; the response may also lack the correct context

Successful Failure
* Model recognizes that it does not know the answer. It is interesting to see how well the model catches itself; this is helpful for UX purposes, and ideally responses that would otherwise be incorrect fall into this category instead

# Base Llama Performance

In [16]:
generation_pipeline = GenPipeline(model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
llama_3_1_responses = await save_answers_to_csv(answers, path="responses/base_llama_3_1_response.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

https://docs.google.com/spreadsheets/d/1zctM75Pd3dprofVs2Caky3kAv1LnyU1Z2hnxhbFeyPI/edit?usp=sharing

Correct: 3 (0.15)

Mixed: 7 (0.35)

Incorrect: 9 (0.45)

Successful Failure: 1 (0.05)

In [17]:
generation_pipeline = GenPipeline(model_name = "meta-llama/Llama-3.2-3B-Instruct")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
llama_3_2_responses = await save_answers_to_csv(answers, path="responses/base_llama_3_2_response.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

https://docs.google.com/spreadsheets/d/1WrILF9GV6RS09LTQ2fHUHtH6SJs5v2DN61Ia55fIxME/edit?usp=sharing

Correct: 2 (0.1)

Mixed: 4 (0.2)

Incorrect: 13 (0.65)

Successful Failure: 1 (0.05)

In [7]:
generation_pipeline = GenPipeline(model_name = "google/gemma-2-9b-it")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
gemma_2_responses = await save_answers_to_csv(answers, path="responses/base_gemma_2_response.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

Error in process_generation_batch, type: <class 'lamini.error.error.APIError'>, message: API error 524
Traceback (most recent call last):
  File "/Users/powerml/Library/Caches/pypoetry/virtualenvs/contract-poc-jPOtkveY-py3.12/lib/python3.12/site-packages/lamini/generation/process_generation_batch.py", line 42, in process_generation_batch
    result = await query_api(client, key, url, json, batch["type"])
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/powerml/Library/Caches/pypoetry/virtualenvs/contract-poc-jPOtkveY-py3.12/lib/python3.12/site-packages/lamini/generation/process_generation_batch.py", line 85, in query_api
    result = await pipeline_client.completions(client, key, url, json)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/powerml/Library/Caches/pypoetry/virtualenvs/contract-poc-jPOtkveY-py3.12/lib/python3.12/site-packages/lamini/api/pipeline_client.py", line 17, in completions
    result = await mak

https://docs.google.com/spreadsheets/d/1W3TOvj-2s3zvr8nIInPSf6lTzyjwjmexzEYtCKJBHb8/edit?usp=sharing

Correct: 8 (0.4)

Mixed: 6 (0.3)

Incorrect: 3 (0.15)

Successful Failure: 3 (0.15)

In [8]:
generation_pipeline = GenPipeline(model_name = "mistralai/Mistral-7B-Instruct-v0.3")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
mistral_responses = await save_answers_to_csv(answers, path="responses/base_mistral_7B_response.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

# RAG Performance

In [7]:
from lamini.index.lamini_index import LaminiIndex

In [9]:
# Load the index saved under "model_index" (the same path the RAGPipeline cells
# in this section pass as rag_model_path) so it can be inspected directly.
llm_index = LaminiIndex.load_index("model_index")

In [10]:
# Embed a probe query; the wording matches the question_system_prompt used for
# QA generation later in this notebook.
embed = llm_index.get_embeddings("Ask three separate questions around a fact, table, or number within this text:")

In [None]:
# Query the underlying index for the 2 nearest entries to the probe embedding.
# Only the second returned value (indices) is used below. The first is bound to a
# named throwaway rather than `_`, because assigning to `_` in a notebook shadows
# IPython's last-output variable.
_ignored, indices = llm_index.index.search(embed, 2)

In [None]:
# Look up the entries of llm_index.splits at the positions in indices[0]
# (the result row for the single probe query) and display them.
[llm_index.splits[i] for i in indices[0]]

In [5]:
llm_rag = RAGPipeline(rag_model_path="model_index", model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct")
answers = llm_rag.call(simple_prompt_generator(eval_df))
rag_3_1_responses = await save_answers_to_csv(answers, path="responses/rag_llama_3_1_response.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

https://docs.google.com/spreadsheets/d/17cl6vp8cEaAo3MmATy3L9zJyW37jHf4AQSfXaKKKr5s/edit?usp=sharing

Correct: 3 (0.15)

Mixed: 2 (0.1)

Incorrect: 1 (0.05)

Successful Failure: 14 (0.7)

In [6]:
llm_rag = RAGPipeline(rag_model_path="model_index", model_name = "meta-llama/Llama-3.2-3B-Instruct")
answers = llm_rag.call(simple_prompt_generator(eval_df))
rag_3_2_responses = await save_answers_to_csv(answers, path="responses/rag_llama_3_2_response.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

https://docs.google.com/spreadsheets/d/1y9341YAzY0NDFIuF54PWpb-j3IW1gHMGe1afGwo7tjU/edit?usp=sharing

Correct: 4 (0.2)

Mixed: 1 (0.05)

Incorrect: 1 (0.05)

Successful Failure: 14 (0.7)

In [7]:
llm_rag = RAGPipeline(rag_model_path="model_index", model_name = "google/gemma-2-9b-it", rag_query_size=1)
answers = llm_rag.call(simple_prompt_generator(eval_df))
rag_gemma_2_responses = await save_answers_to_csv(answers, path="responses/rag_gemma_2_response.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

In [9]:
llm_rag = RAGPipeline(rag_model_path="model_index", model_name = "mistralai/Mistral-7B-Instruct-v0.3", rag_query_size=1)
answers = llm_rag.call(simple_prompt_generator(eval_df))
rag_mistral_responses = await save_answers_to_csv(answers, path="responses/rag_mistral_7B_response.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

# Tuning Experiments

## Automatic QA Experiments

### V1

Job ID: 13265

Model ID: d9341ae5cfc60d5ed9e173954387404b6419ee201aa2baacb79fe4a6cef532a1

This is from an initial overfit run in the Contracts Data Discovery notebook; the goal was to overfit to a set of automatically extracted QA pairs.

In [6]:
generation_pipeline = GenPipeline(model_name = "d9341ae5cfc60d5ed9e173954387404b6419ee201aa2baacb79fe4a6cef532a1")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
model_responses = await save_answers_to_csv(answers, path="responses/gold_test_responses_13265.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

### Eval

Correct: 5 (0.25)

Mixed: 7 (0.35)

Incorrect: 6 (0.3)

Successful Failure: 2 (0.1)

### V2

#### Generate QA Pairs

In [7]:
# Instruction handed to QuestionAnswerPipeline as its question_system_prompt.
QA_QUESTION_SYSTEM_PROMPT = "Ask three separate questions around a fact, table, or number within this text: \n"
qa_pipeline = QuestionAnswerPipeline(question_system_prompt=QA_QUESTION_SYSTEM_PROMPT)

##### Uber

In [8]:
answers = qa_pipeline.call(load_qa_prompts([uber_text[page] for page in uber_text]))
await save_answers(answers, path="uber_generated_qa.jsonl", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

##### Lyft

In [9]:
answers = qa_pipeline.call(load_qa_prompts([lyft_text[page] for page in lyft_text]))
await save_answers(answers, path="lyft_generated_qa.jsonl", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

In [19]:
# Build the tuning set from both generated QA files (Uber first, then Lyft).
# Each record keeps its original fields and gains "input"/"output" copies of its
# "question"/"answer" values.
# Every file is opened in its own `with` clause so the handle is closed once read;
# a Reader wrapped around an already-open file object does not close it.
tune_data = []
for qa_path in ("uber_generated_qa.jsonl", "lyft_generated_qa.jsonl"):
    with open(qa_path, "rb") as qa_file, jsonlines.Reader(qa_file) as reader:
        for record in reader:
            record["input"] = record["question"]
            record["output"] = record["answer"]
            tune_data.append(record)
tune_df = pd.DataFrame(tune_data)

In [20]:
# Sanity-check the tuning frame: "input"/"output" should mirror "question"/"answer".
tune_df.head()

Unnamed: 0,prompt,question,answer,input,output
0,<|begin_of_text|><|start_header_id|>user<|end_...,What is the name of the company that is filing...,"UBER TECHNOLOGIES, INC",What is the name of the company that is filing...,"UBER TECHNOLOGIES, INC"
1,<|begin_of_text|><|start_header_id|>user<|end_...,What is the state of incorporation of the company,Delaware,What is the state of incorporation of the company,Delaware
2,<|begin_of_text|><|start_header_id|>user<|end_...,What is the name of the exchange on which the ...,New York Stock Exchange,What is the name of the exchange on which the ...,New York Stock Exchange
3,<|begin_of_text|><|start_header_id|>user<|end_...,What is the number of shares of the registrant...,1954464088,What is the number of shares of the registrant...,1954464088
4,<|begin_of_text|><|start_header_id|>user<|end_...,What is the aggregate market value of the voti...,approximately $90.5 billion,What is the aggregate market value of the voti...,approximately $90.5 billion


In [21]:
# Submit a tuning job on Meta-Llama-3.1-8B-Instruct using the generated QA records.
base_llm = lamini.Lamini(model_name="meta-llama/Meta-Llama-3.1-8B-Instruct")
# Last expression: the value returned by tune() is shown as the cell output.
base_llm.tune(tune_data)

Data pairs uploaded to local.

Your dataset id is: f00cbe4f6103fc99b731b351f7d963ac779909ca5f13cc7063c113e55d665f52 . Consider using this in the future to train using the same data. 
Eg: llm.train(data_or_dataset_id='f00cbe4f6103fc99b731b351f7d963ac779909ca5f13cc7063c113e55d665f52')
Tuning job submitted! Check status of job 13695 here: https://api.lamini.ai/train/13695


{'job_id': 13695,
 'status': 'CREATED',
 'dataset_id': 'f00cbe4f6103fc99b731b351f7d963ac779909ca5f13cc7063c113e55d665f52'}

In [23]:
generation_pipeline = GenPipeline(model_name = "6bbfc54db98cb502a629aa5c3ed0aba4372fd2ad795ea694042ba6105ba892b4")
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
model_responses = await save_answers_to_csv(answers, path="responses/gold_test_responses_13695.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

##### Eval

Job ID: 13695

Model ID: 6bbfc54db98cb502a629aa5c3ed0aba4372fd2ad795ea694042ba6105ba892b4

Correct: 5 (0.25)

Mixed: 4 (0.2)

Incorrect: 8 (0.4)

Successful Failure: 3 (0.15)

### V3

In [18]:
# Scratch input for trying out string-splitting approaches in the cells below.
test_string = "Something to split"

In [25]:
# Midpoint index of the scratch string (integer division).
len(test_string) // 2

9

In [23]:
# First half: everything before the midpoint index.
test_string[:len(test_string) // 2]

'Something'

In [24]:
# Second half: everything from the midpoint index onward.
test_string[len(test_string) // 2:]

' to split'

In [27]:
# Print the scratch string in consecutive slices of a quarter of its length
# (integer division); any remainder ends up in a shorter final slice.
chunk_size = len(test_string) // 4
chunk_starts = range(0, len(test_string), chunk_size)
for start in chunk_starts:
    stop = start + chunk_size
    print(test_string[start:stop])

Some
thin
g to
 spl
it


In [13]:
# Client for gemma-2-9b-it, used by the summarization prompts in the following cells.
base_gemma = lamini.Lamini("google/gemma-2-9b-it")

In [16]:
# Ask gemma for a one-sentence summary of Uber "Page 1".
# The subscript uses single quotes: reusing the f-string's own double quote inside a
# replacement field is only valid on Python 3.12+ and is a SyntaxError before that.
print(base_gemma.generate(f"Follow these instructions. 1. Consider the following text: \n ++++++ {uber_text['Page 1']} ++++++ 2. Summarize that text with a single sentence.\n\n"))

Uber Technologies, Inc. is filing its annual report with the Securities and Exchange Commission. 





In [17]:
# Ask gemma for a one-sentence summary of Uber "Page 2".
# The subscript uses single quotes: reusing the f-string's own double quote inside a
# replacement field is only valid on Python 3.12+ and is a SyntaxError before that.
print(base_gemma.generate(f"Follow these instructions. 1. Consider the following text: \n ++++++ {uber_text['Page 2']} ++++++ 2. Summarize that text with a single sentence.\n\n"))

This document is a filing with the Securities and Exchange Commission by a large accelerated filer, providing information about the company's financial performance and other relevant details. 





In [None]:
# The original cell was a dangling assignment (`sum_pipeline = `), which is a
# SyntaxError. SummaryPipeline is imported at the top of the notebook.
# TODO(review): its constructor arguments are not visible from this notebook —
# confirm whether it needs a model name or prompt before running.
sum_pipeline = SummaryPipeline()

In [None]:
answers = qa_pipeline.call(load_format_pages([uber_text[page] for page in uber_text]))
await save_answers(answers, path="v3_uber_generated_qa.jsonl", print_outputs=False)

In [None]:
# Display every Uber page value. NOTE(review): this dumps the whole document into the
# cell output; consider slicing before sharing the notebook.
[uber_text[page] for page in uber_text]

In [None]:
answers = qa_pipeline.call(load_qa_prompts([uber_text[page] for page in uber_text]))
await save_answers(answers, path="v3_uber_generated_qa.jsonl", print_outputs=False)

In [None]:
answers = qa_pipeline.call(load_qa_prompts([lyft_text[page] for page in lyft_text]))
await save_answers(answers, path="v3_lyft_generated_qa.jsonl", print_outputs=False)