# Imports

In [1]:
import os
import jsonlines
from tqdm.notebook import tqdm

import lamini
import pandas as pd

from contract_poc.utils import read_pdf, build_prompts_from_dataframe
from contract_poc.qa_pipeline import QuestionAnswerPipeline, load_qa_prompts, save_answers
from contract_poc.answer_pipeline import AnswerPipeline
from contract_poc.gen_pipeline import GenPipeline, simple_prompt_generator, save_answers_to_csv

# Data Wrangling

In [9]:
# Load the Uber 2021 filing from the local data directory.
# The print below reports len() of the result as a page count, so read_pdf
# presumably returns one entry per page — confirm in contract_poc.utils.
uber_text = read_pdf("data/uber_2021.pdf")
print(f"Uber number of pages: {len(uber_text)}")

Uber number of pages: 307


In [10]:
# Load the Lyft 2021 filing the same way as the Uber filing.
# len() is reported as a page count; presumably one entry per page — confirm
# against read_pdf in contract_poc.utils.
lyft_text = read_pdf("data/lyft_2021.pdf")
print(f"Lyft number of pages: {len(lyft_text)}")

Lyft number of pages: 238


In [2]:
# Load the gold evaluation set: one JSON object per line, collected into a list.
# jsonlines.open() owns the file handle and closes it when the `with` block
# exits. Wrapping a bare open() in jsonlines.Reader leaves the underlying file
# open, because Reader.close() does not close a file object supplied by the caller.
with jsonlines.open(os.path.join("eval_sets", "gold-test-set.jsonlines")) as reader:
    eval_data = list(reader)

In [3]:
# Peek at the first four records to sanity-check the parsed question/answer pairs.
eval_data[:4]

[{'question': 'What countries does Uber operate in?',
  'answer': 'Our technology is available in approximately 72 countries around the world, principally in the United States (“U.S.”) and Canada, Latin America, Europe, the Middle East, Africa, and Asia (excluding China and Southeast Asia).'},
 {'question': 'What countries does Lyft operate in?',
  'answer': 'United states and select cities in Canada'},
 {'question': 'Do Uber and Lyft operate in Mexico?',
  'answer': 'Uber operates in Mexico including ride sharing and delivery services. Lyft only operates in the United States and select cities in Canada.'},
 {'question': "Think step by step. First, consider the largest spanish speaking north american country that lyft operates in. Second describe any recent regulation that impacts Lyft's business in that market.",
  'answer': 'Lyft only operates in canada and the united states.'}]

In [4]:
# Turn the list of record dicts into a DataFrame; each dict key becomes a
# column (the displayed output shows `question` and `answer`).
eval_df = pd.DataFrame(eval_data)
# Last expression in the cell, so the notebook renders the first five rows.
eval_df.head()

Unnamed: 0,question,answer
0,What countries does Uber operate in?,Our technology is available in approximately 7...
1,What countries does Lyft operate in?,United states and select cities in Canada
2,Do Uber and Lyft operate in Mexico?,Uber operates in Mexico including ride sharing...
3,"Think step by step. First, consider the larges...",Lyft only operates in canada and the united st...
4,"Think step by step. First, consider the larges...","Uber operates in mexico. Since April 2019, Mex..."


# Eval Criteria

* Accurate information - Facts can be sourced to a ground truth
* Correct context - Extrapolation of the model is still in the context of the question
* Just enough information - No run-on thoughts, just answer the question

Accurate
* Response is factually accurate, doesn't extend the answer beyond the scope of the question

Mixed
* Response contains the correct information, but either the context extended beyond the question or additional facts are added that have nothing to do with the question

Incorrect
* Any fact is inaccurate; the response may also lack the correct context

Successful Failure
* Model recognizes that it does not know the answer

# Tuning Experiments

## Automatic QA Experiments

### V1

Job ID: 13265

Model ID: d9341ae5cfc60d5ed9e173954387404b6419ee201aa2baacb79fe4a6cef532a1

This model comes from an initial run in the Contracts Data Discovery notebook that was meant to overfit to a set of automatically extracted QA pairs.

In [6]:
# Run the V1 tuned model (the Model ID listed above) over the gold eval set.
generation_pipeline = GenPipeline(model_name = "d9341ae5cfc60d5ed9e173954387404b6419ee201aa2baacb79fe4a6cef532a1")
# simple_prompt_generator presumably yields one prompt per eval_df row — confirm in contract_poc.gen_pipeline.
answers = generation_pipeline.call(simple_prompt_generator(eval_df))
# save_answers_to_csv is awaited, so it must be awaitable; this relies on the notebook's top-level await support.
# The output file name carries the tuning job ID (13265).
# NOTE(review): the saved cell output shows the progress bar at "0 answers" — confirm the CSV was actually populated.
model_responses = await save_answers_to_csv(answers, path="responses/gold_test_responses_13265.csv", print_outputs=False)

Saving answers: 0 answers [00:00, ? answers/s]

### Eval

Correct: 5 (0.25)

Mixed: 7 (0.35)

Incorrect: 6 (0.3)

Successful Failure: 2 (0.1)

### V2

In [None]:
# Build the question/answer pipeline for the V2 experiment, overriding only the
# question system prompt; no other constructor arguments are passed here.
# The prompt asks for three separate questions per passage, centred on a fact,
# table, or number. How the pipeline applies it is defined in
# contract_poc.qa_pipeline — TODO confirm.
qa_pipeline = QuestionAnswerPipeline(
    question_system_prompt = "Ask three separate questions around a fact, table, or number within this text: "
)