In [1]:
# !pip3 install openpyxl

In [2]:
import pandas as pd
from llm import LLMBackendGenAI
from prompts import QUERY_GENERATOR_PROMPT
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils import extract_first_text_within_tags
import time


In [3]:
from datetime import date

today = date.today()
print("Today's date:", today,)

Today's date: 2024-07-10


In [4]:
schema="""CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VARCHAR(1000),
	TIME_ZONE VARCHAR(1000),
	TITLE VARCHAR(1000)
)"""

In [5]:
def generate_answer(question, llm_obj):
    start_time = time.time()  # Record start time
    print(f"for question : {question}")
    formatted_prompt = QUERY_GENERATOR_PROMPT.format(question=question, context=schema)
    print(formatted_prompt)
    answer = llm_obj.generate_response(prompt=formatted_prompt).replace("Answer:","").strip()
    end_time = time.time()  # Record end time
    latency = end_time - start_time  # Calculate latency
    print(f"Answer: {answer}")
    print(f"Latency: {latency} seconds")  # Print latency

    try:
        sql_query = extract_first_text_within_tags(text=answer, tag="SQL")
        print(f"sql_query: {sql_query}")
    except:
        return answer, latency
    finally:
        if len(str(sql_query)) == 0:
            return answer, latency
    return sql_query, latency

In [6]:
models =["ibm/granite-8b-code-instruct","ibm/granite-34b-code-instruct","ibm/granite-20b-code-instruct","ibm/granite-13b-instruct-v2","meta-llama/llama-3-70b-instruct","mistralai/mixtral-8x7b-instruct-v01","kaist-ai/prometheus-8x7b-v2","codellama/codellama-34b-instruct","deepseek-ai/deepseek-coder-33b-instruct"]
#models =["ibm/granite-34b-code-instruct"]
#models =["ibm/granite-34b-code-instruct"]

In [7]:
sheets = pd.read_excel('./TTMC Co creation Data.xlsx',sheet_name=None)
print(sheets.keys())

dict_keys(['Guidelines', 'Metadata fields', 'Evaluation Data'])


In [8]:
df = sheets['Evaluation Data']
print(df.shape)
df.head()

(204, 3)


Unnamed: 0,Question,Entity,Expected
0,List all contracts that are in draft stage) an...,Contract Draft Request,"SELECT agreement_type, title\nFROM CDR;"
1,What are the document types of contracts creat...,Contract,SELECT DISTINCT document_type \nFROM contracts...
2,What is the total number of contracts by Count...,Contract,"SELECT COUNT(*) AS total_contracts, supplier\n..."
3,What is our spending with IT vendors this year...,Contract,"SELECT \n SUM(tcv) AS spending_this_year,\n..."
4,"""Show me the payment terms of the contract ID ...",Contract,SELECT *\nFROM contract\nWHERE id = 'CO200456'...


In [9]:
sample_df = df[df['Entity'] == 'Contract']
print(sample_df.shape)
sample_df.head()

(197, 3)


Unnamed: 0,Question,Entity,Expected
1,What are the document types of contracts creat...,Contract,SELECT DISTINCT document_type \nFROM contracts...
2,What is the total number of contracts by Count...,Contract,"SELECT COUNT(*) AS total_contracts, supplier\n..."
3,What is our spending with IT vendors this year...,Contract,"SELECT \n SUM(tcv) AS spending_this_year,\n..."
4,"""Show me the payment terms of the contract ID ...",Contract,SELECT *\nFROM contract\nWHERE id = 'CO200456'...
5,"""What are the obligations of parties mentioned...",Contract,SELECT *\nFROM contract WHERE id = 'CO300789' ...


In [10]:
QUERY_GENERATOR_PROMPT

'[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: {context} [/INST] Here is the SQLite query to answer to the question:{question} ```\n        '

In [11]:
# Process each model and question
for model_id in models:
    print(f"Processing answers for: {model_id}")
    answers = [None] * len(sample_df)  # Ensure this matches the DataFrame row count
    latencies = [None] * len(sample_df)  # Initialize latencies list
    print(f"Answers list initialized with length: {len(answers)}")  # Debugging statement
    llm_obj = LLMBackendGenAI(model_id=model_id)

    # Create a thread pool executor to handle requests in parallel
    with ThreadPoolExecutor(max_workers=30) as executor:
        futures = {executor.submit(generate_answer, row["Question"], llm_obj): i for i, (index, row) in enumerate(sample_df.iterrows())}
        # Collect results as they complete
        for future in as_completed(futures):
            index = futures[future] 
            print(f"Attempting to assign answer to index: {index}")  # Debugging statement

            try:
                answer, latency = future.result()
                if index >= len(answers):
                    print(f"Error: index {index} is out of range for answers list.")  # Error check
                else:
                    answers[index] = answer
                    latencies[index] = latency
            except Exception as exc:
                print(f'Generated an exception: {exc} at index {index}')
                if index >= len(answers):
                    print(f"Error: index {index} is out of range for answers list during exception handling.")  # Error check
                else:
                    answers[index] = None
                    latencies[index] = None
                break

    sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
    sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame
    print(f"Final assignment to DataFrame for model {model_id} completed.")  # Confirmation of process

sample_df.to_excel("./Model_answers_baseline_multi_shot_cot_synonyms.xlsx", sheet_name='Evaluation Data', index=False)
print("Data saved to Excel.")  # Confirmation of data save

Processing answers for: ibm/granite-8b-code-instruct
Answers list initialized with length: 197
for question : What are the document types of contracts created by the Sales department in the last month?
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VARCHAR(1000),
	TIME_ZONE VARCHAR(1000),
	TITLE VARCHAR(1000)
) [/INST] Here is the SQLite query to answer to the question:What are the document types of contracts created by the 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


Answer: SELECTWARRANTIES FROM CONTRACT_DATA WHERE ID = "CO700123"
```<|endoftext|>
Latency: 3.1766622066497803 seconds
sql_query: 
for question : Display contracts with Paper Tpe as Own and Status as archivd.
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VARCHAR(1000),
	TIME_ZONE VARCHAR(1000),
	TITLE VARCHAR(1000)
) [/INST] Here is the SQLite query to answer to the question:Display contracts with Paper Tpe as Own and Statu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


Answer: SELECT COUNT(NAME) FROM CONTRACT_DATA WHERE SUPPLIER = "Counterparty"
```<|endoftext|>
Latency: 8.59594202041626 seconds
sql_query: 
for question : Display contracts with Paper Tpe as Own and Status as archivd.
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VARCHAR(1000),
	TIME_ZONE VARCHAR(1000),
	TITLE VARCHAR(1000)
) [/INST] Here is the SQLite query to answer to the question:Display contracts with Paper Tpe as Own

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


Answer: SELECT document_type FROM contract_data WHERE TITLE LIKE "Sales%" AND EXPIRATION_DATE LIKE "%1%"<|endoftext|>
Latency: 7.014578104019165 seconds
sql_query: 
for question : Display contracts with Paper Tpe as Own and Status as archivd.
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VARCHAR(1000),
	TIME_ZONE VARCHAR(1000),
	TITLE VARCHAR(1000)
) [/INST] Here is the SQLite query to answer to the question:Display contrac

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


Answer: ```
SELECT TERM_TYPE 
FROM CONTRACT_DATA 
WHERE ID = 'CO100123' AND DOCUMENT_TYPE = 'invoice'; 
         ```
Latency: 3.0900192260742188 seconds
sql_query: 
for question : Display contracts with Paper Tpe as Own and Status as archivd.
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VARCHAR(1000),
	TIME_ZONE VARCHAR(1000),
	TITLE VARCHAR(1000)
) [/INST] Here is the SQLite query to answer to the question:Display contrac

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


Answer: SELECT DISTINCT DOCUMENT_TYPE 
         FROM CONTRACT_DATA 
         WHERE REGIONS LIKE '%Sales%' AND 
               EFFECTIVE_DATE >= date('now','-1 month')
Latency: 4.747956037521362 seconds
sql_query: 
for question : Display contracts with Paper Tpe as Own and Status as archivd.
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VARCHAR(1000),
	TIME_ZONE VARCHAR(1000),
	TITLE VARCHAR(1000)
) [/INST] Here is the SQLit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


Answer: SELECT TERM_TYPE
         FROM CONTRACT_DATA
         WHERE ID = 'CO200456';
```
This query will return the `TERM_TYPE` column value for the row with `ID` equal to 'CO200456'. The `TERM_TYPE` column represents the payment terms of the contract.
Latency: 7.970069885253906 seconds
sql_query: 
for question : Display contracts with Paper Tpe as Own and Status as archivd.
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VAR

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


Answer: SELECT OBLIGATIONS
         FROM CONTRACT_DATA
         WHERE ID = 'CO300789';
```Answer: SELECT TERM_TYPE
         FROM CONTRACT_DATA
         WHERE ID = 'CO200456';
```
Latency: 3.469627857208252 seconds
sql_query: 
for question : Display contracts with Paper Tpe as Own and Status as archivd.
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYPE VARCHAR(1000),
	TIME_ZONE VARCHAR(1000),
	TITLE VARCHAR(1000)
) [/INST] Here 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


Answer: SELECT DISTINCT DOCUMENT_TYPE
FROM CONTRACT_DATA
WHERE SOURCE_NAME_TITLE = 'Sales' AND EFFECTIVE_DATE >= DATE('now', '-1 month');
```

This query will return the distinct document types of contracts created by the Sales department in the last month.
Latency: 4.292485952377319 seconds
sql_query: 
for question : Display contracts with Paper Tpe as Own and Status as archivd.
[INST] Write SQLite query to answer the following question given the database schema. Please wrap your code answer using ```: Schema: CREATE TABLE CONTRACT_DATA (
	AGREEMENT_TYPE VARCHAR(1000),
	COUNTRIES VARCHAR(1000),
	CURRENCIES VARCHAR(1000),
	DB_ID VARCHAR(1000),
	DOCUMENT_TYPE VARCHAR(1000),
	EFFECTIVE_DATE VARCHAR(1000),
	EXPIRATION_DATE VARCHAR(1000),
	FUNCTIONS VARCHAR(1000),
	ID VARCHAR(1000),
	NAME VARCHAR(1000),
	REGIONS VARCHAR(1000),
	REPORTING_CURRENCY VARCHAR(1000),
	SERVICES VARCHAR(1000),
	SOURCE_NAME_TITLE VARCHAR(1000),
	STATUS VARCHAR(1000),
	SUPPLIER VARCHAR(1000),
	TCV INTEGER,
	TERM_TYP

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Answers_{model_id.replace('/', '_')}"] = answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df[f"Latencies_{model_id.replace('/', '_')}"] = latencies  # Add latencies to DataFrame


In [12]:
latencies

[4.292485952377319,
 16.79535698890686,
 6.967696905136108,
 16.79479479789734,
 9.046193838119507,
 16.705487966537476,
 5.2123777866363525,
 4.585055112838745,
 5.271995782852173,
 4.604974031448364,
 7.101496934890747,
 8.010590076446533,
 9.987627983093262,
 8.90891408920288,
 10.03431510925293,
 9.040706872940063,
 10.55818486213684,
 10.812006950378418,
 12.289252996444702,
 11.707170724868774,
 12.540761709213257,
 13.636936902999878,
 13.767843008041382,
 13.261212825775146,
 13.824185848236084,
 13.517896890640259,
 27.036103010177612,
 27.230125188827515,
 16.386695861816406,
 16.228922843933105,
 12.195663928985596,
 12.756841897964478,
 12.784880876541138,
 16.106523990631104,
 25.824928998947144,
 11.872287034988403,
 24.34653902053833,
 10.781704902648926,
 9.88059401512146,
 11.314658880233765,
 11.180660009384155,
 12.697525978088379,
 11.933978796005249,
 10.737761974334717,
 13.66307806968689,
 11.746760845184326,
 11.671434164047241,
 14.436419010162354,
 11.46365976

In [45]:
len(answers)

197

In [46]:
answers

[' SELECT COUNT(Name) FROM contract GROUP BY Name;',
 '',
 ' "SELECT * FROM contract WHERE contract_id = \'CO200456\' and keywords LIKE \'%payment terms%\';" ',
 "\nSELECT * FROM contract WHERE contract_id = 'CO300789'; \n",
 '',
 " SELECT * FROM contract WHERE contract_id = 'CO800987' and keywords LIKE '%indemnification%'; ",
 " SELECT * FROM contract WHERE contract_id = 'CO100123' and keywords LIKE '%Invoice%'; ",
 '',
 "\nSELECT * FROM contract WHERE document_type = 'NDA' AND keywords LIKE '%indemnification%';\n",
 "\nSELECT *\nFROM contract \nWHERE \nSupplier = 'John Doe' AND\nkeywords LIKE '%governance%'\n",
 '',
 '',
 "\n\nSELECT  ID, Title , Name , Source_Name_Title  FROM contract WHERE  keywords LIKE '%confidetial Clause%';\n\n",
 '\n SELECT *\n FROM contract\n WHERE\n Document_Type = "MSA"\n AND keywords LIKE "%renewal%"\n',
 '\nSELECT * FROM contract WHERE tcv > 200000;\n',
 '\nSELECT * FROM contract WHERE (document_type="Amendment" OR document_type="MOU" OR document_type="MS

In [13]:
sample_df.shape

(197, 21)

In [15]:
sample_df.describe()

Unnamed: 0,Latencies_ibm_granite-8b-code-instruct,Latencies_ibm_granite-34b-code-instruct,Latencies_ibm_granite-20b-code-instruct,Latencies_ibm_granite-13b-instruct-v2,Latencies_meta-llama_llama-3-70b-instruct,Latencies_mistralai_mixtral-8x7b-instruct-v01,Latencies_kaist-ai_prometheus-8x7b-v2,Latencies_codellama_codellama-34b-instruct,Latencies_deepseek-ai_deepseek-coder-33b-instruct
count,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0
mean,3.51271,8.605804,33.018921,11.301663,10.669473,27.423053,15.530839,8.415629,13.220504
std,0.900513,3.313303,10.671722,3.667292,3.082846,7.52252,3.117219,2.237939,3.89509
min,2.106297,3.176662,8.595942,7.014578,3.090019,4.747956,7.97007,3.468878,4.292486
25%,3.043841,7.094661,29.653571,9.354405,8.596117,23.208023,13.439407,7.310114,11.308406
50%,3.279701,8.146687,31.96203,10.419875,10.354609,27.097236,15.228674,8.195616,12.525572
75%,3.66732,9.300545,35.619434,11.636557,12.227829,32.121184,17.650978,9.236171,14.390573
max,10.655881,26.878607,106.961834,28.735786,21.756629,46.006535,26.224931,20.532664,27.230125
