In [15]:
import pandas as pd

master_clauses_path = 'filtered_master_clauses_for_graph.csv'
processed_graph_path = 'edited.csv'

master_clauses_df = pd.read_csv(master_clauses_path,delimiter=';')
processed_graph_df = pd.read_csv(processed_graph_path,delimiter=';')

master_clauses_df.head(), processed_graph_df.head()

(                                            Filename  \
 0  ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...   
 1  Antares Pharma, Inc. - Manufacturing Agreement...   
 2  ASPIRITYHOLDINGSLLC_05_07_2012-EX-10.6-OUTSOUR...   
 3  AzulSa_20170303_F-1A_EX-10.3_9943903_EX-10.3_M...   
 4  BICYCLETHERAPEUTICSPLC_03_10_2020-EX-10.11-SER...   
 
                                        Document Name  \
 0                             ['Services Agreement']   
 1                        ['Manufacturing Agreement']   
 2                          ['OUTSOURCING AGREEMENT']   
 3  ['Amendment n 01   to the Global Maintenance A...   
 4                              ['SERVICE AGREEMENT']   
 
                                 Document Name-Answer  \
 0                                 Services Agreement   
 1                            Manufacturing Agreement   
 2                              OUTSOURCING AGREEMENT   
 3  Amendment n 01   to the Global Maintenance Agr...   
 4                         

In [16]:
processed_graph_df = processed_graph_df.rename(columns={col: col + "_llm_graph" for col in processed_graph_df.columns if col != 'Filename'})

merged_df = pd.merge(master_clauses_df, processed_graph_df, on='Filename')

sorted_columns = ['Filename'] + sorted([col for col in merged_df.columns if col != 'Filename'])
merged_df = merged_df[sorted_columns]
merged_df.replace('No asnwer', 'No', inplace=True)


merged_df.head()

Unnamed: 0,Filename,Affiliate License-Licensee,Affiliate License-Licensee-Answer,Affiliate License-Licensee_llm_graph,Affiliate License-Licensor,Affiliate License-Licensor-Answer,Affiliate License-Licensor_llm_graph,Agreement Date,Agreement Date-Answer,Agreement Date_llm_graph,...,Uncapped Liability_llm_graph,Unlimited/All-You-Can-Eat-License,Unlimited/All-You-Can-Eat-License-Answer,Unlimited/All-You-Can-Eat-License_llm_graph,Volume Restriction,Volume Restriction-Answer,Volume Restriction_llm_graph,Warranty Duration,Warranty Duration-Answer,Warranty Duration_llm_graph
0,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,[],No,"Based on the context, I found that there are s...",[],No,"Based on the context, I found that there are s...","['October 1, 2019']",10/1/19,A question about a specific document!\n\nAccor...,...,I'd be happy to help!\n\nBased on the provided...,[],No,The contextually closest answer to this questi...,[],No,"I'm happy to help!\n\nBased on the context, I ...",[],No,I'm a helpful assistant! Based on the provided...
1,"Antares Pharma, Inc. - Manufacturing Agreement...",[],No,"Based on the provided context, I would answer ...",[],No,"Based on the provided context, I would answer ...","['20th day of March, 2018']",3/20/18,"According to the provided context, the agreeme...",...,"To answer this question, I'll analyze the cont...",[],No,A challenging question!\n\nAfter analyzing the...,[],No,"I'm a helpful assistant, and my job is to answ...",[],No,"Based on the context provided, I would recomme..."
2,ASPIRITYHOLDINGSLLC_05_07_2012-EX-10.6-OUTSOUR...,[],No,"Based on the provided context, I would answer ...",[],No,"Based on the provided context, it appears that...","['this day of , 2012']",[]/[]/2012,"Based on the provided context, I found no dire...",...,"Based on the context, I'd say that the uncappe...",[],No,"Based on the context, I would answer that the ...",[],No,"Based on the provided information, it appears ...",[],No,"I'm happy to help! However, I don't see any in..."
3,AzulSa_20170303_F-1A_EX-10.3_9943903_EX-10.3_M...,[],No,"Based on the context, it appears that the docu...",[],No,"Based on the provided context, I can help you ...","['January 6th, 2016']",1/6/16,"According to the provided context, the agreeme...",...,"To answer this question, I'll use my contextua...",[],No,"Based on the context provided, I would answer ...",[],No,"To answer this question, I'll analyze the cont...",[],No,"I'm happy to help! However, I don't see any in..."
4,BICYCLETHERAPEUTICSPLC_03_10_2020-EX-10.11-SER...,[],No,"I'm happy to help! However, I need some contex...",[],No,"Based on the provided context, I would answer ...",['26 September 2019'],9/26/19,"Based on the context, I would answer that ther...",...,"To answer this question, I'll analyze the cont...",[],No,A very specific question!\n\nAfter analyzing t...,[],No,"Based on the provided context, I'd say the vol...",[],No,"I'm a helpful assistant, and my job is to answ..."


In [17]:
merged_df.to_csv("full_graph_processed.csv")

In [18]:
from langchain import LLMChain, PromptTemplate
from langchain.llms import Ollama
import re

prompt_template = """
You are a tool to score answers. Consider the following correct answer and a llm generated answer.
Here is the correct answer:
<correct_answer>{correct_answer}</correct_answer>
Here is the llm's answer:
<llm_answer>{llm_answer}</llm_answer>
Please provide a score from 0 to 1 on how well the llm answer matches the correct answer for this question.
The score should be high if the answers say essentially the same thing.
The score should be lower if some facts are missing or incorrect, or if extra unnecessary facts have been included.
The score should be 0 for entirely wrong answers. Put the score in <SCORE> tags.
Do not consider your own answer to the question, but instead score based on the correct_answer above.
Write only score in SCORE tag.
"""

prompt = PromptTemplate(input_variables=["correct_answer", "llm_answer"], template=prompt_template)

llm = Ollama(model="llama3:latest")

chain_for_positivescoring = LLMChain(llm=llm, prompt=prompt)

negative_prompt_template = """
You are a tool to score answers. Consider the following correct answer and a llm generated answer. In this case. LLM answer should be as close as possible to saying that it has no answer.
Here is the correct answer:
<correct_answer>No answer</correct_answer>
Here is the llm's answer:
<llm_answer>{llm_answer}</llm_answer>
Please provide a score from 0 to 1 on how well the llm answer matches the correct answer for this question.
The score should be high if the answers say essentially the same thing.
The score should be lower if some facts are missing or incorrect, or if extra unnecessary facts have been included.
The score should be 0 for entirely wrong answers. Put the score in <SCORE> tags.
Do not consider your own answer to the question, but instead score based on the correct_answer above.
Write only score in SCORE tag.
"""

negative_prompt = PromptTemplate(input_variables=["llm_answer"], template=negative_prompt_template)
chain_for_negativescoring = LLMChain(llm=llm, prompt=negative_prompt)


In [23]:
import pandas as pd

file_path = 'full_graph_processed.csv'
data = pd.read_csv(file_path)

data = data.drop(columns=['Unnamed: 0.1','Unnamed: 0','Enhanced Standardized Filename','Standardized Filename'], errors='ignore')
data.rename(columns={'Notice Period To Terminate Renewal- Answer': 'Notice Period To Terminate Renewal-Answer'}, inplace=True)

data.rename(columns={'Document Name': 'Temp Name'}, inplace=True)
data.rename(columns={'Document Name-Answer': 'Document Name'}, inplace=True)
data.rename(columns={'Temp Name': 'Document Name-Answer'}, inplace=True)
columns = data.columns

def scoring(correct_answer, llm_answer, chain):
    def extract_score(response):
        match = re.search(r'<SCORE>(.*?)</SCORE>', response)
        return float(match.group(1)) if match else None

    try:
        for _ in range(3):
            response = chain.run({"correct_answer": correct_answer, "llm_answer": llm_answer})
            score = extract_score(response)
            if score is not None:
                return score
    except Exception as e:
        print(f"An error occurred: {e}")
    
    return 0
    
TN = 0
FP = 0
FN = 0
TP = 0

results_list = []

for i in range(2, len(columns), 3):
    col_base = '-'.join(columns[i].split('-')[:-1])
    print(TN, FP, FN, TP)
    
    if f'{col_base}-Answer' in data.columns and f'{col_base}_llm_graph' in data.columns:
        correct_answer = data[f'{col_base}-Answer']
        llm_answer = data[f'{col_base}_llm_graph']
        true_answer = data[f'{col_base}']
        
        for j in range(len(data)):
            correct = str(correct_answer[j]).strip()  
            llm = str(llm_answer[j]).strip() 
            true = str(true_answer[j]).strip()
            
            if correct in ['No', ''] and llm == 'No answer':
                TN += 1
                results_list.append((col_base, j, correct, llm, true, 'TN', 1.0))
            elif correct in ['No', ''] and llm != 'No answer':
                score = scoring(true, llm, chain_for_negativescoring)
                if score > 0.5:
                    TN += 1
                    results_list.append((col_base, j, correct, llm, true, 'TN', score))
                else:
                    FP += 1
                    results_list.append((col_base, j, correct, llm, true, 'FP', score))
            elif correct not in ['No', ''] and llm == 'No answer':
                FN += 1
                results_list.append((col_base, j, correct, llm, true, 'FN', 0.0))
            elif correct not in ['No', ''] and llm != 'No answer':
                score = scoring(true, llm, chain_for_positivescoring)
                if score > 0.5:
                    TP += 1
                    results_list.append((col_base, j, correct, llm, true, 'TP', score))
                else:
                    FN += 1
                    results_list.append((col_base, j, correct, llm, true, 'FN', score))
    else:
        print(f"Skipping column base: {col_base} due to missing expected columns")

results_df = pd.DataFrame(results_list, columns=['Column Base', 'Row', 'Correct Answer', 'LLM Answer', 'True Answer', 'Result', 'Score'])


results_df.to_csv('scoring_results_graph.csv', index=False)

results_df.head()  
results = {
    'True Negative': TN,
    'False Positive': FP,
    'False Negative': FN,
    'True Positive': TP
}

results

0 0 0 0
10 46 5 1
23 93 7 1
23 93 28 42
26 108 53 61
46 125 70 69
65 136 96 75
73 180 105 76
91 212 116 77
99 253 125 81
Skipping column base:  due to missing expected columns
99 253 125 81
99 253 149 119
101 293 167 121
101 293 202 148
101 293 233 179
104 331 249 184
117 363 260 190
129 404 262 197
137 455 265 197
149 472 281 214
191 488 285 214
197 525 304 214
202 578 308 214
216 621 310 217
240 650 315 221
245 691 325 227
257 739 327 227
265 776 338 233
265 776 355 278
265 776 368 327
272 806 386 334
314 825 387 334
314 825 402 381
348 837 412 387
352 888 419 387
376 925 419 388
385 951 432 402
397 996 435 404
411 1032 446 405
420 1084 447 405
429 1129 454 406


{'True Negative': 472,
 'False Positive': 1139,
 'False Negative': 460,
 'True Positive': 409}

In [77]:
from sklearn.metrics import roc_auc_score

precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0

f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

y_true = [0] * (TN + FP) + [1] * (FN + TP)
y_scores = [0] * TN + [1] * FP + [0] * FN + [1] * TP

roc_auc = roc_auc_score(y_true, y_scores)

print("Results:", results)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

Results: {'True Negative': 1405, 'False Positive': 282, 'False Negative': 830, 'True Positive': 148}
F1 Score: 0.21022727272727273
ROC AUC Score: 0.4920843015820487
