In [11]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")

# The sentences to encode
sentences = [
    "Who are the parties to the Agreement and what are their defined names?",
    "Cloud Investments Ltd. (“Company”) and Jack Robinson (“Advisor”)",
]
# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)

No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(2, 768)
tensor([[1.0000, 0.8051],
        [0.8051, 1.0000]])


In [3]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")

# The sentences to encode
sentences = [
    "Who are the parties to the Agreement and what are their defined names?",
    "Cloud Investments Ltd. (“Company”) and Jack Robinson (“Advisor”)",
]
# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)

  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


(2, 768)
tensor([[1.0000, 0.8051],
        [0.8051, 1.0000]])


In [1]:
import os 
os.chdir("../../")

from src.utils import extract_qa_pairs_to_df

In [2]:
df = extract_qa_pairs_to_df("data/evaluation_sets/Robinson_Q&A.docx")

In [3]:
df.head()

Unnamed: 0,question,ground_truths
0,Who are the parties to the Agreement and what ...,Cloud Investments Ltd. (“Company”) and Jack Ro...
1,What is the termination notice?,According to section 4:14 days for convenience...
2,What are the payments to the Advisor under the...,According to section 6: 1. Fees of $9 per hour...
3,Can the Agreement or any of its obligations be...,1. Under section 1.1 the Advisor can’t assign ...
4,Who owns the IP?,According to section 4 of the Undertaking (App...


In [19]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

def create_sentence_pairs(df, question_col, answer_col, model_name="nlpaueb/legal-bert-base-uncased"):
    """
    Creates sentence pairs (question, answer) from a DataFrame, calculates embeddings, and similarities.

    Args:
        df: The DataFrame containing the questions and answers.
        question_col: The name of the column containing the questions.
        answer_col: The name of the column containing the answers.
        model_name: The name of the SentenceTransformer model to use (default is a legal-domain model).

    Returns:
        A list of tuples, each containing a sentence pair (question, answer), their embeddings, and similarity.
    """

    # Load the SentenceTransformer model
    model = SentenceTransformer(model_name)

    # Initialize a list to store the sentence pairs, embeddings, and similarities
    sentence_pairs_with_data = []

    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        question = row[question_col]
        answer = row[answer_col]

        # Create the sentence pair
        sentence_pair = [question, answer]

        # Calculate embeddings for the sentence pair
        embeddings = model.encode(sentence_pair)

        # Calculate cosine similarity between question and answer embeddings
        similarity = np.dot(embeddings[0], embeddings[1]) / (np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]))

        # Append the sentence pair, embeddings, and similarity to the list
        sentence_pairs_with_data.append((sentence_pair, embeddings, similarity))

    return sentence_pairs_with_data

# Example Usage:
# df = pd.DataFrame({
#     "question": ["Who are the parties to the Agreement?", "What is the termination notice period?"],
#     "answer": ["Cloud Investments Ltd. and Jack Robinson", "30 days"]
# })

sentence_pairs_with_data = create_sentence_pairs(df, "question", "ground_truths")

# Print the results
for pair, embeddings, similarity in sentence_pairs_with_data:
    print(f"Sentence Pair: {pair}")
    # print(f"Embeddings: {embeddings}")
    print(f"Similarity: {similarity}")



No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


Sentence Pair: ['Who are the parties to the Agreement and what are their defined names?', 'Cloud Investments Ltd. (“Company”) and Jack Robinson (“Advisor”)\n']
Similarity: 0.8050569295883179
Sentence Pair: ['What is the termination notice?', 'According to section 4:14 days for convenience by both parties. The Company may terminate without notice if the Advisor refuses or cannot perform the Services or is in breach of any provision of this Agreement.  \n']
Similarity: 0.7069092988967896
Sentence Pair: ['What are the payments to the Advisor under the Agreement? ', 'According to section 6: 1. Fees of $9 per hour up to a monthly limit of $1,500, 2. Workspace expense of $100 per month, 3. Other reasonable and actual expenses if approved by the company in writing and in advance.\n']
Similarity: 0.827538013458252
Sentence Pair: ['Can the Agreement or any of its obligations be assigned?', '1. Under section 1.1 the Advisor can’t assign any of his obligations without the prior written consent of

In [21]:
import pandas as pd
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity

def create_sentence_pairs_with_openai(df, question_col, answer_col, openai_api_key, engine="text-embedding-ada-002"):
    """
    Creates sentence pairs (question, answer) from a DataFrame, calculates OpenAI embeddings, and similarities.

    Args:
        df: The DataFrame containing the questions and answers.
        question_col: The name of the column containing the questions.
        answer_col: The name of the column containing the answers.
        openai_api_key: Your OpenAI API key.
        engine: The OpenAI embedding engine to use (default is "text-embedding-ada-002").

    Returns:
        A list of tuples, each containing a sentence pair (question, answer), their embeddings, and similarity.
    """

    # openai.api_key = openai_api_key  # Set your OpenAI API key

    sentence_pairs_with_data = []

    for index, row in df.iterrows():
        question = row[question_col]
        answer = row[answer_col]

        sentence_pair = [question, answer]

        # Calculate OpenAI embeddings for the sentence pair
        question_embedding = get_embedding(question, engine=engine)
        answer_embedding = get_embedding(answer, engine=engine)
        embeddings = [question_embedding, answer_embedding]

        # Calculate cosine similarity
        similarity = cosine_similarity(question_embedding, answer_embedding)

        sentence_pairs_with_data.append((sentence_pair, embeddings, similarity))

    return sentence_pairs_with_data

# Example Usage:
df = pd.DataFrame({
    "question": ["Who are the parties to the Agreement?", "What is the termination notice period?"],
    "answer": ["Cloud Investments Ltd. and Jack Robinson", "30 days"]
})

# openai_api_key = "YOUR_API_KEY"  # Replace with your actual API key
sentence_pairs_with_data = create_sentence_pairs_with_openai(df, "question", "answer", openai_api_key)

# Print the results
for pair, embeddings, similarity in sentence_pairs_with_data:
    print(f"Sentence Pair: {pair}")
    print(f"Embeddings: {embeddings}")
    print(f"Similarity: {similarity}")


ModuleNotFoundError: No module named 'openai.embeddings_utils'

### OpenaAi embeddings similarity between quiz and answers

In [5]:
import pandas as pd
import openai
from langchain_openai import OpenAIEmbeddings
from langchain.evaluation import load_evaluator

def evaluate_similarity_with_openai(df, question_col, answer_col, engine="text-embedding-ada-002"):
    """
    Evaluates the similarity between question and answer pairs in a DataFrame using OpenAI embeddings.

    Args:
        df: The DataFrame containing the questions and answers.
        question_col: The name of the column containing the questions.
        answer_col: The name of the column containing the answers.
        openai_api_key: Your OpenAI API key.
        engine: The OpenAI embedding engine to use (default is "text-embedding-ada-002").

    Returns:
        A DataFrame with the original question and answer pairs, along with their similarity scores.
    """

    

    # Load the evaluator
    evaluator = load_evaluator("pairwise_embedding_distance", embedding=OpenAIEmbeddings())

    # Initialize lists to store results
    questions = []
    answers = []
    similarities = []

    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        question = row[question_col]
        answer = row[answer_col]

        # Evaluate similarity using OpenAI embeddings
        similarity = evaluator.evaluate_string_pairs(prediction=question, prediction_b=answer)

        questions.append(question)
        answers.append(answer)
        similarities.append(similarity)

    # Create a new DataFrame with the results
    results_df = pd.DataFrame({
        question_col: questions,
        answer_col: answers,
        "similarity": similarities
    })

    return results_df

# Example Usage:
# df = pd.DataFrame({
#     "question": ["Who are the parties to the Agreement?", "What is the termination notice period?"],
#     "answer": ["Cloud Investments Ltd. and Jack Robinson", "30 days"]
# })

results_df = evaluate_similarity_with_openai(df, "question", "ground_truths")

print(results_df)


                                            question  \
0  Who are the parties to the Agreement and what ...   
1                    What is the termination notice?   
2  What are the payments to the Advisor under the...   
3  Can the Agreement or any of its obligations be...   
4                                  Who owns the IP?    
5  Is there a non-compete obligation to the Advisor?   
6              Can the Advisor charge for meal time?   
7             In which street does the Advisor live?   
8       Is the Advisor entitled to social benefits?    
9  What happens if the Advisor claims compensatio...   

                                       ground_truths  \
0  Cloud Investments Ltd. (“Company”) and Jack Ro...   
1  According to section 4:14 days for convenience...   
2  According to section 6: 1. Fees of $9 per hour...   
3  1. Under section 1.1 the Advisor can’t assign ...   
4  According to section 4 of the Undertaking (App...   
5  Yes. During the term of engagement with the 

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

def create_sentence_pairs_multiple_models(df, question_col, answer_col, model_names):
    """
    Creates sentence pairs and calculates similarities using multiple SentenceTransformer models.

    Args:
        df: The DataFrame containing the questions and answers.
        question_col: The name of the column containing the questions.
        answer_col: The name of the column containing the answers.
        model_names: A list of SentenceTransformer model names to use.

    Returns:
        A DataFrame with questions, answers, and similarity scores for each model.
    """

    results = pd.DataFrame(columns=[question_col, answer_col] + model_names)

    for index, row in df.iterrows():
        question = row[question_col]
        answer = row[answer_col]

        sentence_pair = [question, answer]
        results.loc[index, question_col] = question
        results.loc[index, answer_col] = answer

        for model_name in model_names:
            model = SentenceTransformer(model_name)
            embeddings = model.encode(sentence_pair)
            similarity = np.dot(embeddings[0], embeddings[1]) / (np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]))
            results.loc[index, model_name] = similarity

    return results

# Example Usage:
# df = pd.DataFrame({
#     "question": ["Who are the parties to the Agreement?", "What is the termination notice period?"],
#     "answer": ["Cloud Investments Ltd. and Jack Robinson", "30 days"]
# })

model_names = ["nlpaueb/legal-bert-base-uncased", "nlpaueb/bert-base-uncased-contracts", "all-mpnet-base-v2", "all-MiniLM-L6-v2"]  # Add more models as needed
results_df = create_sentence_pairs_multiple_models(df, "question", "ground_truths", model_names)

print(results_df)


  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.
No sentence-transformers model found with name nlpaueb/bert-base-uncased-contracts. Creating a new one with mean pooling.
No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.
No sentence-transformers model found with name nlpaueb/bert-base-uncased-contracts. Creating a new one with mean pooling.
No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.
No sentence-transformers model found with name nlpaueb/bert-base-uncased-contracts. Creating a new one with mean pooling.
No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.
No sentence-transformers model found with name nlpaueb/bert-base-uncased-contracts. Creating a new one with mean pool

                                            question  \
0  Who are the parties to the Agreement and what ...   
1                    What is the termination notice?   
2  What are the payments to the Advisor under the...   
3  Can the Agreement or any of its obligations be...   
4                                  Who owns the IP?    
5  Is there a non-compete obligation to the Advisor?   
6              Can the Advisor charge for meal time?   
7             In which street does the Advisor live?   
8       Is the Advisor entitled to social benefits?    
9  What happens if the Advisor claims compensatio...   

                                       ground_truths  \
0  Cloud Investments Ltd. (“Company”) and Jack Ro...   
1  According to section 4:14 days for convenience...   
2  According to section 6: 1. Fees of $9 per hour...   
3  1. Under section 1.1 the Advisor can’t assign ...   
4  According to section 4 of the Undertaking (App...   
5  Yes. During the term of engagement with the 