In [16]:
import pandas as pd

# Trial NLI examples loaded from the project's data folder.
# NOTE(review): presumably this is the frame later passed to
# generate_predictions(), which reads 'premise', 'hypothesis' and 'label'
# columns -- confirm the CSV provides them.
df = pd.read_csv(filepath_or_buffer="data/NLI_trial.csv")
# Label-only system prompt, variant 1.
# The entailment direction is stated as premise -> hypothesis (the same
# direction vanilla_3 uses), and the inline example uses double quotes so that
# it is itself the "valid JSON object" the prompt asks the model to return.
vanilla_1 = (
    "Your task is to solve NLI tasks. Given a premise and a hypothesis, you will "
    "determine whether the premise entails (1) or doesn't entail (0) the hypothesis. "
    "You should return your response as a JSON object with the 'label' key, "
    "containing the value 1 or 0. "
    'For example, {"label": 1}. '
    "You will not provide any other information in your response besides the valid "
    "JSON object containing only the predicted label."
)

# Label-only system prompts, variants 2-4. Each literal is wrapped across
# several adjacent string pieces purely for readability; Python joins them at
# compile time, so the resulting prompt text is unchanged.
vanilla_2 = (
    "You are an AI assistant trained to classify the relationship between a "
    "given premise and hypothesis. Provide your prediction as a JSON object "
    "with the key 'label' and the value as either 0 (no entailment) or 1 "
    "(entailment). "
)

vanilla_3 = (
    'Your task is to determine if a given hypothesis is entailed by a premise. '
    'Output your prediction in the following JSON format: {"label": 0} for no '
    'entailment or {"label": 1} for entailment.'
)

vanilla_4 = (
    'As an AI language model, your role is to classify the entailment '
    'relationship between a premise and a hypothesis. Provide your prediction '
    'in JSON format with the key "label" and the value as 0 or 1, where 0 '
    'indicates no entailment and 1 indicates entailment.'
)

# Chain-of-thought system prompt, variant 1: the model writes its reasoning
# under 'thoughts' before emitting 'label'.
# The entailment direction is stated as premise -> hypothesis, and the inline
# example uses double-quoted keys/strings so the model is shown JSON syntax
# rather than a Python dict literal ("0 or 1" remains a placeholder).
cot_1 = (
    "Your task is to solve NLI tasks. Given a premise and a hypothesis, you will "
    "determine whether the premise entails (1) or doesn't entail (0) the hypothesis. "
    "You can use any information you have to make your decision, which can be "
    "deduced from logical reasoning, entity recognition, or general common-sense. "
    "You should return your response as a json object with the key 'thoughts' to "
    "contain your logical reasoning and deduction steps, which should always be "
    "generated BEFORE the 'label' key which contains the prediction value 0 or 1. "
    "Do not provide any other information in your response besides the valid JSON "
    "object. "
    'For example, {"thoughts": "your deductive/common-sense reasoning steps here", '
    '"label": 0 or 1}.'
)

# Chain-of-thought system prompt, variant 2: reasoning is returned as a JSON
# list of three steps, followed by an integer label.
# The worked example at the end is valid JSON and has the same shape as the
# schema above it (list of steps, comma between the keys, integer label), so
# a model that imitates the example produces parseable output.
cot_2 = """You are an expert in natural language reasoning and inference. Your task is to analyze pairs of sentences and determine if the second sentence (hypothesis) can be logically inferred from the first sentence (premise), while considering logical rules, common sense, and factual accuracy.

For each example, I will provide the premise and hypothesis. Your response should be in the following JSON format:
{
    "thought_process":
        [
            "step 1: <Identify key information and relationships in the premise, considering logical connections, commonsense understanding, and factual consistency>",
            "step 2: <Analyze how the hypothesis relates to or contradicts the premise based on the information identified in Step 1. Evaluate if the hypothesis can be reasonably inferred from the premise>",
            "step 3: <Explain your final reasoning and conclusion on whether the hypothesis is entailed by the premise or not>"
        ],
    "label": <0 for no entailment, 1 for entailment>
}
Please provide a clear multi-step reasoning chain explaining how you arrived at your final answer, breaking it down into logical components. Ground your response in the given information, logical principles, common sense, and factual knowledge. Identify any inconsistencies or contradictions.

Example:
Premise: The dog chased the cat up the tree. Hypothesis: The cat climbed the tree.

Your response:
{
    "thought_process":
        [
            "step 1: The premise indicates that the cat was chased up the tree by the dog, suggesting the cat climbed the tree. It is common sense that a cat would climb a tree to escape a chasing dog, and there are no known facts that contradict the premise or hypothesis.",
            "step 2: The hypothesis logically follows from the premise, as the cat climbing the tree is a reasonable consequence of being chased up the tree by the dog.",
            "step 3: Based on the logical reasoning, common sense, and lack of contradictory facts, the hypothesis can be inferred from the premise."
        ],
    "label": 1
}
"""

# Chain-of-thought system prompt, variant 3: reasoning is returned as a single
# string walking through four steps, followed by an integer label.
# The worked example is a complete, valid JSON object covering all four steps
# (a JSON string cannot contain raw line breaks, so the steps share one line).
# NOTE(review): the example reasoning text was written to complete a
# previously truncated example -- adjust the wording if a different
# demonstration is preferred.
cot_3 = """You are an expert in natural language reasoning and inference. Your task is to analyze pairs of sentences and determine if the second sentence (hypothesis) can be logically inferred from the first sentence (premise), while considering logical rules, common sense, and factual accuracy.

For each example, I will provide the premise and hypothesis. Your response should be in the following JSON format:
{
    "thought_process": "
    Step 1. <Analyze the premise, identifying key information, logical rules and connections.>
    Step 2. <Use common-sense reasoning and world knowledge to verify consistency.>
    Step 3. <Analyze the logical connections and contradictions between the premise and hypothesis. Evaluate if the hypothesis can be reasonably inferred from the premise>.
    Step 4. <Explain your final reasoning and conclusion on whether the hypothesis is entailed by the premise or not>",
    "label": <0 for no entailment, 1 for entailment>
}
Please provide a clear multi-step reasoning chain explaining how you arrived at your final answer, breaking it down into logical components. Ground your response in the given information, logical principles, common sense, and factual knowledge. Identify any inconsistencies or contradictions.

Example:
Premise: The dog chased the cat up the tree. Hypothesis: The cat climbed the tree.

Your response:
{
  "thought_process": "Step 1. The premise states that the dog chased the cat up the tree, so the cat ended up in the tree as a result of the chase. Step 2. It is common sense that a cat escapes a chasing dog by climbing, and nothing in the premise conflicts with world knowledge. Step 3. The hypothesis that the cat climbed the tree follows directly from the cat being chased up the tree, and there is no contradiction between the two sentences. Step 4. Because the hypothesis is a reasonable consequence of the premise, the hypothesis is entailed by the premise.",
  "label": 1
}
"""

In [2]:
from service.prediction_service import predict_label
from llm.mistral import Mistral
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment;
# generate_predictions() below reads MISTRAL_API_KEY through os.getenv.
load_dotenv()

def generate_predictions(df, sys, file_path):
    """Label every row of ``df`` with the LLM and score the predictions.

    For each row, ``predict_label`` is called with the system prompt, the
    row's premise and hypothesis, a Mistral client and the
    ``open-mistral-7b`` model name. The per-row results are written to
    ``data/NLI_trial_predictions_{file_path}.csv``.

    Args:
        df: DataFrame with ``premise``, ``hypothesis`` and ``label`` columns.
            It is modified in place: ``response_json``, ``predicted_label``
            and ``correct`` columns are added (or overwritten).
        sys: System prompt text passed to ``predict_label``. The name shadows
            the standard-library ``sys`` module inside this function; it is
            kept so existing keyword callers continue to work.
        file_path: Suffix inserted into the output CSV file name.

    Returns:
        Tuple ``(precision, recall, accuracy, f1)``. ``precision``, ``recall``
        and ``f1`` are Series indexed by the gold labels present in ``df``;
        ``accuracy`` is a float. A per-class value is NaN where it is
        undefined (e.g. precision for a class that was never predicted).
    """
    # Build the client once instead of re-reading the API key and constructing
    # a new client for every row.
    client = Mistral(os.getenv('MISTRAL_API_KEY'))
    df['response_json'] = df.apply(
        lambda row: predict_label(
            sys, row['premise'], row['hypothesis'], client, model_name='open-mistral-7b'
        ),
        axis=1,
    )
    df['predicted_label'] = df['response_json'].apply(lambda response: response['label'])
    # NOTE(review): this equality assumes the model's label has the same type
    # as the gold label (e.g. int 1, not the string "1") -- confirm what
    # predict_label returns.
    df['correct'] = df['predicted_label'] == df['label']

    output_path = f'data/NLI_trial_predictions_{file_path}.csv'
    # Make sure the target folder exists so the write cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df[['premise', 'hypothesis', 'label', 'predicted_label', 'correct', 'response_json']].to_csv(
        output_path, index=False
    )

    # Per-class counts, all aligned on the gold labels that occur in df.
    support = df.groupby('label').size()                      # rows whose gold label is c
    true_positives = (
        df[df['correct']].groupby('label').size().reindex(support.index, fill_value=0)
    )                                                         # gold == predicted == c
    predicted_counts = (
        df.groupby('predicted_label').size().reindex(support.index, fill_value=0)
    )                                                         # rows predicted as c

    # precision_c = TP_c / predicted_c ; recall_c = TP_c / support_c
    precision = true_positives / predicted_counts
    recall = true_positives / support
    f1 = 2 * (precision * recall) / (precision + recall)
    # Mean of the boolean column; unlike value_counts(...)[True] this does not
    # raise when no prediction is correct.
    accuracy = float(df['correct'].mean())

    return precision, recall, accuracy, f1
