In [1]:
# `root` is prepended verbatim to every data path in this notebook
# (e.g. `root + 'data/training_data/train.csv'`), so a non-empty value
# must end with a path separator.

# UNCOMMENT IF USING GOOGLE-DRIVE:
# from google.colab import drive
# drive.mount('/content/drive')
# root = '/content/drive/My Drive/Colab Notebooks/COMP34812/'

# UNCOMMENT IF RUN-POD OR LOCAL
# An empty prefix makes all data paths relative to the current working directory.
root = ''

In [2]:
%cd /content/drive/MyDrive/Colab Notebooks/COMP34812/
!pip install pandas
!pip install python-dotenv

/content/drive/MyDrive/Colab Notebooks/COMP34812
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [2]:
import pandas as pd

# Load the train and dev splits from CSV files located under `root`.
train = pd.read_csv(root + 'data/training_data/train.csv')
dev = pd.read_csv(root + 'data/training_data/dev.csv')

# Expose each frame's row index as an explicit `id` column; the prediction
# cells pass it to predict_label and later cells filter on it.
train['id'] = train.index
dev['id'] = dev.index

# Define Prompt

# Placeholder description of the JSON object the model is asked to return.
# It is handed to predict_label as `json_format`.
json_schema = {
    "thought_process": "<deductive/common-sense reasoning steps>",
    "label": "<0 or 1>"
}

# System prompt handed to predict_label as `sys`. It contains the task
# description, the requested JSON response format (a three-step reasoning
# chain plus a 0/1 label) and one worked premise/hypothesis example.
system_prompt = """You are an expert in natural language reasoning and inference. Your task is to analyze pairs of sentences and determine if the second sentence (hypothesis) can be logically inferred from the first sentence (premise). For each example, I will provide the premise and hypothesis. Your response should be in the following JSON format: {"thought_process": "Step 1. <Identify key information and relationships in the premise, considering logical connections, commonsense understanding, and factual consistency>. Step 2. <Analyze how the hypothesis relates to or contradicts the premise based on the information identified in Step 1. Evaluate if the hypothesis can be reasonably inferred from the premise>. Step 3. <Explain your final reasoning and conclusion on whether the hypothesis is entailed by the premise or not>", "label": "<0 for no entailment, 1 for entailment>"}
Please provide a clear multi-step reasoning chain explaining how you arrived at your final answer, breaking it down into logical components. Ground your response in the given information, logical principles and common-sense reasoning.

Example:

Premise: The dog chased the cat up the tree. Hypothesis: The cat climbed the tree.
Your response: {"thought_process": "Step 1: the premise indicates a scenario where a dog chases a cat, resulting in the cat moving up a tree. The movement 'up the tree' suggests a vertical ascent, typical of climbing behavior. It is common sense that a cat would climb a tree to escape a chasing dog, and there are no known facts that contradict the premise or hypothesis. Step 2: 'The cat climbed the tree' can be logically inferred from the premise because the action of climbing is a reasonable and necessary part of the cat moving 'up the tree' as described. Thus, the hypothesis logically follows from the premise. Step 3: Based on the logical reasoning, common sense, and lack of contradictory facts, the hypothesis can be inferred from the premise.", "label": 1}
"""

In [3]:
import os
from dotenv import load_dotenv

from llm.mistral import Mistral
from service.prediction_service import predict_label

# Let python-dotenv populate the process environment, then look up the
# Mistral API key. The key is None when the variable is not set.
load_dotenv()
API_KEY = os.environ.get('MISTRAL_API_KEY')

In [14]:
start_index, end_index = 0, train.shape[0]

# Generate predictions and thoughts for train set.
# The results are computed first and then written back with a single `.loc`
# assignment. The previous `train['response_json'].iloc[...] = ...` form was
# chained assignment on a column that is never created beforehand: it raised
# KeyError only after the whole (expensive) run had finished, and chained
# assignment does not reliably write through to `train` in any case.
train_batch = train.iloc[start_index:end_index]
train_responses = train_batch.apply(lambda x: predict_label(
    id=x['id'],
    sys=system_prompt,
    premise=x['premise'],
    hypothesis=x['hypothesis'],
    true_label=x['label'],
    # A client is constructed per row; it is deliberately not hoisted in case
    # the client carries per-conversation state -- TODO confirm.
    llm=Mistral(API_KEY),
    model_name='open-mistral-7b',
    json_format=json_schema,
    json_filepath=f'{root}data/json_data/thoughts_{start_index}_{end_index}.json'
), axis=1)

# `.loc` creates the `response_json` column if it does not exist yet and
# aligns the results on the row labels of the processed slice.
train.loc[train_batch.index, 'response_json'] = train_responses

KeyboardInterrupt: 

In [ ]:
start_index, end_index = 0, dev.shape[0]

# Generate predictions and thoughts for dev set.
# The results are computed first and then written back with a single `.loc`
# assignment. The previous `dev['response_json'].iloc[...] = ...` form was
# chained assignment on a column that is never created beforehand: it raised
# KeyError only after the whole (expensive) run had finished, and chained
# assignment does not reliably write through to `dev` in any case.
dev_batch = dev.iloc[start_index:end_index]
dev_responses = dev_batch.apply(lambda x: predict_label(
    id=x['id'],
    sys=system_prompt,
    premise=x['premise'],
    hypothesis=x['hypothesis'],
    true_label=x['label'],
    # A client is constructed per row; it is deliberately not hoisted in case
    # the client carries per-conversation state -- TODO confirm.
    llm=Mistral(API_KEY),
    model_name='open-mistral-7b',
    json_format=json_schema,
    json_filepath=f'{root}data/json_data_dev/thoughts_{start_index}_{end_index}.json'
), axis=1)

# `.loc` creates the `response_json` column if it does not exist yet and
# aligns the results on the row labels of the processed slice.
dev.loc[dev_batch.index, 'response_json'] = dev_responses

In [4]:
# Read JSON data into dataframes and concatenate
import glob
# Collect every .json file in the train / dev output folders (presumably the
# files produced through predict_label's `json_filepath` argument -- confirm).
# Sorting fixes the order in which the files are read and concatenated.
train_paths = sorted(glob.glob(root + 'data/json_data/*.json'))
dev_paths = sorted(glob.glob(root + 'data/json_data_dev/*.json'))

def read_data(paths: list[str]):
  """Load JSON-lines result files and return (all rows, correct rows only).

  Each file is parsed with `pd.read_json(..., lines=True)` and gains a
  `thoughts_len` column holding `len()` of its `thoughts` value. Note that
  for a list-valued `thoughts` this is the number of items, not a
  character count.

  Args:
    paths: Paths of the JSON-lines files to read, in the desired order.

  Returns:
    A tuple `(full, filtered)`: `full` is the concatenation of every file,
    `filtered` is the concatenation of only the rows whose `correct` value
    is truthy. The per-file row indices are kept as they are.
  """
  frames = []
  for path in paths:
    frame = pd.read_json(path, lines=True)
    frame['thoughts_len'] = frame['thoughts'].apply(len)
    frames.append(frame)

  # Boolean-mask each frame on `correct` to keep the rows predicted correctly.
  correct_frames = [frame[frame['correct']] for frame in frames]

  return pd.concat(frames), pd.concat(correct_frames)

# `full_*` holds every row that was read; `filtered_*` holds only the rows
# flagged as correct (see read_data).
full_data, filtered_data = read_data(train_paths)
full_data_dev, filtered_data_dev = read_data(dev_paths)

In [5]:
# Analyse skew of data: class balance of the gold labels versus the model's
# predictions. The two counts are printed separately so their reprs do not
# run together on a single line.
print(full_data['true_label'].value_counts())
print(full_data['prediction'].value_counts())
filtered_data

true_label
1    13449
0    12687
Name: count, dtype: int64 prediction
0    16850
1     9286
Name: count, dtype: int64


Unnamed: 0,id,premise,hypothesis,thoughts,prediction,true_label,correct,chat_history,reprompt_counts,thoughts_len
0,0,"However, Fort Charles was rebuilt as a militar...",Fort Charles was rebuilt as an amusement park ...,"step 1: in the premise, fort charles is descri...",0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,716
1,1,Buchanan's The Democrats and Republicans have...,THe parties will never be similar.,"step 1: in the premise, buchanan expresses the...",0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,1014
2,2,In order to review an acquisition that is usin...,The auditor only reviews the acquisition itsel...,"step 1: in the premise, the context is about a...",0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,870
3,3,Three young people sit outside and engage with...,There is a tin can and string telephone.,"step 1: in the premise, three young people are...",0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,576
4,4,The lucrative tin mines of Kuala Lumpur in the...,The Chinese labor was seen as less costly and ...,"step 1: in the premise, it is mentioned that c...",1,1,True,"[{'role': 'system', 'content': '[INST] You are...",0,1126
...,...,...,...,...,...,...,...,...,...,...
898,26939,Information in agencies' plans and reports pro...,"Thanks to agencies' plans and reports, over $3...",step 1: the premise indicates that information...,0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,1552
899,26940,"He is the Mr. Magoo of scientific theory, geni...",He understands everything he can't see.,"step 1: in the premise, daniel mendelsohn desc...",0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,1083
900,26941,"Over the past 25 years, the Postal Service has...",Classifying mail is important to the function ...,"step 1: in the premise, it is mentioned that t...",1,1,True,"[{'role': 'system', 'content': '[INST] You are...",0,831
901,26942,Whoever first stepped ashore on Madeira discov...,The British discovered the Canary Islands first.,"step 1: in the premise, it is stated that the ...",0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,809


In [17]:
# Evaluate metrics of augmented predictions
# Keep a single row per example `id` (the first occurrence) before scoring.
# NOTE(review): only the train frames are de-duplicated here; the dev frames
# are left as read -- confirm that is intended.
full_data = full_data.drop_duplicates(subset='id')
filtered_data = filtered_data.drop_duplicates(subset='id')

def evaluate_metrics(df):
    """Compute precision, recall, accuracy and F1 for the positive class.

    The confusion counts are derived from two columns: `correct` (whether the
    prediction matched) and `true_label` (0 or 1). A correct row with label 1
    is a true positive, a correct row with label 0 a true negative, an
    incorrect row with label 0 a false positive and an incorrect row with
    label 1 a false negative (this assumes binary labels).

    Any ratio whose denominator is zero (an empty frame, or a frame with no
    positive rows) is reported as 0.0 instead of raising ZeroDivisionError,
    so sparse sub-selections can still be evaluated.

    Args:
        df: DataFrame with `correct` and `true_label` columns.

    Returns:
        dict with the keys 'precision', 'recall', 'accuracy' and 'f1'.
    """
    is_correct = df['correct'] == True
    is_wrong = df['correct'] == False
    is_positive = df['true_label'] == 1
    is_negative = df['true_label'] == 0

    # Convert to plain ints so the arithmetic below is ordinary Python division.
    true_positive = int((is_correct & is_positive).sum())
    true_negative = int((is_correct & is_negative).sum())
    false_positive = int((is_wrong & is_negative).sum())
    false_negative = int((is_wrong & is_positive).sum())

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 rather than crashing the analysis.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(true_positive, true_positive + false_positive)
    recall = _ratio(true_positive, true_positive + false_negative)
    accuracy = _ratio(true_positive + true_negative, df.shape[0])
    f1 = _ratio(2 * (precision * recall), precision + recall)

    return {'precision': precision, 'recall': recall, 'accuracy': accuracy, 'f1': f1}

# Show the metrics for the complete set of training predictions.
evaluate_metrics(full_data)

In [None]:
from pprint import pprint
def analyse_thought_len(df):
  """Print prediction metrics for several `thoughts_len` buckets.

  Analyses the distribution of thought-explanation length and its effect on
  predictions. For the whole frame and for each length bucket, the result of
  `evaluate_metrics` (defined in another cell) is pretty-printed; for every
  bucket the shape of the selected subset is printed as well.

  Each bucket's filter uses exactly the thresholds named in its label.
  Previously '> 1100' filtered on 1000, '350 < x < 1000' on 300 and
  '1000 < x < 1100' on the 300..1000 range.

  Args:
    df: DataFrame with a numeric `thoughts_len` column plus the columns
      that `evaluate_metrics` reads.
  """
  lengths = df['thoughts_len']
  # (label, row mask); a mask of None selects the whole frame.
  buckets = [
      ('ALL', None),
      ('< 350', lengths < 350),
      ('> 1000', lengths > 1000),
      ('> 1100', lengths > 1100),
      ('350 < x < 1000', (lengths > 350) & (lengths < 1000)),
      ('1000 < x < 1100', (lengths > 1000) & (lengths < 1100)),
  ]

  print('CHARACTER LENGTH')
  for label, mask in buckets:
    print('\n*********\n')
    print(label)
    subset = df if mask is None else df[mask]
    try:
      pprint(evaluate_metrics(subset))
    except ZeroDivisionError:
      # A bucket without rows (or without positive rows) has no defined
      # precision/recall; report that and carry on with the next bucket.
      print('metrics undefined for this bucket (zero denominator)')
    if mask is not None:
      print(subset.shape)

# Run the length analysis on all train predictions, then on all dev predictions.
analyse_thought_len(full_data)
analyse_thought_len(full_data_dev)

In [6]:
def clean_shave(df):
  """Normalise `thoughts` to strings and keep rows within the length range.

  List-valued thoughts are joined into one string with '. '. The
  `thoughts_len` column is then recomputed from the cleaned text, so the
  filter works on character counts for every row. Previously a list row kept
  its item count as length and was therefore always dropped by the filter.

  As before, `df` is modified in place (`thoughts`, and now `thoughts_len`
  too); the returned frame is a filtered selection of it.

  Args:
    df: DataFrame with a `thoughts` column of strings or lists of strings.

  Returns:
    The rows of `df` whose cleaned thought text is longer than 300 and
    shorter than 1200 characters.
  """
  # Clean thoughts; expand lists to string
  df['thoughts'] = df['thoughts'].apply(lambda x: '. '.join(x) if isinstance(x, list) else x)
  # Refresh the length so it reflects the cleaned text rather than the raw value
  df['thoughts_len'] = df['thoughts'].apply(len)
  # Shave thoughts exceeding character range
  return df[(df['thoughts_len'] > 300) & (df['thoughts_len'] < 1200)]

# Apply the cleaning and length filter to the correctly predicted train / dev rows.
filtered_train = clean_shave(filtered_data)
filtered_dev = clean_shave(filtered_data_dev)

In [38]:
# Retrieve the complement of the full data against the filtered data
# NOTE(review): the complement is taken against `filtered_data` (all correct
# rows), not against `filtered_train` (correct rows that also passed the
# length filter). Rows removed only by the length filter therefore end up in
# neither the keep file nor the drop file -- confirm this is intended.
# NOTE(review): `train_ids` / `dev_ids` are not used anywhere in this cell.
train_ids = pd.DataFrame({'id': range(0, train.shape[0])})
dropped_train = train[~train['id'].isin(filtered_data['id'])]

dev_ids = pd.DataFrame({'id': range(0, dev.shape[0])})
dropped_dev = dev[~dev['id'].isin(filtered_data_dev['id'])]

# Persist filtered data for further-processing
# (the target directory must already exist; the frame index is written as the first column)
filtered_train.to_csv(root + 'data/training_data/filtered/train_keep.csv')
filtered_dev.to_csv(root + 'data/training_data/filtered/dev_keep.csv')
dropped_train.to_csv(root + 'data/training_data/filtered/train_drop.csv')
dropped_dev.to_csv(root + 'data/training_data/filtered/dev_drop.csv')

In [10]:
# Display the filtered training rows. `filtered_train` is created by the
# clean_shave cell, which must have been run in the current kernel session.
filtered_train

NameError: name 'filtered_train' is not defined

In [ ]:
# NOTE(review): `filtered` is not defined in any visible cell of this notebook;
# presumably `filtered_train` or `filtered_dev` was meant -- confirm or remove this cell.
filtered

In [ ]:
# Format data for fine-tuning
import json

def extract_prompt(premise, hypothesis):
  """Build the instruction-wrapped prompt for one premise/hypothesis pair.

  The result has the form "[INST] <task instructions> \\n<pair> \\n[/INST]",
  where <pair> is "Premise: '...' Hypothesis: '...' Label:".

  Args:
    premise: Premise text, interpolated as-is.
    hypothesis: Hypothesis text, interpolated as-is.

  Returns:
    The complete prompt string.
  """
  # Fixed task instructions, including the requested JSON response format.
  instructions = """You are an expert in natural language reasoning and inference. Your task is to analyze pairs of sentences and determine if the second sentence (hypothesis) can be logically inferred from the first sentence (premise). For each example, I will provide the premise and hypothesis. Your response should be in the following JSON format: {"thought_process": "Step 1. <Identify key information and relationships in the premise, considering logical connections, commonsense understanding, and factual consistency>. Step 2. <Analyze how the hypothesis relates to or contradicts the premise based on the information identified in Step 1. Evaluate if the hypothesis can be reasonably inferred from the premise>. Step 3. <Explain your final reasoning and conclusion on whether the hypothesis is entailed by the premise or not>", "label": "<0 for no entailment, 1 for entailment>"} Please provide a clear multi-step reasoning chain explaining how you arrived at your final answer, breaking it down into logical components. Ground your response in the given information, logical principles and common-sense reasoning."""
  # The example itself; named so it does not shadow the builtin `input`.
  pair_text = f"Premise: '{premise}' Hypothesis: '{hypothesis}' Label:"

  return f"[INST] {instructions} \n{pair_text} \n[/INST]"

def extract_completion(thoughts, prediction):
  """Serialise one target completion as a JSON string.

  NOTE(review): the keys written here are "thoughts" and "prediction",
  whereas the prompt built by extract_prompt asks the model for
  "thought_process" and "label" -- confirm the mismatch is intended.

  Args:
    thoughts: Reasoning text for the example.
    prediction: Predicted label for the example.

  Returns:
    JSON text of an object with the keys "thoughts" and "prediction".
  """
  return json.dumps({"thoughts": thoughts, "prediction": prediction})

def format_ft_data(df):
  """Turn each row into one "<s>prompt \\n\\ncompletion</s>" training string.

  Adds `prompt` and `completion` columns to `df` in place (built with
  extract_prompt and extract_completion) and then combines them row by row.

  Args:
    df: DataFrame with `premise`, `hypothesis`, `thoughts` and `prediction`
      columns.

  Returns:
    The row-wise result of combining prompt and completion into one string.
  """
  def _row_prompt(row):
    return extract_prompt(row['premise'], row['hypothesis'])

  def _row_completion(row):
    return extract_completion(row['thoughts'], row['prediction'])

  def _row_sample(row):
    return f"<s>{row['prompt']} \n\n{row['completion']}</s>"

  df['prompt'] = df.apply(_row_prompt, axis=1)
  df['completion'] = df.apply(_row_completion, axis=1)
  return df.apply(_row_sample, axis=1)

# Build the fine-tuning samples from the filtered train / dev rows, which
# carry the premise, hypothesis, thoughts and prediction columns that
# format_ft_data reads. The inputs were previously `ft_train` / `ft_dev`
# themselves, which no earlier cell defines (NameError on a fresh kernel) and
# which made the cell non-repeatable.
# `.copy()` keeps the columns added by format_ft_data out of the filtered frames.
ft_train = format_ft_data(filtered_train.copy())
ft_dev = format_ft_data(filtered_dev.copy())