In [1]:
# --- Storage root ------------------------------------------------------------
# Google Drive (Colab): enable the three lines below to mount Drive and point
# `root` at the project folder.
# from google.colab import drive
# drive.mount('/content/drive')
# root = '/content/drive/My Drive/Colab Notebooks/COMP34812/'

# RunPod or local run: an empty prefix, so every `root + 'data/...'` path is
# relative to the current working directory.
root = ""

In [2]:
import pandas as pd

# train = pd.read_csv(root + 'data/training_data/filtered/train_keep.csv')
# dev = pd.read_csv(root + 'data/training_data/filtered/dev_keep.csv')
# 
# train

# Read JSON data into dataframes and concatenate
import glob
# Collect the per-split JSON files; sorting keeps the file order stable.
train_paths = sorted(glob.glob(f'{root}data/json_data/*.json'))
dev_paths = sorted(glob.glob(f'{root}data/json_data_dev/*.json'))

def read_data(paths: list[str]):
  """Read JSON Lines files into one concatenated DataFrame.

  Each file is parsed with ``pd.read_json(..., lines=True)`` and gains a
  ``thoughts_len`` column holding ``len()`` of every value in its
  ``thoughts`` column. The per-file frames are then concatenated as-is, so
  the original per-file index values are kept (callers reset the index).

  Args:
    paths: Paths of the JSON Lines files to read.

  Returns:
    A DataFrame with the rows of all files, in the order of ``paths``.

  Raises:
    ValueError: If ``paths`` is empty (e.g. the glob pattern matched nothing).
  """
  paths = list(paths)
  if not paths:
    # pd.concat([]) would raise a ValueError anyway, but with a message that
    # does not point at the real cause (no input files were found).
    raise ValueError(
      'No JSON files to read: `paths` is empty (check `root` and the glob pattern).'
    )

  frames = []
  for file in paths:
    df = pd.read_json(file, lines=True)
    df['thoughts_len'] = df['thoughts'].apply(len)
    frames.append(df)
  return pd.concat(frames)

# Build each split the same way: load, keep the first row per id, order by id,
# and renumber the index 0..n-1. Written as one non-mutating chain so that
# re-running the cell always starts from the files on disk (the original
# de-duplicated `train` twice and mutated the frames in place).
train = (
    read_data(train_paths)
    .drop_duplicates(subset=['id'])
    .sort_values(by='id')
    .reset_index(drop=True)
)

dev = (
    read_data(dev_paths)
    .drop_duplicates(subset=['id'])
    .sort_values(by='id')
    .reset_index(drop=True)
)

In [3]:
import os
from dotenv import load_dotenv

from llm.mistral import Mistral
from service.scoring_service import generate_score

# Load environment variables via python-dotenv, then read the Mistral API key
# from the environment (None when the variable is not set).
load_dotenv()
API_KEY = os.environ.get('MISTRAL_API_KEY')

In [32]:
import json

# Template of the JSON reply requested from the scoring model. It is serialised
# into `system_prompt` with json.dumps and also passed to generate_score as
# `json_format`. Values are placeholder descriptions, not real data.
json_schema = dict(
    analysis=" <Analysis of thought-chains usefulness, relevance, and logical coherence>",
    score="<A score integer between 1 and 5 inclusive>",
    improved_thoughts="Step 1. <Identify key information and relationships in the premise, considering logical connections, commonsense understanding, and factual consistency>. Step 2. <Analyze how the hypothesis relates to or contradicts the premise based on the information identified in Step 1. Evaluate if the hypothesis can be reasonably inferred from the premise>. Step 3. <Explain your final reasoning and conclusion on whether the hypothesis is entailed by the premise or not>",
    label="<0 for no-entailment, 1 for entailment>",
)

# System prompt for the scoring model: describes the NLI thought-chain scoring
# task and the 1-5 scale, and embeds the expected reply format (json_schema
# serialised with json.dumps). The text is runtime data sent to the model, so
# its exact wording and whitespace matter.
# NOTE(review): the prompt says "whether the hypothesis entails the premise",
# while json_schema asks whether "the hypothesis is entailed by the premise" --
# confirm which direction is intended before re-running the scoring.
system_prompt = f"""You are an expert in natural language reasoning and inference. I will give you the predictions and chain-of-thought reasoning steps for an NLI binary-classification task involving premise, hypothesis, and whether the hypothesis entails the premise. Your goal is to score the intermediate thought-process used to generate these predictions. Your score should be between 1-5 inclusive(with 5 being the best, 3 indicating room for improvement and 1 being the worst) based on the thought-chains' logical coherence, relevance to the premise and hypothesis, use of common-sense reasoning, and effectiveness in determining the entailment relationship. Provide a brief analysis/justification before generating each score, followed by a revised and improved version of the thought chains. The shorter the thought-chains, the better. Thoughts with clear room for improvement should be scored 3 or lower.
 
 Your response should be in the following JSON format: {json.dumps(json_schema)}
 
Keep your thoughts and analysis concise.
"""

def generate_prompt(p, h, l, t):
    """Format one example as the user message for the scoring model.

    Args:
        p: Premise text.
        h: Hypothesis text.
        l: Predicted label.
        t: Thought-process (chain-of-thought) text.

    Returns:
        A string that starts and ends with a newline and lists the premise,
        hypothesis, thought-process and predicted label, one per line.
    """
    return (
        f"\nPremise: {p}"
        f"\nHypothesis: {h}"
        f"\nThought-Process: {t}"
        f"\nPredicted Label: {l}\n"
    )

# Slice of `train` covering 14000 <= id < 17000.
# The bounds are computed as row POSITIONS with searchsorted instead of reading
# index labels: labels only equal positions right after reset_index, so feeding
# them to iloc picks the wrong rows once `train` has been filtered. This also
# avoids an IndexError when a boundary id is missing from the data.
# Relies on `train` being sorted by ascending id (done when it is loaded).
start_index = int(train['id'].searchsorted(14000, side='left'))
end_index = int(train['id'].searchsorted(17000, side='left'))
train_sample = train.iloc[start_index:end_index]

# train_sample['score_json'] = train_sample.apply(lambda x: generate_score(
#     id=x['id'],
#     sys=system_prompt, 
#     premise=x['premise'], 
#     hypothesis=x['hypothesis'],
#     thoughts=x['thoughts'],
#     predicted_label=x['prediction'],
#     true_label=x['true_label'],
#     llm=Mistral(os.getenv('MISTRAL_API_KEY')), 
#     model_name='open-mixtral-8x7b',
#     json_format=json_schema,
#     json_filepath=f'{root}data/json_data/score'
# ), axis=1)

# Score the single example with id 1877.
# The row is selected with a boolean mask rather than
# `train.iloc[<index labels>]`: iloc expects positions, and index labels stop
# matching positions as soon as `train` is filtered without resetting its index.
train.loc[train['id'] == 1877].apply(
    lambda x: generate_score(
        id=x['id'],
        sys=system_prompt, 
        premise=x['premise'], 
        hypothesis=x['hypothesis'],
        thoughts=x['thoughts'],
        predicted_label=x['prediction'],
        true_label=x['true_label'],
        llm=Mistral(os.getenv('MISTRAL_API_KEY')), 
        model_name='open-mixtral-8x7b',
        json_format=json_schema,
        json_filepath=f'{root}data/json_data/score'
    ), axis=1
)

In [4]:
# Load gold_standard.json from the score folder (JSON Lines: one record per line).
_gs_path = f'{root}data/json_data/score/gold_standard.json'
gs = pd.read_json(_gs_path, lines=True)

In [26]:
# Keep only the records whose id is below 16000, in both frames.
train = train.loc[train['id'].lt(16000)]
gs = gs.loc[gs['id'].lt(16000)]

In [27]:
import pandas as pd

# IDs present in the gold-standard frame (`gs`) and in the training frame.
set_ids_df1 = set(gs['id'])
set_ids_df2 = set(train['id'])

# IDs that appear on one side only.
unique_to_df1 = set_ids_df1.difference(set_ids_df2)
unique_to_df2 = set_ids_df2.difference(set_ids_df1)

# Symmetric difference: every id that is missing from either frame.
non_overlapping_ids = unique_to_df1 | unique_to_df2
non_overlapping_ids

{210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 262,
 263,
 487,
 488,
 489,
 490,
 491,
 492,
 493,
 494,
 495,
 497,
 499,
 500,
 502,
 503,
 504,
 1616,
 1683,
 1692,
 1696,
 1795,
 1803,
 1815,
 1825,
 1826,
 1866,
 1888,
 1908,
 1914,
 1921,
 1951,
 1988,
 1994,
 2029,
 2077,
 2105,
 2124,
 2158,
 2190,
 2202,
 2254,
 2260,
 2266,
 2314,
 2327,
 2338,
 2367,
 2393,
 2418,
 2424,
 2480,
 2490,
 2507,
 2508,
 2513,
 2526,
 2559,
 2594,
 2626,
 2627,
 2637,
 2672,
 2693,
 2741,
 2767,
 2828,
 2862,
 2868,
 2869,
 2870,
 2888,
 2929,
 2931,
 2946,
 2955,
 2958,
 2972,
 2973,
 2979,
 2991,
 3000,
 3001,
 3002,
 3004,
 3005,
 3006,
 3007,
 3008,
 3009,
 3010,
 3011,
 3012,
 3013,
 3014,
 3015,
 3016,
 3017,
 3019,
 3020,
 3021,
 3022,
 3023,
 3024,
 3025,
 3026,
 3027,
 3028,
 3029,
 3030,
 3031,
 3032,
 3034,
 3035,
 3036,
 3037,
 3038,
 3039,
 3040,
 3041,
 3042,
 3043,
 3044,
 3045,
 3046,
 3047,
 3049,
 3050,
 3051,
 3052,
 3053,
 3054,
 3055,
 3056,
 3057,
 3058,
 3059,
 3060,
 30

In [29]:
# Narrow `train` to the rows whose id is in `non_overlapping_ids`, then display it.
train = train.loc[lambda frame: frame['id'].isin(non_overlapping_ids)]
train

Unnamed: 0,id,premise,hypothesis,thoughts,prediction,true_label,correct,chat_history,reprompt_counts,thoughts_len
202,210,What the hell Tommy interrupted him.,"What are you doing, Tommy stopped him midway.","step 1: in the premise, there are two actions ...",0,1,False,"[{'role': 'system', 'content': '[INST] You are...",0,948
203,211,"On the airport side of town, there are several...",Other luxury hotels are on the opposite side o...,"step 1: in the premise, it is mentioned that t...",1,1,True,"[{'role': 'system', 'content': '[INST] You are...",0,1134
204,212,i think the rate of processing is just about u...,The rate of processing just reached the rate o...,"step 1. in the premise, the speaker expresses ...",0,1,False,"[{'role': 'system', 'content': '[INST] You are...",0,1076
205,213,Interior of the restaurant chain: Planet Holly...,The inside of a restaurant.,step 1: the premise describes the interior of ...,1,1,True,"[{'role': 'system', 'content': '[INST] You are...",0,1088
206,214,i work at the Lewisville site,I don't work at the Lewisville site.,"step 1: in the premise, 'i work at the lewisvi...",0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,623
...,...,...,...,...,...,...,...,...,...,...
15101,15888,yeah that's that's a little bit too much so,"No, that's not enough.","step 1: in the premise, the speaker expresses ...",0,0,True,"[{'role': 'system', 'content': '[INST] You are...",0,805
15114,15901,Hugging the slopes of Mont Blanc (Monte Bianco...,Mont Blanc's Courmayeur was established in the...,"step 1: in the premise, it is mentioned that c...",1,1,True,"[{'role': 'system', 'content': '[INST] You are...",0,555
15115,15902,The Harem housed the private quarters of the s...,The sultan had 1000 concubines and 10 wives.,step 1: the premise establishes that the harem...,0,1,False,"[{'role': 'system', 'content': '[INST] You are...",0,1646
15119,15906,"That first glimpse of the towering, steepled a...",The abbey is the region's most photographed bu...,"step 1: in the premise, the abbey is described...",1,1,True,"[{'role': 'system', 'content': '[INST] You are...",0,685


In [33]:
def _score_row(row):
    """Pass one row of `train` to generate_score and return whatever it returns."""
    return generate_score(
        id=row['id'],
        sys=system_prompt,
        premise=row['premise'],
        hypothesis=row['hypothesis'],
        thoughts=row['thoughts'],
        predicted_label=row['prediction'],
        true_label=row['true_label'],
        # A new client is built for every row, as before.
        llm=Mistral(os.getenv('MISTRAL_API_KEY')),
        model_name='open-mixtral-8x7b',
        json_format=json_schema,
        json_filepath=f'{root}data/json_data/score',
    )


# Score every remaining row of `train`, one call per row.
train.apply(_score_row, axis=1)

210
211
212
213
****** RE-SCORE: ID 213 ******

213
214
215
216
217
Error: 429 - {"message":"Inactive subscription or usage limit reached"}
****** UKNOWN ERROR: ID-217 ******
 
262
Error: 429 - {"message":"Inactive subscription or usage limit reached"}
****** UKNOWN ERROR: ID-262 ******
 
263
Error: 429 - {"message":"Inactive subscription or usage limit reached"}
****** UKNOWN ERROR: ID-263 ******
 
487
Error: 429 - {"message":"Inactive subscription or usage limit reached"}
****** UKNOWN ERROR: ID-487 ******
 
488
Error: 429 - {"message":"Inactive subscription or usage limit reached"}
****** UKNOWN ERROR: ID-488 ******
 
489
Error: 429 - {"message":"Inactive subscription or usage limit reached"}
****** UKNOWN ERROR: ID-489 ******
 
490
Error: 429 - {"message":"Inactive subscription or usage limit reached"}
****** UKNOWN ERROR: ID-490 ******
 
491
Error: 429 - {"message":"Inactive subscription or usage limit reached"}
****** UKNOWN ERROR: ID-491 ******
 
492
Error: 429 - {"message":"Ina


KeyboardInterrupt



In [2]:
# Scratch list of sample values. Deliberately NOT named `list`: rebinding that
# name hides the builtin for the rest of the kernel session, so later
# `list(...)` calls and annotations such as `list[str]` would fail.
numbers = [1, 2, 3, 4, 5]

[1, 2, 3]