### **Import libraries**
`lab_v2` is a library containing methods that are often used throughout our GPT experiments.  
This library can be found at the following URL: _(link missing — TODO: add the repository URL)_

In [129]:
from typing import List
from collections import Counter
from enum import Enum

import pandas
import numpy
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import euclidean

from lab_v2 import gpt_eval
from lab_v2 import io
from lab_v2 import stats
from lab_v2.xlsx_creation import XlsxWorkbook

### **Constants**
Specify a few constants to make the notebook easier to configure in the future.

In [130]:
class Dataset(Enum):
    """Benchmarks this notebook knows how to analyse; select one via ``DATASET``."""
    LAST_LETTERS = 0
    CSQA = 1
    DRAW = 2

# Suffix identifying the response run; it appears in the response directory
# name and in the output file names (presumably "temperature 0.7" — confirm).
APPEND = 'T0.7'

# The only line to edit when switching benchmarks: every path below is derived
# from it, so the configuration cannot end up half-switched.
DATASET = Dataset.CSQA

# Per-dataset file naming.
#   slug         -> directory / file stem used under data/responses and ml/
#   question_set -> file name inside data/question-set (DRAW is .json, not .jsonl)
#   workbook     -> prefix of the generated .xlsx report
DATASET_FILES = {
    Dataset.LAST_LETTERS: {
        'slug': 'last_letters',
        'question_set': 'last_letters.jsonl',
        'workbook': 'Last-Letters',
    },
    Dataset.CSQA: {
        'slug': 'csqa',
        'question_set': 'csqa.jsonl',
        'workbook': 'CSQA',
    },
    Dataset.DRAW: {
        'slug': 'draw',
        'question_set': 'draw.json',
        'workbook': 'Draw',
    },
}

_files = DATASET_FILES[DATASET]
QUESTION_SET_FILE_PATH = f"data/question-set/{_files['question_set']}"
RESPONSE_FILE_PATH = f"data/responses/base-{APPEND}/{_files['slug']}/sample_0.jsonl"
MID_FILE_PATH = f"ml/{_files['slug']}-{APPEND}.jsonl"
OUT_FILE_PATH = f"out/{_files['workbook']}-{APPEND}.xlsx"

# Maximum number of sampled responses per question used by the analysis.
NUM_SAMPLES = 20

# Settings that are identical for every dataset: the response column used as
# join key, the column holding the list of sampled completions, and how to pull
# the generated text out of one completion entry.
RESPONSE_INDEX_NAME = 'question_id'
RESPONSE_SAMPLE_NAME = 'choices'
EXTRACT_RESPONSE = lambda response: response['message']['content']

# Dataset-specific settings: question-set column names, the answer parser from
# lab_v2.gpt_eval, and the predicate comparing an extracted answer with the
# reference answer. Exactly one branch runs; an unknown dataset fails here
# instead of surfacing later as a NameError on one of these constants.
if DATASET == Dataset.LAST_LETTERS:
    QUESTION_SET_INDEX_NAME = 'iIndex'
    QUESTION_SET_ANSWER_NAME = 'answer'
    ANSWER_EXTRACTION = gpt_eval.extract_last_letters
    COMPARE_ANSWERS = lambda x, y: x == y
elif DATASET == Dataset.CSQA:
    QUESTION_SET_INDEX_NAME = 'id'
    QUESTION_SET_ANSWER_NAME = 'answerKey'
    ANSWER_EXTRACTION = gpt_eval.extract_csqa
    # Case-insensitive comparison of the extracted and reference answer keys.
    COMPARE_ANSWERS = lambda x, y: x.lower() == y.lower()
elif DATASET == Dataset.DRAW:
    QUESTION_SET_INDEX_NAME = 'iIndex'
    QUESTION_SET_ANSWER_NAME = 'lSolutions'
    ANSWER_EXTRACTION = gpt_eval.extract_draw
    # Correct when every extracted value is among the reference solutions
    # (assumes the extractor returns a set — TODO confirm in lab_v2.gpt_eval).
    COMPARE_ANSWERS = lambda response, answer: response.issubset(set(answer))
else:
    raise ValueError(f'Unsupported dataset: {DATASET!r}')


# Name of the sentence-transformers model used to embed the responses.
MODEL = 'all-MiniLM-L6-v2'
# Names of the derived per-question columns added to the joined frame.
RESPONSE_ANSWERS_NAME = 'answers'
MAJORITY_ANSWER_NAME = 'majority_answer'
MAJORITY_ANSWER_DISTANCE = 'majority_distance'
MAJORITY_ANSWER_DISTANCE_SQUARED = 'majority_distance_squared'
MAJORITY_CORRECT_NAME = 'majority_correct'

# NOTE(review): these three names are not referenced in the visible part of the
# notebook — confirm whether they are still needed.
MAJORITY_ANSWER_LIST_NAME = 'majority_answer_list'
MAJORITY_ANSWER_LIST_DISTANCE = 'majority_distance_list'
MAJORITY_ANSWER_LIST_CORRECT = 'majority_correct_list'

# Column names for the answer-dispersion statistics and the response embeddings.
ENTROPY_COLUMN = 'shannon_entropy'
GINI_IMPURITY_COLUMN = 'gini_impurity'
EMBEDDING_NAME = 'embedding'

In [131]:
# Load the sentence-embedding model once; embed_answers() below reuses it.
model = SentenceTransformer(MODEL)

### **File I/O**
Read the question set and the sampled responses into pandas DataFrames via `lab_v2.io`, then join them on the question id.

In [132]:
# Read the question set and keep only the two columns the analysis needs:
# the question id (join key) and the reference answer.
_question_set_columns = [QUESTION_SET_INDEX_NAME, QUESTION_SET_ANSWER_NAME]
question_set = io.read_file(QUESTION_SET_FILE_PATH)[_question_set_columns]
display(len(question_set))
question_set.head(1)

9741

Unnamed: 0,id,answerKey
0,075e483d21c29a511267ef62bedc0461,A


In [133]:
# Load the raw model responses: one row per question, with the sampled
# completions stored in the RESPONSE_SAMPLE_NAME column.
responses = io.read_file(RESPONSE_FILE_PATH)
# The row count should match the question set loaded above.
display(len(responses))
responses.head(1)

9741

Unnamed: 0,question_id,id,object,created,model,usage,choices,question,n,temperature
0,ac0351d8649fb60af40c5638061a2e21,chatcmpl-7K6z3XYWOvPh7edXDkJhKGmQwlKek,chat.completion,1685027761,gpt-3.5-turbo,"{'prompt_tokens': 75, 'completion_tokens': 140...","[{'message': {'role': 'assistant', 'content': ...","Answer A, B, C, D or E. At the end, say 'the a...",20,0.7


In [134]:
# Attach the reference answer to every response row by joining both frames on
# the question id (DataFrame.join defaults to a left join, so every response
# row is kept).
_responses_by_question = responses.set_index(RESPONSE_INDEX_NAME)
_answers_by_question = question_set.set_index(QUESTION_SET_INDEX_NAME)
joined = _responses_by_question.join(_answers_by_question)
display(len(joined))
joined.head(1)

9741

Unnamed: 0_level_0,id,object,created,model,usage,choices,question,n,temperature,answerKey
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ac0351d8649fb60af40c5638061a2e21,chatcmpl-7K6z3XYWOvPh7edXDkJhKGmQwlKek,chat.completion,1685027761,gpt-3.5-turbo,"{'prompt_tokens': 75, 'completion_tokens': 140...","[{'message': {'role': 'assistant', 'content': ...","Answer A, B, C, D or E. At the end, say 'the a...",20,0.7,D


In [135]:
def extract_answers(row, column):
    """Parse the answer out of each sampled response of one question.

    Args:
        row: One row of the joined frame.
        column: Name of the column holding the list of sampled completions.

    Returns:
        The same row with ``RESPONSE_ANSWERS_NAME`` set to the list of
        extracted answers, one per sample, capped at ``NUM_SAMPLES`` entries.
    """
    # Slice to exactly NUM_SAMPLES entries. The previous `index > NUM_SAMPLES`
    # guard only stopped after NUM_SAMPLES + 1 samples had been collected.
    # (Assumes the column holds a list, as the rendered output shows.)
    samples = row[column][:NUM_SAMPLES]
    row[RESPONSE_ANSWERS_NAME] = [
        ANSWER_EXTRACTION(EXTRACT_RESPONSE(sample)) for sample in samples
    ]
    return row

joined = joined.apply(lambda row: extract_answers(row, RESPONSE_SAMPLE_NAME), axis=1)
joined.head(1)

Unnamed: 0_level_0,id,object,created,model,usage,choices,question,n,temperature,answerKey,answers
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ac0351d8649fb60af40c5638061a2e21,chatcmpl-7K6z3XYWOvPh7edXDkJhKGmQwlKek,chat.completion,1685027761,gpt-3.5-turbo,"{'prompt_tokens': 75, 'completion_tokens': 140...","[{'message': {'role': 'assistant', 'content': ...","Answer A, B, C, D or E. At the end, say 'the a...",20,0.7,D,"[d, d, d, d, d, d, d, d, d, d, d, d, d, d, d, ..."


In [136]:
def embed_answers(row):
    """Encode a list of response texts with the module-level ``model``.

    Returns whatever ``model.encode`` returns for the list (the rendered output
    shows one vector per text).
    """
    return model.encode(row)


def _sampled_texts(row):
    """Return the texts of the responses that were actually scored.

    Only as many responses as there are extracted answers are kept, so the
    embeddings (and the centroid and distances derived from them) line up
    one-to-one with ``RESPONSE_ANSWERS_NAME``. Embedding every response would
    let samples beyond the cap shift the centroid.
    """
    used = len(row[RESPONSE_ANSWERS_NAME])
    return [EXTRACT_RESPONSE(r) for r in row[RESPONSE_SAMPLE_NAME][:used]]


joined['temp'] = joined.apply(_sampled_texts, axis=1)
joined[EMBEDDING_NAME] = joined['temp'].apply(embed_answers)
joined.head(1)

Unnamed: 0_level_0,id,object,created,model,usage,choices,question,n,temperature,answerKey,answers,temp,embedding
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ac0351d8649fb60af40c5638061a2e21,chatcmpl-7K6z3XYWOvPh7edXDkJhKGmQwlKek,chat.completion,1685027761,gpt-3.5-turbo,"{'prompt_tokens': 75, 'completion_tokens': 140...","[{'message': {'role': 'assistant', 'content': ...","Answer A, B, C, D or E. At the end, say 'the a...",20,0.7,D,"[d, d, d, d, d, d, d, d, d, d, d, d, d, d, d, ...","[The answer is D) beach., The answer is D) bea...","[[0.06594438, 0.038977712, 0.008883611, 0.0056..."


In [137]:
def get_average(row):
    """Return the centroid of a stack of vectors.

    The mean is taken over axis 0 with the reduced axis kept, so an ``(n, d)``
    input yields a ``(1, d)`` array.
    """
    vectors = numpy.asanyarray(row)
    return vectors.mean(axis=0, keepdims=True)

# Centroid of each question's response embeddings, stored as a (1, d) array.
_embeddings = joined[EMBEDDING_NAME]
joined['average'] = _embeddings.apply(get_average)
joined.head(1)

Unnamed: 0_level_0,id,object,created,model,usage,choices,question,n,temperature,answerKey,answers,temp,embedding,average
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ac0351d8649fb60af40c5638061a2e21,chatcmpl-7K6z3XYWOvPh7edXDkJhKGmQwlKek,chat.completion,1685027761,gpt-3.5-turbo,"{'prompt_tokens': 75, 'completion_tokens': 140...","[{'message': {'role': 'assistant', 'content': ...","Answer A, B, C, D or E. At the end, say 'the a...",20,0.7,D,"[d, d, d, d, d, d, d, d, d, d, d, d, d, d, d, ...","[The answer is D) beach., The answer is D) bea...","[[0.06594438, 0.038977712, 0.008883611, 0.0056...","[[0.06594439, 0.038977724, 0.008883611, 0.0056..."


In [138]:
def get_distance(row):
    """Euclidean distance of every response embedding to the row's centroid.

    Reads the ``EMBEDDING_NAME`` and ``'average'`` columns of the row and
    returns a list with one distance per embedding, in embedding order.
    """
    vectors = row[EMBEDDING_NAME]
    # 'average' was stored with its reduced axis kept; [0] unwraps the vector.
    centroid = row['average'][0]
    distances = []
    for vector in vectors:
        distances.append(euclidean(vector, centroid))
    return distances
joined['distance'] = joined.apply(get_distance, axis=1)

In [139]:
def get_majority(row):
    """Record the majority answer of a question and one centroid distance.

    Writes ``MAJORITY_ANSWER_NAME`` (the most common extracted answer) and
    ``MAJORITY_ANSWER_DISTANCE`` (the centroid distance of the first sample
    that gave that answer) into the row and returns it.
    """
    votes = row[RESPONSE_ANSWERS_NAME]
    tally = Counter(votes)
    distances = row['distance']
    winner = tally.most_common(1)[0][0]
    row[MAJORITY_ANSWER_NAME] = winner
    # NOTE(review): only the first matching sample's distance is used, not an
    # aggregate over all samples that voted for the winner — confirm intended.
    first_match = next(
        (position for position, vote in enumerate(votes) if vote == winner),
        None,
    )
    if first_match is not None:
        row[MAJORITY_ANSWER_DISTANCE] = distances[first_match]
    return row


def _majority_is_correct(row):
    """Compare the row's majority answer with its reference answer."""
    return COMPARE_ANSWERS(row[MAJORITY_ANSWER_NAME], row[QUESTION_SET_ANSWER_NAME])


joined = joined.apply(get_majority, axis=1)
joined[MAJORITY_CORRECT_NAME] = joined.apply(_majority_is_correct, axis=1)
joined.head(1)

Unnamed: 0_level_0,id,object,created,model,usage,choices,question,n,temperature,answerKey,answers,temp,embedding,average,distance,majority_answer,majority_distance,majority_correct
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ac0351d8649fb60af40c5638061a2e21,chatcmpl-7K6z3XYWOvPh7edXDkJhKGmQwlKek,chat.completion,1685027761,gpt-3.5-turbo,"{'prompt_tokens': 75, 'completion_tokens': 140...","[{'message': {'role': 'assistant', 'content': ...","Answer A, B, C, D or E. At the end, say 'the a...",20,0.7,D,"[d, d, d, d, d, d, d, d, d, d, d, d, d, d, d, ...","[The answer is D) beach., The answer is D) bea...","[[0.06594438, 0.038977712, 0.008883611, 0.0056...","[[0.06594439, 0.038977724, 0.008883611, 0.0056...","[1.377946006186903e-07, 1.377946006186903e-07,...",d,1.377946e-07,True


In [140]:
# Dispersion of the extracted answers per question.
_answers = joined[RESPONSE_ANSWERS_NAME]
joined[ENTROPY_COLUMN] = _answers.apply(stats.shannon_entropy)
joined[GINI_IMPURITY_COLUMN] = _answers.apply(stats.gini_impurity)

# Squared centroid distance of the majority answer.
_majority_distance = joined[MAJORITY_ANSWER_DISTANCE]
joined[MAJORITY_ANSWER_DISTANCE_SQUARED] = _majority_distance * _majority_distance

# Persist the per-question metrics as JSON lines for downstream use.
_exported_columns = [
    MAJORITY_ANSWER_DISTANCE,
    MAJORITY_ANSWER_DISTANCE_SQUARED,
    ENTROPY_COLUMN,
    GINI_IMPURITY_COLUMN,
    MAJORITY_CORRECT_NAME,
]
joined[_exported_columns].to_json(MID_FILE_PATH, lines=True, orient='records')
joined.head(1)

Unnamed: 0_level_0,id,object,created,model,usage,choices,question,n,temperature,answerKey,...,temp,embedding,average,distance,majority_answer,majority_distance,majority_correct,shannon_entropy,gini_impurity,majority_distance_squared
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ac0351d8649fb60af40c5638061a2e21,chatcmpl-7K6z3XYWOvPh7edXDkJhKGmQwlKek,chat.completion,1685027761,gpt-3.5-turbo,"{'prompt_tokens': 75, 'completion_tokens': 140...","[{'message': {'role': 'assistant', 'content': ...","Answer A, B, C, D or E. At the end, say 'the a...",20,0.7,D,...,"[The answer is D) beach., The answer is D) bea...","[[0.06594438, 0.038977712, 0.008883611, 0.0056...","[[0.06594439, 0.038977724, 0.008883611, 0.0056...","[1.377946006186903e-07, 1.377946006186903e-07,...",d,1.377946e-07,True,0.0,0.0,1.898735e-14


In [141]:
def generate_chart(dataframe: pandas.DataFrame, x_axis: str, y_axis: str, filter):
    """Tabulate the failure rate of the rows selected at each threshold.

    Every distinct value of ``dataframe[x_axis]`` is used once as a threshold,
    in ascending order.

    Args:
        dataframe: Frame holding the metric column and the correctness column.
        x_axis: Name of the metric column the thresholds are taken from.
        y_axis: Name of a boolean column; ``False`` counts as a failure.
        filter: Callable ``(column, threshold) -> boolean mask`` choosing the
            rows that belong to a threshold (e.g. ``column >= threshold``).

    Returns:
        A DataFrame with one row per threshold that selects at least one row,
        with columns ``x_axis``, ``support`` (rows selected), ``amount_wrong``
        (selected rows where ``y_axis`` is False) and
        ``probability_of_failure`` (their ratio).
    """
    chart_rows = []
    thresholds = dataframe[x_axis].sort_values().unique().tolist()
    for threshold in thresholds:
        selected = dataframe[filter(dataframe[x_axis], threshold)]
        support = len(selected)
        if support == 0:
            # Nothing matches at this threshold (e.g. a strict "<" at the
            # minimum, or a NaN threshold). Skip it instead of aborting the
            # loop, so later thresholds are still charted and the division
            # below never sees a zero denominator.
            continue
        amount_wrong = len(selected[~selected[y_axis]])
        chart_rows.append({
            x_axis: threshold,
            'support': support,
            'amount_wrong': amount_wrong,
            'probability_of_failure': amount_wrong / support,
        })

    return pandas.DataFrame(chart_rows)

def generate_geq(dataframe: pandas.DataFrame, x_axis: str, y_axis: str):
    """Failure-rate chart over the rows whose ``x_axis`` value is >= each threshold."""
    def at_least(column, threshold):
        return column >= threshold

    return generate_chart(dataframe, x_axis, y_axis, at_least)

def generate_leq(dataframe: pandas.DataFrame, x_axis: str, y_axis: str):
    """Failure-rate chart over the rows whose ``x_axis`` value is <= each threshold."""
    def at_most(column, threshold):
        return column <= threshold

    return generate_chart(dataframe, x_axis, y_axis, at_most)

In [142]:
# ">= threshold" failure-rate tables, one per metric, each measured against
# majority-vote correctness.
entropy_geq, gini_impurity_geq, distance_geq, distance_squared_geq = (
    generate_geq(joined, metric_column, MAJORITY_CORRECT_NAME)
    for metric_column in (
        ENTROPY_COLUMN,
        GINI_IMPURITY_COLUMN,
        MAJORITY_ANSWER_DISTANCE,
        MAJORITY_ANSWER_DISTANCE_SQUARED,
    )
)

display(entropy_geq.head(1), gini_impurity_geq.head(1))

Unnamed: 0,shannon_entropy,support,amount_wrong,probability_of_failure
0,0.0,9741,4300,0.441433


Unnamed: 0,gini_impurity,support,amount_wrong,probability_of_failure
0,0.0,9741,4300,0.441433


In [143]:
# "<= threshold" failure-rate tables, one per metric, each measured against
# majority-vote correctness.
entropy_leq, gini_impurity_leq, distance_leq, distance_squared_leq = (
    generate_leq(joined, metric_column, MAJORITY_CORRECT_NAME)
    for metric_column in (
        ENTROPY_COLUMN,
        GINI_IMPURITY_COLUMN,
        MAJORITY_ANSWER_DISTANCE,
        MAJORITY_ANSWER_DISTANCE_SQUARED,
    )
)
display(entropy_leq.head(1), gini_impurity_leq.head(1))

Unnamed: 0,shannon_entropy,support,amount_wrong,probability_of_failure
0,0.0,6832,2388,0.349532


Unnamed: 0,gini_impurity,support,amount_wrong,probability_of_failure
0,0.0,6832,2388,0.349532


In [144]:
def add_failure_chart_sheet(workbook, sheet_name, chart_data, chart_title, x_column):
    """Add one sheet holding ``chart_data`` plus a scatter chart of its failure rate.

    The chart plots ``x_column`` against ``'probability_of_failure'`` and is
    anchored at cell H1. Returns the created worksheet.
    """
    worksheet = workbook.add_sheet(sheet_name, chart_data)
    worksheet.add_scatter_chart(chart_title, x_column, 'probability_of_failure', 'H1')
    return worksheet


workbook = XlsxWorkbook(OUT_FILE_PATH)

# Each sheet is paired with the table whose GEQ/LEQ direction matches its name
# and chart title. Previously the Gini and distance sheets received the opposite
# table and `distance_squared_leq` was never written. Sheet names and order are
# unchanged; the squared-distance chart titles now say "Squared".
entropy_geq_worksheet = add_failure_chart_sheet(
    workbook, 'Entropy_GEQ', entropy_geq,
    'Entropy vs Probability of failure (GEQ)', ENTROPY_COLUMN)

entropy_leq_worksheet = add_failure_chart_sheet(
    workbook, 'Entropy_LEQ', entropy_leq,
    'Entropy vs Probability of failure (LEQ)', ENTROPY_COLUMN)

gini_impurity_leq_worksheet = add_failure_chart_sheet(
    workbook, 'Gini_Impurity_LEQ', gini_impurity_leq,
    'Gini Impurity vs Probability of failure (LEQ)', GINI_IMPURITY_COLUMN)

gini_impurity_geq_worksheet = add_failure_chart_sheet(
    workbook, 'Gini_Impurity_GEQ', gini_impurity_geq,
    'Gini Impurity vs Probability of failure (GEQ)', GINI_IMPURITY_COLUMN)

distance_leq_worksheet = add_failure_chart_sheet(
    workbook, 'Avg_Distance_LEQ', distance_leq,
    'Distance From Centroid vs Probability of failure (LEQ)', MAJORITY_ANSWER_DISTANCE)

distance_geq_worksheet = add_failure_chart_sheet(
    workbook, 'Avg_Distance_GEQ', distance_geq,
    'Distance From Centroid vs Probability of failure (GEQ)', MAJORITY_ANSWER_DISTANCE)

distance_squared_leq_worksheet = add_failure_chart_sheet(
    workbook, 'Avg_Distance_Squared_LEQ', distance_squared_leq,
    'Squared Distance From Centroid vs Probability of failure (LEQ)', MAJORITY_ANSWER_DISTANCE_SQUARED)

distance_squared_geq_worksheet = add_failure_chart_sheet(
    workbook, 'Avg_Distance_Squared_GEQ', distance_squared_geq,
    'Squared Distance From Centroid vs Probability of failure (GEQ)', MAJORITY_ANSWER_DISTANCE_SQUARED)

workbook.close()