In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install jsonlines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
!pip install flask

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from evaluate import load
from flask import request, jsonify
import jsonlines
import torch

In [8]:
tokenizer = T5Tokenizer.from_pretrained("laschulz/t5-large")
model = T5ForConditionalGeneration.from_pretrained("laschulz/t5-large")

In [9]:
def extract_jsonl(file_path):
    data = []
    with jsonlines.open(file_path) as reader:
        for item in reader:
            data.append(item)
    return data

In [10]:
database = extract_jsonl("./test_socratic.jsonl")

In [11]:
def parser_expectedAnswer(expected_answer):
    temp = str(expected_answer).split('\n') #str might be redundant
    temp = [line.split('**')[0] for line in temp]
    if '###' in temp[-1]:
        temp.pop()
    return temp

In [12]:
def parser_output(model_output):
    parsed_output = model_output.split('?')
    #parsed_output = [entry.strip() + '?' if entry.strip() != "" else "" for entry in model_output.split('?')]
    for i in range(len(parsed_output)):
        entry = parsed_output[i]
        stripped = entry.strip()
        if (stripped != ""):
            parsed_output[i] = stripped + '?'
        else:
            parsed_output.pop(i)
    return parsed_output

In [13]:
bertscore = load("bertscore")
def bert_score(references, predictions): 
    results = bertscore.compute(predictions=predictions, references=references, lang="en")
    return results['f1']

In [17]:
def process_database(database):
    all_middle = 0
    all_lower = 0
    some_lower = 0
    last_lower = 0
    all_upper = 0
    some_upper = 0
    first_upper = 0
    both = 0

    for item in database:
        input_text = item["question"]
        expected_answer = item["answer"]
        expected_answer = parser_expectedAnswer(expected_answer)
        inputs = tokenizer([input_text], return_tensors="pt", padding=True)

        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the device to GPU
            device = torch.device("cuda")
        else:
            # If GPU is not available, fall back to CPU
            device = torch.device("cpu")

        # Move the model and inputs to the GPU
        model.to(device)
        inputs = inputs.to(device)

        output_sequence = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            do_sample=False,
        )
        decoded_output = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)

        parsed_output = parser_output(decoded_output[0])

        if (len(parsed_output) == len(expected_answer)):
            bertScore = bert_score(expected_answer,parsed_output)

            if all(0.8925 < score < 0.9925 for score in bertScore):
                all_middle += 1
            if all(score < 0.8925 for score in bertScore):
                all_lower += 1
            elif any(score < 0.8925 for score in bertScore):
                some_lower += 1
            if all(score > 0.9925 for score in bertScore):
                all_upper += 1
            elif any(score > 0.9925 for score in bertScore):
                some_upper += 1
            if bertScore[(len(bertScore)-1)] < 0.8925:
                last_lower += 1
            if bertScore[(len(bertScore)-1)] > 0.9925:
                first_upper += 1

            if any(score > 0.9925 for score in bertScore) and any(score < 0.8925 for score in bertScore):
                both += 1

            
    return (all_middle, all_lower, some_lower, last_lower, all_upper, some_upper, first_upper, both)
  

all_middle, all_lower, some_lower, last_lower, all_upper, some_upper, first_upper, both = process_database(database)

KeyboardInterrupt: ignored

In [None]:
print(all_middle, all_lower, some_lower, last_lower, all_upper, some_upper, first_upper, both)

191 2 69 66 33 119 81 19


In [20]:
def process_database(database):
    results = []
    raw_results = []
    length_is_not_the_same = 0
    length_is_same = 0
    over_generation = 0
    question_missing = 0
    for item in database:
        input_text = item["question"]
        expected_answer = item["answer"]
        expected_answer = parser_expectedAnswer(expected_answer)
        inputs = tokenizer([input_text], return_tensors="pt", padding=True)

        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the device to GPU
            device = torch.device("cuda")
        else:
            # If GPU is not available, fall back to CPU
            device = torch.device("cpu")

        # Move the model and inputs to the GPU
        model.to(device)
        inputs = inputs.to(device)

        output_sequence = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            do_sample=False,
        )
        decoded_output = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)

        parsed_output = parser_output(decoded_output[0])

        if (len(parsed_output) != len(expected_answer)):
            length_is_not_the_same += 1
            if (len(parsed_output) > len(expected_answer)):
                over_generation += 1
            else:
                question_missing += 1
        else:
            length_is_same += 1
            print("same: ", length_is_same)
            bertScore = bert_score(expected_answer,parsed_output)
            result_dict = {
                "bert_score": bertScore,
                "model_output": parsed_output,
                "expected_answer": expected_answer
            }
            for i in range(len(bertScore)):
                score = bertScore[i]
                if (bertScore[i] > 0.9925 ):
                  break;

                if (bertScore[i] < 0.8925):
                  break
                if (i == len(bertScore)-1):
                    raw_results_dict = { 
                        "question": item["question"],
                        "answer": item["answer"]
                    }
                    raw_results.append(raw_results_dict)
                    print("parsed_output: ", parsed_output)
                    print("expected_outut: ", expected_answer)

            results.append(result_dict)
            
    return (results, raw_results, length_is_not_the_same, over_generation, question_missing)

In [None]:
def process_database(database):
    raw_results_good = []
    raw_results_bad = []
    length_is_not_the_same = 0
    length_is_same = 0
    for item in database:
        input_text = item["question"]
        expected_answer = item["answer"]
        expected_answer = parser_expectedAnswer(expected_answer)
        inputs = tokenizer([input_text], return_tensors="pt", padding=True)

        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the device to GPU
            device = torch.device("cuda")
        else:
            # If GPU is not available, fall back to CPU
            device = torch.device("cpu")

        # Move the model and inputs to the GPU
        model.to(device)
        inputs = inputs.to(device)

        output_sequence = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            do_sample=False,
        )
        decoded_output = tokenizer.batch_decode(output_sequence, skip_special_tokens=True)

        parsed_output = parser_output(decoded_output[0])

        if (len(parsed_output) != len(expected_answer)):
            length_is_not_the_same += 1
 
        else:
            length_is_same += 1
            print("same: ", length_is_same)
            bertScore = bert_score(expected_answer,parsed_output)
            result_dict = {
                "bert_score": bertScore,
                "model_output": parsed_output,
                "expected_answer": expected_answer
            }
            for i in range(len(bertScore)):
                score = bertScore[i]
                #looking at cases where there is a correct answer
                if (bertScore[i] > 0.995):
                  raw_results_dict = { 
                        "question": item["question"],
                        "answer": item["answer"]
                  }
                  raw_results_good.append(raw_results_dict)
                  break;
            
            for i in range(len(bertScore)):
                score = bertScore[i]
                #looking at cases where there is a wrong answer
                if (bertScore[i] < 0.90):
                  raw_results_dict = { 
                        "question": item["question"],
                        "answer": item["answer"]
                  }
                  raw_results_bad.append(raw_results_dict)
                  break;
            
    return (raw_results_good, raw_results_bad, length_is_not_the_same)

In [None]:
#adjust this line to define which process_database function is being used
results, raw_results, length_is_not_the_same, over_generation, question_missing = process_database(database)

same:  1
parsed_output:  ['How many eggs does Janet eat every day?', 'How many eggs does Janet have left?']
expected_outut:  ['How many eggs does Janet sell? ', "How much does Janet make at the farmers' market? "]
same:  2
parsed_output:  ['How many white fibers does the robe need?', 'How many bolts in total does it?']
expected_outut:  ['How many bolts of white fiber does it take? ', 'How many bolts in total does it take? ']
same:  3
parsed_output:  ['How many meters does James run in each sprint?', 'How many meters does James run in total?']
expected_outut:  ['How many sprints does James run in a week? ', 'How many meters does James run in a week? ']
same:  4
same:  5
same:  6
same:  7
parsed_output:  ['How much did Mishka spend on shorts?', 'How much did Mishka spend on?']
expected_outut:  ['How many dollars did Mishka spend on all the clothing items? ', 'How many dollars did Mishka spend on all the clothing items? ']
same:  8
same:  9
same:  10
parsed_output:  ['How much do the two 

In [None]:
print(length_is_not_the_same)

In [None]:
import jsonlines

def save_results_to_jsonl(results, filename):
    with jsonlines.open(filename, 'w') as writer:
        for result in results:
            writer.write(result)

In [None]:
save_results_to_jsonl(results_analysis, 'results.jsonl')

In [None]:
save_results_to_jsonl(raw_results_good, 'filtered_test_good.jsonl')

In [None]:
save_results_to_jsonl(raw_results_bad, 'filtered_test_bad.jsonl')

In [None]:
save_results_to_jsonl(raw_results[0:50], 'filtered_test_1-50.jsonl')

In [None]:
print(len(raw_results_good))
print(len(raw_results_bad))