In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to the project packages are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import sys
sys.path.append(".bin")

# Root directory for the artifacts this notebook generates.
DATA_PATH = "data"
# Directory that receives the serialized contest problem-set chunks
# (note that the value ends with a trailing slash).
CODE_CONTEST_DATA_PATH = f"{DATA_PATH}/code_contest_data/"

# Create both directories up front so later cells can write into them.
for _directory in (DATA_PATH, CODE_CONTEST_DATA_PATH):
    os.makedirs(_directory, exist_ok=True)

# Creating Contest Problem Set

In [None]:
from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import ContestProblemD, ContestProblemSetD

In [None]:
import dask
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd

# Convert every parquet partition with ContestProblemSetD.compressed_from_df and
# materialize the results. The name `df` is kept because the next cell iterates
# over it, although each element is written out as raw bytes rather than used
# as a DataFrame.
df = (
    dd.read_parquet("code_contests/data/*.parquet")
    .map_partitions(
        lambda partition: ContestProblemSetD.compressed_from_df(partition),
        meta={},
    )
    .compute()
)

In [None]:
# Persist each compressed problem set as its own chunk file inside
# CODE_CONTEST_DATA_PATH (the directory later cells point a
# CompressedDomainFileDAO at).
for i, problem_set in enumerate(df):
    # CODE_CONTEST_DATA_PATH already ends with "/", so join the file name with
    # os.path.join instead of inserting another "/" (which would yield "//").
    f_name = os.path.join(CODE_CONTEST_DATA_PATH, f"chunk_{i}.bin")
    with open(f_name, "wb") as f:
        f.write(problem_set)

### Create Filtered Set

# Initial Test Evaluation

In [None]:
from typing import List

from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import ContestProblemSetD, PatchedSolutionD

# Load every problem set stored under CODE_CONTEST_DATA_PATH into memory.
reader = CompressedDomainFileDAO(CODE_CONTEST_DATA_PATH, ContestProblemSetD)
problem_sets: List[ContestProblemSetD] = list(reader.read())

In [None]:
from domain.problems_d import PatchedSolutionD, TestResultSetD
from domain.domain_dao import CompressedDomainFileDAO

# Directory used for the results of this baseline evaluation run.
BASE_EVAL_RESULTS_PATH = "data/base_eval_results"

# Smoke-test inputs: only the first problem set is used.
sample_problems = [
    problem
    for problem_set in problem_sets[:1]
    for problem in problem_set.problems
]

# Wrap the first entry of each problem's `solutions` in a PatchedSolutionD with
# placeholder prompt/model metadata, so it can go through the same evaluator
# as model-generated patches.
mocked_patched_solutions = [
    PatchedSolutionD(
        problem_id=problem.proto_id,
        solution_id=solution.proto_id,
        patched_solution=solution.solution,
        prompt_id="prompt_id",
        model="MODEL_TYPE_GPT_4_TURBO",
        patched_response={},
    )
    for problem in sample_problems
    for solution in problem.solutions[:1]
]

# One public test per problem keeps the run small.
mocked_tests = [
    test
    for problem in sample_problems
    for test in problem.public_tests[:1]
]

print(f"Running {len(mocked_patched_solutions)} patched solutions on {len(mocked_tests)} tests")


In [None]:
import logging
logging.basicConfig(level=logging.INFO)

from code_patching.solution_evaluator import eval_patched_solutions

# Evaluate the mocked solutions against the mocked tests. The DAO is handed to
# the evaluator as its `domain_writer` (presumably persisting results under
# BASE_EVAL_RESULTS_PATH).
# NOTE(review): this call passes plain lists for `problem_tests` and
# `patched_solutions`, while the patched-evaluation cell further down passes
# dicts keyed by problem id — confirm the evaluator accepts both shapes.
test_result_dao = CompressedDomainFileDAO(BASE_EVAL_RESULTS_PATH, TestResultSetD)
base_eval_run = eval_patched_solutions(
    problem_tests=mocked_tests,
    patched_solutions=mocked_patched_solutions,
    domain_writer=test_result_dao,
    batch_size=5000,
)
# Collect everything the evaluator yields into a list for later inspection.
test_result_sets = list(base_eval_run)


# Generating Patched Solutions

### Generating Filtered Problems

In [None]:
from typing import List

from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import ContestProblemSetD, PatchedSolutionD

# Load the full (unfiltered) problem sets from CODE_CONTEST_DATA_PATH; the
# filtering cell below truncates them.
reader = CompressedDomainFileDAO(CODE_CONTEST_DATA_PATH, ContestProblemSetD)
problem_sets: List[ContestProblemSetD] = list(reader.read())

In [None]:
# Directory for the truncated ("filtered") problem sets; the cells below write
# to it and read back from it through CompressedDomainFileDAO.
FILTERED_DIR = "data/filtered_code_contest_data"

In [None]:
import dataclasses

from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import ContestProblemSetD

compressed_dao = CompressedDomainFileDAO(FILTERED_DIR, ContestProblemSetD)

# Shrink the dataset: keep the first 5 problems of every set and, for each of
# those, only the first 3 solutions, public tests and incorrect solutions.
# dataclasses.replace builds new objects, so `problem_sets` is left untouched.
filtered_problem_sets = [
    dataclasses.replace(
        problem_set,
        problems=[
            dataclasses.replace(
                problem,
                solutions=problem.solutions[:3],
                public_tests=problem.public_tests[:3],
                incorrect_solutions=problem.incorrect_solutions[:3],
            )
            for problem in problem_set.problems[:5]
        ],
    )
    for problem_set in problem_sets
]

# Report how much data remains after truncation.
kept_problems = [
    problem
    for filtered_problem_set in filtered_problem_sets
    for problem in filtered_problem_set.problems
]
num_inc_sol = sum(len(problem.incorrect_solutions) for problem in kept_problems)
num_pub_tests = sum(len(problem.public_tests) for problem in kept_problems)
print(f"Filtered {len(filtered_problem_sets)} problem sets, {num_inc_sol} incorrect solutions, and {num_pub_tests} public tests")

# Hand the truncated sets to the DAO bound to FILTERED_DIR.
compressed_dao.write(filtered_problem_sets)
       

### Generating Patched Solutions

In [None]:
from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import ContestProblemSetD

# Read the filtered problem sets back from FILTERED_DIR; from here on
# `problem_sets` refers to this reduced dataset.
reader = CompressedDomainFileDAO(FILTERED_DIR, ContestProblemSetD)
problem_sets = [problem_set for problem_set in reader.read()]

In [None]:

# Recount incorrect solutions and public tests on the reloaded problem sets as
# a sanity check of what was read from disk.
loaded_problems = [
    problem
    for problem_set in problem_sets
    for problem in problem_set.problems
]
num_inc_sol = sum(len(problem.incorrect_solutions) for problem in loaded_problems)
num_pub_tests = sum(len(problem.public_tests) for problem in loaded_problems)
print(f"Filtered {len(problem_sets)} problem sets, {num_inc_sol} incorrect solutions, and {num_pub_tests} public tests")


### Setting OPENAI_API_KEY

In [None]:
import os

# Copy KEY=VALUE pairs from the local secrets file into the process environment
# (per the section heading, OPENAI_API_KEY is expected to be one of them).
with open(".env.secret", "r") as f:
    for line_number, line in enumerate(f, start=1):
        entry = line.strip()
        # Blank lines and "#" comments carry no assignment; skip them.
        if not entry or entry.startswith("#"):
            continue
        # Split on the FIRST "=" only: secret values may themselves contain
        # "=" (e.g. base64 padding), which a plain split("=") cannot unpack.
        key, separator, value = entry.partition("=")
        if not separator:
            # Report the line number only, never the content, so a malformed
            # secret does not end up in the notebook output.
            raise ValueError(f".env.secret line {line_number}: expected KEY=VALUE")
        os.environ[key.strip()] = value.strip()

In [None]:
import logging
logging.basicConfig(level=logging.INFO)


from code_patching.prompts import PROMPTS
from code_patching.solution_generator import generate_prompted_dataset
from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import ContestProblemSetD, PatchedSolutionD, TestResultSetD, PatchedSolutionSetD
import proto.patched_solutions_pb2 as ps_pb2
from llm_handler.openai_handler import OpenAIHandler


# Directory bound to the DAO for model-generated patched solutions.
PROMPTED_DIR = "data/patched_solutions"
# Model identifiers taken from the patched_solutions proto module; passed to
# generate_prompted_dataset as `model_types`.
MODELS = [ps_pb2.MODEL_TYPE_GPT_4_TURBO, ps_pb2.MODEL_TYPE_GPT_3_5_TURBO]
# NOTE(review): `openai_handler` is not referenced by any later cell visible in
# this notebook — confirm whether constructing it is still needed (e.g. for
# side effects of OpenAIHandler()).
openai_handler = OpenAIHandler()
# DAO for PatchedSolutionSetD objects stored under PROMPTED_DIR.
prompted_dao = CompressedDomainFileDAO(PROMPTED_DIR, PatchedSolutionSetD)

In [None]:
# Run patched-solution generation over the filtered problem sets for the
# configured models and prompts, and keep everything it yields.
generation_kwargs = dict(
    contest_problems=problem_sets,
    model_types=MODELS,
    prompts=PROMPTS,
    result_batch_size=100,
    domain_reader=prompted_dao,
)
generated_solution_sets = list(generate_prompted_dataset(**generation_kwargs))

# Running Patched Evaluation

In [None]:
import logging
logging.basicConfig(level=logging.INFO)
from collections import defaultdict

from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import PatchedSolutionSetD, ContestProblemSetD

# Collect the public test cases of every filtered problem, keyed by problem id.
# The problem sets are read through a DAO created in this cell (rather than a
# `reader` variable left over from an earlier cell), so the cell is
# self-contained and always reads from FILTERED_DIR.
filtered_dao = CompressedDomainFileDAO(FILTERED_DIR, ContestProblemSetD)
problem_test_cases = defaultdict(list)
for problem_set in filtered_dao.read():
    for problem in problem_set.problems:
        problem_test_cases[problem.proto_id].extend(problem.public_tests)

# Group the stored patched solutions by the problem they were generated for.
prompted_dao = CompressedDomainFileDAO(PROMPTED_DIR, PatchedSolutionSetD)
problem_patched_solutions = defaultdict(list)
for patched_solution_set in prompted_dao.read():
    for patched_solution in patched_solution_set.solutions:
        problem_patched_solutions[patched_solution.problem_id].append(patched_solution)

# Both mappings must cover exactly the same problems; otherwise some problem
# would be evaluated without solutions, or some solution without tests.
assert set(problem_test_cases.keys()) == set(problem_patched_solutions.keys()), "Problem ids do not match"

In [None]:
# Report the evaluation workload. The total assumes every patched solution of
# a problem is run against every public test case of that problem.
print(f"{len(problem_test_cases)} problems")
num_cases = 0
for problem_id, test_cases in problem_test_cases.items():
    # Use .get() rather than indexing: `problem_patched_solutions` is a
    # defaultdict, and indexing a missing id would silently insert an empty
    # entry into the mapping that is later passed to the evaluator.
    num_solutions = len(problem_patched_solutions.get(problem_id, []))
    num_tests = len(test_cases)
    print(f"Problem {problem_id} has {num_solutions} solutions and {num_tests} test cases")
    num_cases += num_tests * num_solutions
print(f"Running {num_cases} test cases")

In [None]:

import logging
logging.basicConfig(level=logging.INFO)

from code_patching.solution_evaluator import eval_patched_solutions
from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import TestResultSetD, TestResultD


# Directory bound to the DAO that receives the patched-solution evaluation results.
PATCHED_EVAL_RESULTS_PATH = "data/patched_eval_results"
test_result_dao = CompressedDomainFileDAO(PATCHED_EVAL_RESULTS_PATH, TestResultSetD)

# Evaluate the generated patched solutions against the public tests of their
# problems; both inputs are the per-problem-id mappings built above.
patched_eval_run = eval_patched_solutions(
    problem_tests=problem_test_cases,
    patched_solutions=problem_patched_solutions,
    domain_writer=test_result_dao,
    process_batch_size=10,
    batch_size=1000,
)
# Collect everything the evaluator yields; the statistics cell below reads it.
test_result_sets = list(patched_eval_run)


In [None]:
from typing import List, Dict

# Flatten all evaluated result sets into one list of individual test results.
test_results = [
    test_result
    for test_result_set in test_result_sets
    for test_result in test_result_set.test_results]

total_tests = len(test_results)

# Group the results per problem so each problem's rate can be compared with
# the overall rate.
problem_to_results: Dict[str, List[TestResultD]] = defaultdict(list)
for test_result in test_results:
    problem_to_results[test_result.problem_id].append(test_result)

# Overall share of test results that carry exception info. With no results at
# all the rate is reported as 0.0 instead of raising ZeroDivisionError.
exception_pct = (
    sum(1 for test_result in test_results if test_result.exception_info) / total_tests
    if total_tests
    else 0.0
)
total_exception_pct = round(exception_pct*100, 2)

# Same rate per problem, as a percentage rounded to two decimals. Every entry
# of `problem_to_results` holds at least one result, so len(results) is never 0.
exception_pct_by_problem: Dict[str, float] = {}
for problem_id, results in problem_to_results.items():
    problem_exception_fraction = sum(1 for result in results if result.exception_info) / len(results)
    exception_pct_by_problem[problem_id] = round(problem_exception_fraction*100, 2)


In [None]:
# Print the overall exception rate, then each problem's rate together with its
# relative deviation from the overall rate.
print(f"Average exception rate: {total_exception_pct}%")
for problem_id, problem_exception_pct in exception_pct_by_problem.items():
    if total_exception_pct:
        relative_delta = round((problem_exception_pct/total_exception_pct*100)-100, 2)
        delta_text = f"{relative_delta}% delta to Avg"
    else:
        # An overall rate of 0 (no test raised) makes the relative delta
        # undefined; report that instead of dividing by zero.
        delta_text = "n/a delta to Avg"
    print(f"Problem {problem_id}\n      Exception Rate - {problem_exception_pct}%\n      {delta_text}")