# Setup

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import sys
sys.path.append(".bin")

import logging
logging.basicConfig(level=logging.INFO)

FILTERED_DIR = "data/filtered_code_contest_data"
CODE_CONTEST_DATA_PATH = "data/code_contest_data"
PROMPTED_DIR = "data/patched_solutions"
PATCHED_EVAL_RESULTS_PATH = "data/patched_eval_results"
BASE_EVAL_RESULTS_PATH = "data/eval_results"

GRAPH_DIR = "data/graphs"
os.makedirs(GRAPH_DIR, exist_ok=True)

# Data Setup

### Generating Data Dicts

In [None]:
from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import TestResultSetD, ContestProblemSetD, ContestProblemSetD, PatchedSolutionSetD
from code_patching.prompts import PROMPTS


test_result_dao = CompressedDomainFileDAO(PATCHED_EVAL_RESULTS_PATH, TestResultSetD)
test_result_sets = list(test_result_dao.read())
test_results = [
    test_result for test_result_set in test_result_sets 
    for test_result in test_result_set.test_results]
logging.info(f"Loaded {len(test_results)} test results")

base_result_dao = CompressedDomainFileDAO(BASE_EVAL_RESULTS_PATH, TestResultSetD)   
base_result_sets = list(base_result_dao.read())
base_results = [
    test_result for test_result_set in base_result_sets 
    for test_result in test_result_set.test_results]
logging.info(f"Loaded {len(base_results)} base test results")

problem_dao = CompressedDomainFileDAO(FILTERED_DIR, ContestProblemSetD)
problem_sets = list(problem_dao.read())
problem_ds = [
    problem for problem_set in problem_sets
    for problem in problem_set.problems]
logging.info(f"Loaded {len(problem_ds)} problems")

patched_solution_dao = CompressedDomainFileDAO(PROMPTED_DIR, PatchedSolutionSetD)
patched_solution_sets = list(patched_solution_dao.read())
patched_solutions = {
    patched_solution.proto_id: patched_solution
    for patched_solution_set in patched_solution_sets
    for patched_solution in patched_solution_set.solutions}
logging.info(f"Loaded {len(patched_solutions)} patched solutions")

patching_prompts = {
    prompt.proto_id: prompt
    for prompt in PROMPTS}
logging.info(f"Loaded {len(patching_prompts)} patching prompts")


### Validating Alignment

In [None]:
# Problem ID Alignment
result_problem_ids = set([test_result.problem_id for test_result in test_results])
logging.info(f"{len(result_problem_ids)} unique problems in test results")
problem_ids = set([problem.proto_id for problem in problem_ds])
logging.info(f"{len(problem_ids)} unique problems in problem set")
unified_problem_ids = result_problem_ids.union(problem_ids)
logging.info(f"{len(unified_problem_ids)} unique problems in both test results and problem set")
if result_problem_ids != problem_ids:
    difference = result_problem_ids.symmetric_difference(problem_ids)
    logging.warning(f"{len(difference)} test results do not have a corresponding problem in the problem set.")

# Test ID Alignment
result_test_ids = set([test_result.test_id for test_result in test_results])
logging.info(f"{len(result_test_ids)} unique tests in test results")
test_ids = set([test.proto_id for problem in problem_ds for test in problem.public_tests])
logging.info(f"{len(test_ids)} unique tests in problem set")
unified_test_ids = result_test_ids.union(test_ids)
logging.info(f"{len(unified_test_ids)} unique tests in both test results and problem set")
if result_test_ids != test_ids:
    raise ValueError(f"Test ids in test results and problem set do not match with {result_test_ids.symmetric_difference(test_ids)}")

### Data Transformation Functions

In [None]:
from typing import Dict, Any, Union

import proto.contest_problem_pb2 as cp_pb2
import proto.patched_solutions_pb2 as ps_pb2
from domain.problems_d import ContestProblemD, TestResultD
from llm_handler.openai_handler import OpenAIHandler


def difficulty_to_int(difficulty: int) -> float:
    """ Translates to 1-20 scale for difficulty then quantizes to 0-1 float""" 
    DIFFICULTY_SCALER_MAP = {
        cp_pb2.ContestProblem.Difficulty.UNKNOWN_DIFFICULTY: -1,  # to purposefully segregate unknown difficulties
        cp_pb2.ContestProblem.Difficulty.EASY: 1,
        cp_pb2.ContestProblem.Difficulty.MEDIUM: 10,
        cp_pb2.ContestProblem.Difficulty.HARD: 15,
        cp_pb2.ContestProblem.Difficulty.HARDER: 17,
        cp_pb2.ContestProblem.Difficulty.HARDEST: 20,
        cp_pb2.ContestProblem.Difficulty.A: 1,
        cp_pb2.ContestProblem.Difficulty.B: 2,
        cp_pb2.ContestProblem.Difficulty.C: 3,
        cp_pb2.ContestProblem.Difficulty.D: 4,
        cp_pb2.ContestProblem.Difficulty.E: 5,
        cp_pb2.ContestProblem.Difficulty.F: 6,
        cp_pb2.ContestProblem.Difficulty.G: 7,
        cp_pb2.ContestProblem.Difficulty.H: 8,
        cp_pb2.ContestProblem.Difficulty.I: 9,
        cp_pb2.ContestProblem.Difficulty.J: 10,
        cp_pb2.ContestProblem.Difficulty.K: 11,
        cp_pb2.ContestProblem.Difficulty.L: 12,
        cp_pb2.ContestProblem.Difficulty.M: 13,
        cp_pb2.ContestProblem.Difficulty.N: 14,
        cp_pb2.ContestProblem.Difficulty.O: 15,
        cp_pb2.ContestProblem.Difficulty.P: 16,
        cp_pb2.ContestProblem.Difficulty.Q: 16,
        cp_pb2.ContestProblem.Difficulty.R: 17,
        cp_pb2.ContestProblem.Difficulty.S: 17,
        cp_pb2.ContestProblem.Difficulty.T: 18,
        cp_pb2.ContestProblem.Difficulty.U: 19,
        cp_pb2.ContestProblem.Difficulty.V: 20}
    if difficulty not in DIFFICULTY_SCALER_MAP:
        raise ValueError(f"Unknown difficulty {difficulty}")
    diff_scaler = DIFFICULTY_SCALER_MAP[difficulty]
    return diff_scaler / 20

def output_transformer(test_output: str) -> str:
    return str([int(char) for char in test_output if char.isdigit()])

def problem_to_df_dict(problem: ContestProblemD) -> Dict[str, Any]:
    difficulty = difficulty_to_int(problem.difficulty)
    return {
        "problem_id": problem.proto_id,
        "problem_name": problem.name,
        "problem_difficulty": problem.difficulty,
        "mapped_difficulty": difficulty,
        "cf_points": problem.cf_points,
        "cf_rating": problem.cf_rating,
        "time_limit_nsec": problem.time_limit_nsec,
        "memory_limit_bytes": problem.memory_limit_bytes}

def test_result_to_df_dict(result: TestResultD) -> Dict[str, Any]:
    transformed_expected_output = output_transformer(result.expected_output)
    transformed_solution_output = output_transformer(result.solution_output)
    correct = transformed_expected_output == transformed_solution_output
    return {
            "expected_output": transformed_expected_output,
            "solution_output": transformed_solution_output,
            "result_id": result.proto_id,
            "test_id": result.test_id,
            "solution_id": result.solution_id,
            "correct": int(correct),
            "failed": int(bool(result.exception_info)),
            "exception_info": result.exception_info}

def model_name(model: Union[str, 'ps_pb2.ModelType']) -> str:
    if model in OpenAIHandler._MODEL_NAME_TO_VERSION:
        return OpenAIHandler._MODEL_NAME_TO_VERSION[model]
    return str(model)
    
def format_prompt_name(prompt: str) -> str:
    _PROMPT_NAME_MAP = {
        'code_patching_prompt_base': "Base Prompt",
        'code_patching_prompt_base_explanation': "Change Explanation",
        "code_patching_prompt_base_test_generation": "Test Generation",
        "code_patching_prompt_self_evaluation": "Self Evaluation",
        "code_patching_prompt_minimal": "Minimal Context"
    }
    if prompt in _PROMPT_NAME_MAP:
        return _PROMPT_NAME_MAP[prompt]
    return prompt


In [None]:
from typing import List, Dict
from collections import defaultdict

from domain.problems_d import TestResultD


unified_result_dict: Dict[str, List[TestResultD]] = defaultdict(list)
for test_result in test_results:
    unified_result_dict[test_result.problem_id].append(test_result)

base_unified_result_dict: Dict[str, List[TestResultD]] = defaultdict(list)
for test_result in base_results:
    base_unified_result_dict[test_result.problem_id].append(test_result)

unified_problem_ds = [
    problem for problem in problem_ds
    if problem.proto_id in unified_problem_ids]


In [7]:
from typing import Any, List, Dict
import pandas as pd


unified_dict_records: List[Dict[str, Any]] = []
for problem in unified_problem_ds:
    patched_test_results = unified_result_dict[problem.proto_id]
    base_test_results = base_unified_result_dict[problem.proto_id]
    test_results = patched_test_results + base_test_results

    difficulty = difficulty_to_int(problem.difficulty)
    problem_dict = problem_to_df_dict(problem)
    
    
    for result in test_results:
        model = "base_result"
        prompt_name = "base_result"
        # required as base results exist in the same set but don't have model or prompt
        if result.solution_id in patched_solutions: 
            solution = patched_solutions[result.solution_id]
            model = solution.model
            prompt_name = patching_prompts[solution.prompt_id].prompt_name
        
        test_dict = test_result_to_df_dict(result)
        df_dict = {
            **problem_dict, 
            **test_dict,
            "model": model_name(model),
            "prompt_name": format_prompt_name(prompt_name)}
        unified_dict_records.append(df_dict)

In [8]:
RESULTS_DF = pd.DataFrame(unified_dict_records)
logging.info(f"Results DF: {RESULTS_DF.shape}")

INFO:root:Results DF: (54154, 18)


### Graph Setup

In [9]:
from plotly import graph_objs as go

def fig_handler(func, graph_dir: str = GRAPH_DIR):
    def wrapper(*args, show: bool = True, save: bool=True, **kwargs):
        fig = func(*args, **kwargs)
        if not isinstance(fig, go.Figure):
            raise ValueError(f"Function {func.__name__} did not return a plotly figure")
       
        if show: fig.show()    
        if save:
            file_name = str(fig.to_dict()["layout"]["title"]["text"]).lower().replace(" ", "_")
            img_path = os.path.join(graph_dir, file_name) 
            fig.write_image(img_path + ".png")
    return wrapper

# Analysis 

### Model Type Analysis

In [None]:
import plotly.express as px

@fig_handler
def model_type_performance(results_df: pd.DataFrame):
    
    correct_df = results_df[results_df['failed'] == 0] 
    correct_pct = correct_df.groupby('model')[['correct']].mean() * 100
    failed_pct = results_df.groupby('model')['failed'].mean() * 100
    
    combined_df = pd.concat([correct_pct, failed_pct], axis=1)
    combined_df.columns = ["correct", "failed"]
    combined_df = combined_df.reset_index().sort_values(by="correct", ascending=False)
    fig = px.bar(
        combined_df, 
        x='model',
        y="correct",
        color='model',    
        title=f"Model Type Performance",
        labels={"correct": "Correct (%)"},
        error_y=combined_df["failed"]/2,
        height=600,
        width=800)
    #  put text on error bars
    for _, row in combined_df.iterrows():
        fig.add_annotation(
            x=row['model'], 
            y=row['correct'] + row['failed']/2 + 5, 
            text=f"± {row['failed']:.2f}%", 
            showarrow=False)
    fig.update_layout(showlegend=False, title_x=0.5)
    return fig

model_type_performance(RESULTS_DF, show=True, save=True)

### Model Type Performance Distribution

In [None]:
# plot model performance as distribution of scores across problems
import plotly.graph_objects as go
import plotly.figure_factory as ff


@fig_handler
def model_problem_performance_distribution(results_df: pd.DataFrame):
    correct_df = results_df[results_df['failed'] == 0]
    model_df = correct_df[correct_df['model'] != 'base_result']
    model_problem_scores = model_df.groupby(['problem_id', 'model'])['correct'].mean().unstack().dropna()
    dist_plot = ff.create_distplot(
        [model_problem_scores[model] for model in model_problem_scores.columns],
        model_problem_scores.columns,
        show_hist=False, 
        show_rug=True,
        bin_size=0.1,
        )
    dist_plot.update_layout(
        title="Model Performance Distribution",
        xaxis_title="Correct (%)",
        yaxis_title="Density",
        height=600,
        width=800)
    return dist_plot

model_problem_performance_distribution(RESULTS_DF, show=True, save=False)

### Prompt Problem Performance Distribution

In [None]:
# plot model performance as distribution of scores across problems
import plotly.graph_objects as go
import plotly.figure_factory as ff


@fig_handler
def prompt_problem_performance_distribution(results_df: pd.DataFrame):
    correct_df = results_df[results_df['failed'] == 0]
    prompt_df = correct_df[correct_df['model'] != 'base_result']
    prompt_problem_scores = prompt_df.groupby(['problem_id', 'prompt_name'])['correct'].mean().unstack().dropna()
    dist_fig = ff.create_distplot(
        [prompt_problem_scores[prompt] for prompt in prompt_problem_scores.columns],
        prompt_problem_scores.columns,
        show_hist=False, 
        show_rug=True,
        )
   
    return dist_fig

prompt_problem_performance_distribution(RESULTS_DF, show=True, save=False)

### Prompt Type Analysis

In [None]:
import plotly.express as px

@fig_handler
def prompt_type_performance(results_df: pd.DataFrame):
    
    results_df = results_df[results_df['model'] != "base_result"]
    correct_df = results_df[results_df['failed'] == 0] 
    correct_pct = correct_df.groupby('prompt_name')[['correct']].mean() * 100    
    combined_df = correct_pct.reset_index().sort_values(by="correct", ascending=False)
    fig = px.bar(
        combined_df, 
        x='prompt_name',
        y="correct",
        color='prompt_name',    
        title=f"Prompt Type Performance",
        labels={"correct": "Correct (%)"},
        height=600,
        width=800)

    fig.update_layout(showlegend=False, title_x=0.5)
    return fig

prompt_type_performance(RESULTS_DF, show=True, save=True)

### Prompt Imact Relative to Model Type

In [None]:
@fig_handler
def prompt_performance_by_model(results_df: pd.DataFrame):
    import seaborn as sns

    results_df = results_df[results_df['model'] != "base_result"]
    correct_df = results_df[results_df['failed'] == 0] 
    correct_pct = correct_df.groupby(['model', 'prompt_name'])[['correct']].mean() * 100    
    combined_df = correct_pct.reset_index().sort_values(by="correct", ascending=False)
    
    fig = px.bar(
        combined_df, 
        x='model',
        y="correct",
        color='prompt_name',
        barmode='group',
        title=f"Prompt Performance by Model",
        labels={"correct": "Correct (%)"},
        height=600,
        width=800)
    fig.update_layout(showlegend=True, title_x=0.5)
    return fig

prompt_performance_by_model(RESULTS_DF, show=True, save=False)

### CF Rating Model Analysis

In [None]:
# plot mapped difficulty's impact on model performance
@fig_handler
def cf_rating_model_performance(results_df: pd.DataFrame):
    correct_df = results_df[results_df['failed'] == 0]
    cf_rating_df = correct_df[correct_df['cf_rating'] > 0]
    difficulty_performance = cf_rating_df.groupby(['model', 'cf_rating'])['correct'].mean()
    difficulty_sem = cf_rating_df.groupby(['model', 'cf_rating'])['correct'].sem()
    difficulty_performance = difficulty_performance.reset_index()
    difficulty_sem = difficulty_sem.reset_index()

    #connect with a line
    fig = px.scatter(
        difficulty_performance, 
        x='cf_rating', 
        y='correct', 
        color='model',
        color_continuous_scale=px.colors.sequential.Viridis,
        error_y=difficulty_sem['correct'],
        title="Model Performance by CF Rating",
        labels={"cf_rating": "Codeforces Rating", "correct": "Correct (%)"},
        height=600,
        width=800)
    fig.update_traces(mode='markers+lines')
    fig.update_layout(title_x=0.5)
    return fig
    
cf_rating_model_performance(RESULTS_DF, show=True, save=False)

### CF Rating Prompt Impact

In [None]:
@fig_handler
def cf_rating_prompt_performance(results_df: pd.DataFrame):
    correct_df = results_df[\
        (results_df['failed'] == 0) & \
        (results_df['cf_rating'] > 0 )&\
        (results_df['model'] != "base_result")]

    difficulty_performance = correct_df.groupby(['prompt_name', 'cf_rating'])['correct'].mean()
    difficulty_performance = difficulty_performance.reset_index()

    #connect with a line
    fig = px.scatter(
        difficulty_performance, 
        x='cf_rating', 
        y='correct', 
        color='prompt_name',
        opacity=0.7,
        color_continuous_scale=px.colors.sequential.Viridis,
        title="Prompt Performance by CF Rating",
        labels={"prompt_name": "CF Rating", "correct": "Correct (%)"},
        height=600,
        width=800)
    fig.update_layout(title_x=0.5)
    # add average line 
    prompt_avg_correct = difficulty_performance.groupby('cf_rating')['correct'].mean()
    fig.add_scatter(
        x=prompt_avg_correct.index, 
        y=prompt_avg_correct.values, 
        mode='lines', 
        name='Average',
        opacity=0.6,
        line=dict(color='black', dash='dash'))
    return fig

cf_rating_prompt_performance(RESULTS_DF, show=True, save=False)

### Failure Analysis

In [None]:
# plot graph illustrating model and prompt impact on the rate of failure
import plotly.graph_objects as go
import plotly.figure_factory as ff


@fig_handler
def failure_rate_by_model(results_df: pd.DataFrame):
    results_df = results_df[\
        (results_df['model'] != "base_result")]
    
    problem_failed_pct = results_df.groupby(['problem_id', 'prompt_name'])['failed'].mean().unstack().dropna()
    hist_data = [problem_failed_pct[model] for model in problem_failed_pct.columns]
    group_labels = problem_failed_pct.columns
    
    
    fig = ff.create_distplot(
        hist_data, 
        group_labels, 
        show_hist=False, 
        show_rug=True,
        bin_size=0.05)
    fig.update_layout(
        title="Prompt Failure Distribution",
        xaxis_title="Correct (%)",
        yaxis_title="Density",
        height=600,
        width=800)
    
    
    
    return fig


failure_rate_by_model(RESULTS_DF, show=True, save=False)
   

# Prediction Analysis

In [None]:
import concurrent.futures as cf
import tqdm
from typing import List, Dict

from llm_handler.openai_handler import OpenAIHandler
OpenAIHandler.set_openai_api_key('.env.secret')


def get_embedding(text: str) -> List[float]:
    embedding = OpenAIHandler.get_text_embedding(text)
    return embedding



from domain.domain_dao import CompressedDomainFileDAO
from domain.problems_d import ContestProblemSetD


problem_dao = CompressedDomainFileDAO(FILTERED_DIR, ContestProblemSetD)
problem_sets = list(problem_dao.read())

problem_ds = [
    problem for problem_set in problem_sets
    for problem in problem_set.problems]

problem_id_to_description = {
    problem.proto_id: problem.description
    for problem in problem_ds}


with cf.ThreadPoolExecutor() as executor:

    future_map: Dict[cf.Future[List[float]], str] = {}
    for problem in problem_ds:
        embedding_future = executor.submit(get_embedding, problem.description)
        future_map[embedding_future] = problem.proto_id

    embeddings = {}
    for future in tqdm.tqdm(cf.as_completed(future_map), total=len(future_map)):
        problem_id = future_map[future]
        embeddings[problem_id] = future.result()


In [15]:
filtered_results = RESULTS_DF[(RESULTS_DF['model'] != "base_result")].copy()
# get average problem across all models and prompts
QUANT_COLS = ['cf_rating', 'mapped_difficulty', 'cf_rating', 'time_limit_nsec', 'memory_limit_bytes', 'correct', 'failed']
EVAL_DATA = filtered_results.groupby('problem_id')[QUANT_COLS].mean().reset_index().drop('problem_id', axis=1)

In [16]:
# use gpc from sklearn to learn distribution of correct
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score


scaler = StandardScaler()
problem_performance = EVAL_DATA.copy()
problem_performance = problem_performance.drop('correct', axis=1)

X = scaler.fit_transform(problem_performance)
y = EVAL_DATA['correct']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

kernel = RBF() + WhiteKernel()
gpc = GaussianProcessRegressor(kernel=kernel, random_state=42).fit(X_train, y_train)

score=gpc.score(X_test, y_test) 
logging.info(f"Correct Base: {score}")
logging.info(f"train score: {gpc.score(X_train, y_train)}")
base_pred = gpc.predict(X_test)

INFO:root:Correct Base: 0.1779501103723018
INFO:root:train score: 0.31614878717522066


In [17]:
filtered_results = RESULTS_DF[(RESULTS_DF['model'] != "base_result")].copy()
# get average problem across all models and prompts
QUANT_COLS = ['cf_rating', 'mapped_difficulty', 'cf_rating', 'time_limit_nsec', 'memory_limit_bytes', 'correct']
EVAL_DATA = filtered_results.groupby('problem_id')[QUANT_COLS].mean()

EMBEDDING_DF = pd.DataFrame(embeddings).T
# create df with embedding and correct frmo RESULTS_DF
EMBEDDING_DF = EMBEDDING_DF.reset_index()
EMBEDDING_DF.rename(columns={'index': 'problem_id'}, inplace=True)
EMBEDDING_DF = EMBEDDING_DF.merge(EVAL_DATA['correct'], on='problem_id')

Y = EMBEDDING_DF['correct']
X = EMBEDDING_DF.drop(['problem_id', 'correct'], axis=1)


In [18]:
# use gpc from sklearn to learn distribution of correct
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

kernel = RBF() + WhiteKernel()
gpc = GaussianProcessRegressor(kernel=kernel, random_state=42).fit(X_train, y_train)

score = gpc.score(X_test, y_test)
logging.info(f"Correct Embedding Prediction Model Score: {score}")
logging.info(f"Training Score: {gpc.score(X_train, y_train)}")



INFO:root:Correct Embedding Prediction Model Score: 0.22780370145169015
INFO:root:Training Score: 0.6698477641747034


In [20]:
# plot model performance as distribution of scores across problems
import plotly.graph_objects as go
import plotly.figure_factory as ff

fig = go.Figure()

pred_line_vals = gpc.predict(X_test)
fig.add_trace(go.Scatter(y=pred_line_vals, mode='lines', name='Embedding Trained Pred'))
fig.add_trace(go.Scatter(y=base_pred, mode='lines', name='Base Trained Pred'))
fig.add_trace(go.Scatter(y=y_test, mode='markers', name='True'))

fig.update_layout(title="Correct Prediction Model", xaxis_title="True Correct", yaxis_title="Predicted Correct")
fig.show()



In [21]:
filtered_results = RESULTS_DF[(RESULTS_DF['model'] != "base_result") & RESULTS_DF['failed'] == 0].copy()
# get average problem across all models and prompts
QUANT_COLS = ['cf_rating', 'mapped_difficulty', 'cf_rating', 'time_limit_nsec', 'memory_limit_bytes', 'correct', 'problem_id']
EVAL_DATA = filtered_results[QUANT_COLS]

EMBEDDING_DF = pd.DataFrame(embeddings).T
EMBEDDING_DF = EMBEDDING_DF.reset_index().rename(columns={'index': 'problem_id'})
EMBEDDING_DF = EMBEDDING_DF.merge(EVAL_DATA[['correct', 'problem_id']], on='problem_id')

Y = EMBEDDING_DF['correct']
X = EMBEDDING_DF.drop(['problem_id', 'correct'], axis=1)


In [22]:
# use gpc from sklearn to learn distribution of correct
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

kernel = RBF() + WhiteKernel()
gpc = GaussianProcessClassifier(kernel=kernel, random_state=42, n_jobs=-1).fit(X_train, y_train)

score = gpc.score(X_test, y_test)
logging.info(f"Failed Embedding Prediction Model Score: {score}")
logging.info(f"Training Score: {gpc.score(X_train, y_train)}")

TypeError: GaussianProcessClassifier.__init__() got an unexpected keyword argument 'verbose'