In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import csv
import ast
import os
import pandas as pd
import os
import sys
import textwrap

In [2]:
# Read a JSON Lines file (one JSON document per line).
def read_jsonl(file_path):
    """Load every JSON document from a JSON Lines file.

    :param file_path: Path of the ``.jsonl`` file to read.
    :return: List with one decoded object per non-blank line, in file order.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (typically a trailing newline) are not JSON documents.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

# function to read csv file with delimiter
# def read_csv(file_path, delimiter='$'):
#     data = []
#     with open(file_path, 'r') as file:
#         reader = csv.reader(file, delimiter=delimiter)
#         for row in reader:
#             data.append(row)
#     return data


def read_csv(file_path, delimiter=','):
    """Read a delimited text file into a DataFrame.

    :param file_path: Path of the file to read.
    :param delimiter: Field separator passed to ``pandas.read_csv``.
    :return: The parsed DataFrame, or ``None`` (after printing a notice)
        when ``file_path`` does not exist.
    """
    if os.path.exists(file_path):
        return pd.read_csv(file_path, delimiter=delimiter)

    print("File not found: {}".format(file_path))
    return None

def write_csv(df, file_path):
    """Write ``df`` to ``file_path`` as comma-separated text without the index column."""
    df.to_csv(path_or_buf=file_path, index=False)


def plot_dict_old(dictionary, title, xlabel, ylabel, width=15, height=7, same_color=True, color='gray', model='gpt4'):
    """Plot a ``{label: number}`` dictionary as a bar chart sorted by value.

    :param dictionary: Mapping of bar label to bar height.
    :param title: Plot title; also used to derive the PNG file name.
    :param xlabel: Label of the x axis.
    :param ylabel: Label of the y axis.
    :param width: Figure width passed to ``plt.figure``.
    :param height: Figure height passed to ``plt.figure``.
    :param same_color: When True every bar uses ``color``; otherwise bars that
        share the same value share a palette colour.
    :param color: Bar colour used when ``same_color`` is True.
    :param model: Results sub-folder the PNG is written to
        (``../../results/<model>/plots/``).
    :return: None. The figure is saved and shown.
    """
    # Named colours assigned to distinct values when same_color is False.
    palette = ['purple', 'brown', 'pink', 'teal', 'olive', 'cyan', 'blue', 'orange', 'green', 'red', 'yellow', 'gray', 'maroon', 'lime', 'aqua', 'fuchsia', 'silver', 'skyblue']

    plt.figure(figsize=(width, height))
    plt.xticks(rotation=90)
    dictionary = dict(sorted(dictionary.items(), key=lambda item: item[1]))

    if not same_color:
        color_by_value = {}
        for key, value in dictionary.items():
            if value not in color_by_value:
                # Wrap around instead of running out of palette entries.
                color_by_value[value] = palette[len(color_by_value) % len(palette)]
            # Draw each bar once; a second uncoloured bar would hide the chosen colour.
            plt.bar(key, value, color=color_by_value[value], width=0.5)
    else:
        plt.bar(list(dictionary.keys()), list(dictionary.values()), width=0.5, color=color)

    # Display the value on top of each bar.
    for key, value in dictionary.items():
        plt.text(key, value, str(value), ha='center')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    file_name = title.replace(' ', '_').replace('/', '_') + '.png'
    path = '../../results/' + model + '/plots/' + file_name
    plt.savefig(path)
    plt.show()
    

# Plot per-key means as a bar chart with standard-deviation error bars.
def plot_dict(dictionary, title, xlabel, ylabel, width=15, height=7, same_color=True, color='gray', model='gpt4'):
    """Plot ``{label: {'mean': ..., 'std_dev': ...}}`` as bars with error bars.

    :param dictionary: Mapping of bar label to a dict holding at least the
        keys ``'mean'`` and ``'std_dev'``.
    :param title: Plot title; also used to derive the PNG file name.
    :param xlabel: Label of the x axis.
    :param ylabel: Label of the y axis.
    :param width: Figure width passed to ``plt.figure``.
    :param height: Figure height passed to ``plt.figure``.
    :param same_color: When True all bars use the first palette colour,
        otherwise each bar takes the next palette colour.
    :param color: Accepted for signature compatibility.
        NOTE(review): this value is not used; with ``same_color=True`` the
        bars use the first palette entry ('purple').
    :param model: Results sub-folder the PNG is written to
        (``../../results/<model>/plots/``).
    :return: None. The figure is saved and shown.
    """
    palette = ['purple', 'brown', 'pink', 'teal', 'olive', 'cyan', 'blue', 'orange', 'green', 'red', 'yellow', 'gray', 'maroon', 'lime', 'aqua', 'fuchsia', 'silver', 'skyblue']

    plt.figure(figsize=(width, height))
    plt.xticks(rotation=90)

    # Extract means and standard deviations in a stable key order.
    keys = list(dictionary.keys())
    means = [dictionary[key]['mean'] for key in keys]
    std_devs = [dictionary[key]['std_dev'] for key in keys]

    if same_color:
        bar_colors = [palette[0]] * len(keys)
    else:
        bar_colors = palette[:len(keys)]  # Assign different colors

    # Asymmetric error bars: for non-negative means the lower whisker is cut
    # at zero. A negative mean keeps the full std_dev, because min(mean, std)
    # would be negative there and matplotlib rejects negative yerr values.
    lower_errors = [
        std_dev if mean < 0 else min(mean, std_dev)
        for mean, std_dev in zip(means, std_devs)
    ]
    upper_errors = std_devs  # No adjustment needed for upper errors
    error_bars = [lower_errors, upper_errors]

    plt.bar(keys, means, yerr=error_bars, color=bar_colors, capsize=5)

    # Display the mean value above each bar (bars sit at x = 0, 1, 2, ...).
    for position, mean in enumerate(means):
        plt.text(position, mean, f'{mean:.2f}', ha='center', va='bottom', color='black')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    file_name = title.replace(' ', '_').replace('/', '_') + '.png'
    path = '../../results/' + model + '/plots/' + file_name
    plt.savefig(path)
    plt.show()

    
def extract_function_name(groundtruth_code):
    """Return the name of the first function definition found in a snippet.

    The snippet is dedented first so that method bodies copied out of a class
    can be parsed. Nodes are visited in ``ast.walk`` order.

    :param groundtruth_code: Python source text, possibly uniformly indented.
    :return: Name of the first ``def`` / ``async def`` encountered, or ``None``
        when the snippet defines no function.
    :raises SyntaxError: If the dedented snippet is not valid Python.
    """
    groundtruth_code = textwrap.dedent(groundtruth_code)
    tree = ast.parse(groundtruth_code)
    for node in ast.walk(tree):
        # Coroutines are separate node types and would otherwise be skipped.
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            return node.name
    return None

def add_list_to_dict(dictionary, key, value):
    """Append ``value`` to the list stored under ``key``, creating the list on first use."""
    dictionary.setdefault(key, []).append(value)


def average_results(version, model='gpt4'):
    """Average all '$'-separated result CSVs of one version per ``comb_id``.

    Every ``*.csv`` file in ``../../results/<model>/evaluation_results/`` whose
    name contains ``_v<version>`` is concatenated row-wise. Numeric columns are
    averaged per ``comb_id``; for non-numeric columns the first non-null value
    of each group is kept. The result is written next to the inputs as
    ``test_results_codereval_AVERAGED_v<version>.csv`` using '$' as separator.

    :param version: Version number (or string) used in the file names.
    :param model: Results sub-folder to read from and write to.
    :return: None.
    """
    directory = '../../results/' + model + '/evaluation_results/'
    output_name = 'test_results_codereval_AVERAGED_v' + str(version) + '.csv'
    dataframes = []

    for filename in os.listdir(directory):
        # The output of a previous run matches the pattern below; skip it so
        # it is not averaged into itself.
        if filename == output_name:
            continue
        if '_v'+ str(version) in filename and filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path, delimiter='$')
            dataframes.append(df)

    if not dataframes:
        print("No result files found for version {} in {}".format(version, directory))
        return

    concatenated_df = pd.concat(dataframes, ignore_index=True)  # row-wise concatenation

    # The grouping key must not be among the aggregated columns, otherwise
    # reset_index() would try to insert 'comb_id' a second time.
    numeric_columns = concatenated_df.select_dtypes(include='number').columns.difference(['comb_id'], sort=False)
    non_numeric_columns = concatenated_df.select_dtypes(exclude='number').columns.difference(['comb_id'], sort=False)

    grouped = concatenated_df.groupby('comb_id')

    # Mean of every numeric column per combination.
    averaged_numeric_df = grouped[list(numeric_columns)].mean().reset_index()

    # First non-null value of every non-numeric column per combination.
    first_non_null = lambda x: x.dropna().iloc[0] if not x.dropna().empty else None
    averaged_non_numeric_df = grouped[list(non_numeric_columns)].agg(first_non_null).reset_index()

    averaged_df = pd.merge(averaged_numeric_df, averaged_non_numeric_df, on='comb_id')

    # DataFrame.to_csv takes `sep`, not `delimiter`.
    averaged_df.to_csv(os.path.join(directory, output_name), index=False, sep='$')
    print("Averaged CSV file created successfully.")



In [3]:
import matplotlib.pyplot as plt

# Collect the points lying more than `threshold` standard deviations from the mean.
def detect_outliers(data, mean, std_dev, threshold=2):
    """Return the elements of ``data`` whose distance to ``mean`` exceeds ``threshold * std_dev``.

    :param data: Iterable of numbers.
    :param mean: Reference mean.
    :param std_dev: Reference standard deviation.
    :param threshold: Multiplier applied to ``std_dev``.
    :return: List of outliers in their original order.
    """
    cutoff = threshold * std_dev
    flagged = []
    for point in data:
        if abs(point - mean) > cutoff:
            flagged.append(point)
    return flagged

# Plot per-key means as a bar chart with error bars and mark outlying samples.
def plot_dict_with_outliers(dictionary, title, xlabel, ylabel, width=15, height=7, same_color=True, color='gray', threshold=2, model='gpt4'):
    """Bar chart of means with std-dev error bars and red outlier markers.

    :param dictionary: Mapping of bar label to a dict holding the keys
        ``'mean'``, ``'std_dev'`` and ``'data'`` (the raw samples).
    :param title: Plot title; also used to derive the PNG file name.
    :param xlabel: Label of the x axis.
    :param ylabel: Label of the y axis.
    :param width: Figure width passed to ``plt.figure``.
    :param height: Figure height passed to ``plt.figure``.
    :param same_color: When True all bars use the first palette colour,
        otherwise each bar takes the next palette colour.
    :param color: Accepted for signature compatibility.
        NOTE(review): this value is not used; with ``same_color=True`` the
        bars use the first palette entry ('purple').
    :param threshold: Number of standard deviations beyond which a sample is
        drawn as an outlier.
    :param model: Results sub-folder the PNG is written to
        (``../../results/<model>/plots/``).
    :return: None. The figure is saved and shown.
    """
    palette = ['purple', 'brown', 'pink', 'teal', 'olive', 'cyan', 'blue', 'orange', 'green', 'red', 'yellow', 'gray', 'maroon', 'lime', 'aqua', 'fuchsia', 'silver', 'white']

    plt.figure(figsize=(width, height))
    plt.xticks(rotation=90)

    # Extract means, standard deviations, and raw samples in a stable key order.
    keys = list(dictionary.keys())
    means = [dictionary[key]['mean'] for key in keys]
    std_devs = [dictionary[key]['std_dev'] for key in keys]
    values = [dictionary[key]['data'] for key in keys]

    if same_color:
        bar_colors = [palette[0]] * len(keys)
    else:
        bar_colors = palette[:len(keys)]  # Assign different colors

    # For non-negative means the lower whisker is cut at zero. A negative mean
    # keeps the full std_dev because matplotlib rejects negative yerr values.
    lower_errors = [
        std_dev if mean < 0 else min(mean, std_dev)
        for mean, std_dev in zip(means, std_devs)
    ]
    upper_errors = std_devs  # No adjustment needed for upper errors
    error_bars = [lower_errors, upper_errors]

    plt.bar(keys, means, yerr=error_bars, color=bar_colors, capsize=5)

    legend_label_used = False
    for key, mean, std_dev, value_list in zip(keys, means, std_devs, values):
        outliers = detect_outliers(value_list, mean, std_dev, threshold)
        if not outliers:
            continue
        # Label only the first scatter that is actually drawn, so the legend
        # gets exactly one 'Outliers' entry even if the first key has none.
        label = "" if legend_label_used else 'Outliers'
        legend_label_used = True
        plt.scatter([key] * len(outliers), outliers, color='red', zorder=5, label=label)

    # Display the mean value above each bar (bars sit at x = 0, 1, 2, ...).
    for position, mean in enumerate(means):
        plt.text(position, mean, f'{mean:.2f}', ha='center', va='bottom', color='black')

    # A legend without labelled artists only produces a warning.
    if legend_label_used:
        plt.legend()
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Save and show the plot
    file_name = title.replace(' ', '_').replace('/', '_') + '.png'
    path = '../../results/' + model + '/plots/' + file_name
    plt.savefig(path)
    plt.show()


In [4]:
def average_results(version, model='gpt4'):
    """Average all '$'-separated result CSVs of one version per ``comb_id``.

    Every ``*.csv`` file in ``../../results/<model>/evaluation_results/`` whose
    name contains ``_v<version>`` (not followed by another digit) is
    concatenated row-wise. Numeric columns are averaged per ``comb_id``; for
    non-numeric columns the first non-null value of each group is kept. The
    columns are put back in the order of the first file read and the result is
    written next to the inputs as
    ``test_results_codereval_AVERAGED_v<version>.csv`` with '$' as separator.

    :param version: Version number (or string) used in the file names.
    :param model: Results sub-folder to read from and write to.
    :return: None.
    """
    directory = f'../../results/{model}/evaluation_results/'
    output_name = f'test_results_codereval_AVERAGED_v{version}.csv'
    version_tag = '_v' + str(version)

    def _has_version_tag(filename):
        # '_v1' must not match '_v14': the tag may not be followed by a digit.
        start = filename.find(version_tag)
        while start != -1:
            end = start + len(version_tag)
            if end >= len(filename) or not filename[end].isdigit():
                return True
            start = filename.find(version_tag, start + 1)
        return False

    dataframes = []
    column_order = None

    # Sorted so that "first file" and "first non-null value" are reproducible.
    for filename in sorted(os.listdir(directory)):
        # The output of a previous run matches the pattern; never re-average it.
        if filename == output_name:
            continue
        if _has_version_tag(filename) and filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path, delimiter='$')
            dataframes.append(df)
            if column_order is None:
                column_order = df.columns.tolist()  # Save the column order from the first DataFrame

    if not dataframes:
        print(f"No result files found for version {version} in {directory}")
        return

    concatenated_df = pd.concat(dataframes, ignore_index=True)  # row-wise concatenation

    # Keep the grouping key out of both column sets; otherwise reset_index()
    # would try to insert 'comb_id' twice.
    numeric_columns = concatenated_df.select_dtypes(include='number').columns.difference(['comb_id'], sort=False)
    non_numeric_columns = concatenated_df.select_dtypes(exclude='number').columns.difference(['comb_id'], sort=False)

    grouped = concatenated_df.groupby('comb_id')
    averaged_numeric_df = grouped[list(numeric_columns)].mean().reset_index()
    # GroupBy.first() skips nulls, i.e. it yields the first non-null value per group.
    averaged_non_numeric_df = grouped[list(non_numeric_columns)].first().reset_index()

    averaged_df = pd.merge(averaged_numeric_df, averaged_non_numeric_df, on='comb_id')
    averaged_df = averaged_df[column_order]

    averaged_df.to_csv(os.path.join(directory, output_name), index=False, sep='$')
    print("Averaged CSV file created successfully.")




In [5]:
import math
def average_dict(dict_to_average):
    """Replace every list of numbers in a dict by its summary statistics, in place.

    Each value becomes ``{'median', 'mean', 'std_dev', 'data'}`` where
    ``std_dev`` is the population standard deviation and ``data`` is the
    original list.

    :param dict_to_average: Mapping of key to a list of numbers. Entries that
        were already summarised by this function are recomputed from their
        ``'data'`` list, so calling the function twice is harmless.
    :return: The same dictionary object, now holding the summaries.
    """
    for key, value in dict_to_average.items():
        # Re-running the cell must not crash on already summarised entries.
        samples = value['data'] if isinstance(value, dict) and 'data' in value else value
        count = len(samples)
        if count == 0:
            # No samples: report NaN instead of dividing by zero.
            mean = std_dev = median = float('nan')
        else:
            mean = sum(samples) / count
            variance = sum((x - mean) ** 2 for x in samples) / count
            std_dev = math.sqrt(variance)
            median = np.median(samples)
        # Only values of existing keys are replaced, so iterating is safe.
        dict_to_average[key] = {
            'median': median,
            'mean': mean,
            'std_dev': std_dev,
            'data': samples
        }
    return dict_to_average

In [6]:
import subprocess
import tempfile
import os

def check_lints(code_string, code_type="Original"):
    """
    Lint a Python code string with pylint and return the filtered report.

    The code is dedented, written to a temporary ``.py`` file and passed to
    the ``pylint`` executable. Report lines mentioning C0114, C0116 or C0415
    are dropped.

    :param code_string: The Python code as a string.
    :param code_type: Type of code ("Original" or "Generated") for display purposes.
    :return: The remaining pylint output (first output line skipped) as one
        string, "No linting issues found." when nothing remains, or a timeout
        notice when pylint ran longer than 60 seconds.
    """
    ignored_codes = ('C0114', 'C0116', 'C0415')

    print(f"\nLinting {code_type} code:")
    # dedent the code string
    code_string = textwrap.dedent(code_string)
    # Create a temporary file; it is closed before pylint opens it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".py") as temp_file:
        temp_file.write(code_string.encode('utf-8'))
        temp_file_path = temp_file.name

    # Default result; also covers pylint printing nothing to stdout.
    lint_res = "No linting issues found."
    try:
        # Run pylint on the temporary file and capture both stdout and stderr
        result = subprocess.run(
            ["pylint", temp_file_path, "--output-format=text"],  # Force plain text output
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.stdout:
            list_of_lints = result.stdout.split('\n')[1:]  # skip the first output line
            print("STDOUT:")
            print(list_of_lints)

            # Build a new list instead of removing while iterating, which
            # silently skips the element after every removed one.
            kept_lints = [
                lint for lint in list_of_lints
                if not any(code in lint for code in ignored_codes)
            ]
            filtered_report = '\n'.join(kept_lints)
            if filtered_report.strip():
                lint_res = filtered_report
        if result.stderr:
            print("STDERR:")
            print(result.stderr)

        # pylint signals findings through a non-zero exit status
        if result.returncode != 0:
            print(f"\nLinting issues found in {code_type} code.")
        else:
            print(f"\nNo linting issues found in {code_type} code.")
    except subprocess.TimeoutExpired:
        lint_res = "Linting took too long and was terminated."

    finally:
        # Clean up the temporary file
        os.remove(temp_file_path)
    return lint_res

In [7]:
def cyclomatic_complexity(code_string):
    """Measure the cyclomatic complexity of a code string with ``radon cc -s``.

    The code is dedented, written to a temporary ``.py`` file and analysed by
    the ``radon`` executable. The score is taken from the first output line
    that ends in a parenthesised integer, e.g. ``F 1:0 f - A (1)``.

    :param code_string: The Python code as a string.
    :return: The score of the first reported block as a string of digits, or
        ``None`` when radon reports no score (no output, an error message, or
        a run longer than 60 seconds).
    """
    # dedent the code string
    code_string = textwrap.dedent(code_string)
    # Write the code string to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".py") as temp_file:
        temp_file.write(code_string.encode('utf-8'))
        temp_file_path = temp_file.name

    complexity = None
    try:
        # Run radon via subprocess and capture the output
        result = subprocess.run(
            ["radon", "cc", temp_file_path, "-s"],
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.stdout:
            for line in result.stdout.splitlines():
                text = line.strip()
                if not (text.endswith(')') and '(' in text):
                    continue
                candidate = text[text.rfind('(') + 1:-1]
                # Error lines such as "ERROR: invalid syntax (<unknown>, line 1)"
                # also end in parentheses; only an integer is a score.
                if candidate.isdigit():
                    complexity = candidate
                    break
            print("Cyclomatic complexity:", result.stdout)
        if result.stderr:
            print("Error:", result.stderr)
    except subprocess.TimeoutExpired:
        print("Error: radon took too long and was terminated.")

    finally:
        # Clean up the temporary file
        os.remove(temp_file_path)
    return complexity

In [None]:
# %pip install cognitive_complexity
from cognitive_complexity.api import get_cognitive_complexity
import ast
# %pip install complexipy
from complexipy import code_complexity

def cognitive_complexity(code_string):
    """Compute the cognitive complexity of a code snippet.

    The dedented snippet is parsed and its first top-level statement is handed
    to ``get_cognitive_complexity``. If anything in that step raises, the
    dedented text is passed to complexipy's ``code_complexity`` instead.

    :param code_string: Python source text, possibly uniformly indented.
    :return: The complexity reported by whichever tool succeeded, or ``None``
        when both attempts raise.
    """
    source = textwrap.dedent(code_string)

    # First attempt: AST-based measurement of the first statement.
    try:
        first_statement = ast.parse(source).body[0]
        return get_cognitive_complexity(first_statement)
    except Exception:
        print("Could not parse, trying with complexipy")

    # Fallback: let complexipy work on the raw text.
    try:
        return code_complexity(source).complexity
    except Exception as error:
        print("Could not be parsed:", error)
        return None
        

# Ad-hoc example call: the snippet below is uniformly indented, so it only
# parses because cognitive_complexity() dedents its input first. The bare
# expression on the last line makes the result the cell output.
code ="""
    def f(self, a):
        return a * f(a - x)  # +1 for recursion
    """

cognitive_complexity(code)

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the raw samples of every key as a violin plot.
def plot_dict_violin(dictionary, title, xlabel, ylabel, width=15, height=7, model='gpt4'):
    """Draw one violin per dictionary key from its raw samples.

    :param dictionary: Mapping of label to a dict holding the raw samples
        under ``'values'`` or, as produced by ``average_dict``, under ``'data'``.
    :param title: Plot title; also used to derive the PNG file name.
    :param xlabel: Label of the x axis.
    :param ylabel: Label of the y axis.
    :param width: Figure width passed to ``plt.figure``.
    :param height: Figure height passed to ``plt.figure``.
    :param model: Results sub-folder the PNG is written to
        (``../../results/<model>/plots/``).
    :return: None. The figure is saved and shown.
    :raises KeyError: If an entry has neither ``'values'`` nor ``'data'``.
    """
    keys = list(dictionary.keys())
    # Prefer 'values'; fall back to the 'data' key that average_dict() writes.
    data = [
        dictionary[key]['values'] if 'values' in dictionary[key] else dictionary[key]['data']
        for key in keys
    ]

    # Create a figure and set the size
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=90)

    # Flatten into parallel label/value lists, the long format seaborn expects.
    all_data = []
    labels = []
    for key, values in zip(keys, data):
        all_data.extend(values)
        labels.extend([key] * len(values))

    sns.violinplot(x=labels, y=all_data)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Save the plot
    file_name = title.replace(' ', '_').replace('/', '_') + '.png'
    path = '../../results/' + model + '/plots/' + file_name
    plt.savefig(path)
    plt.show()


In [96]:
def get_error_type(error_message):
    """Classify a test error message by the first known marker it contains.

    Markers are checked in order, so earlier entries win when a message
    contains several of them.

    :param error_message: Error text produced by a failed test run.
    :return: Name of the matching error category, or ``"Error"`` when no
        marker is found.
    """
    # (marker, category) pairs in precedence order.
    markers = (
        ("isT is not True", "AssertionError"),
        ("Result is not True", "AssertionError"),
        ("Result not True", "AssertionError"),
        ("Module not found", "ModuleNotFoundError"),
        ("KeyError", "KeyError"),
        ("NameError", "NameError"),
        ("SyntaxError", "SyntaxError"),
        ("TypeError", "TypeError"),
        ("ValueError", "ValueError"),
        ("IndexError", "IndexError"),
        ("AttributeError", "AttributeError"),
        ("FileNotFoundError", "FileNotFoundError"),
        ("ImportError", "ImportError"),
        ("AssertionError", "AssertionError"),
        ("has no attribute", "AttributeError"),
        # Both operands need their own `in` test; a bare non-empty string is
        # always truthy and made the final "Error" fallback unreachable.
        ("cannot import name", "ImportError"),
        ("cannot import module", "ImportError"),
        # Checked last so that every message matching a marker above keeps
        # its category. Without this entry such messages would now fall
        # through to "Error".
        ("ModuleNotFoundError", "ModuleNotFoundError"),
    )
    for marker, error_type in markers:
        if marker in error_message:
            return error_type
    return "Error"

In [53]:
import Levenshtein
# %pip install scikit-learn

def levenshtein_distance(str1, str2):
    """Return ``Levenshtein.distance`` between ``str1`` and ``str2``."""
    edit_distance = Levenshtein.distance(str1, str2)
    return edit_distance

def get_jaccard_similarity_1gram(str1, str2):
    """Jaccard index of the whitespace-separated token sets of two strings.

    :param str1: First text.
    :param str2: Second text.
    :return: ``|A & B| / |A | B|``, or 0 when neither string contains a token
        (same convention as ``get_jaccard_similarity``).
    """
    set1 = set(str1.split())
    set2 = set(str2.split())
    union = set1.union(set2)
    # Two empty inputs would otherwise divide by zero.
    return len(set1.intersection(set2)) / len(union) if union else 0

def generate_ngrams(text, n):
    """
    Build the set of all length-``n`` slices of ``text``.

    An input shorter than ``n`` yields an empty set.
    """
    grams = set()
    last_start = len(text) - n
    for start in range(last_start + 1):
        grams.add(text[start:start + n])
    return grams

def get_jaccard_similarity(string1, string2, n=3):
    """
    Jaccard index of the n-gram sets of two strings.

    Both inputs are turned into n-gram sets with ``generate_ngrams``; the
    result is the size of their intersection divided by the size of their
    union, or 0 when both sets are empty.
    """
    grams_a = generate_ngrams(string1, n)
    grams_b = generate_ngrams(string2, n)

    combined = grams_a | grams_b
    if not combined:
        return 0

    shared = grams_a & grams_b
    return len(shared) / len(combined)



def cosine_similarity(str1, str2):
    """Cosine similarity of the binary token-set vectors of two strings.

    :param str1: First text.
    :param str2: Second text.
    :return: ``|A & B| / sqrt(|A| * |B|)`` over whitespace-separated tokens,
        or 0.0 when either string contains no token.
    """
    set1 = set(str1.split())
    set2 = set(str2.split())
    # An empty token set makes the denominator zero.
    if not set1 or not set2:
        return 0.0
    return len(set1.intersection(set2)) / (len(set1) * len(set2))**0.5

def compute_distance(str1, str2, method="Levenshtein"):
    """Compute the distance between two strings with the named method.

    :param str1: First text.
    :param str2: Second text.
    :param method: Name of the distance; only "levenshtein" is supported and
        the name is matched case-insensitively so the default is accepted.
    :return: The value returned by ``levenshtein_distance``.
    :raises ValueError: If ``method`` names an unsupported distance.
    """
    if str(method).lower() == "levenshtein":
        return levenshtein_distance(str1, str2)
    raise ValueError("Invalid similarity method")

In [17]:
import ast

class ASTNodeCounter(ast.NodeVisitor):
    """Visitor that records the class name of every AST node it traverses."""

    def __init__(self):
        # Node type names in visiting order.
        self.nodes = []

    def generic_visit(self, node):
        """Record the node's type name, then continue into its children."""
        node_type_name = node.__class__.__name__
        self.nodes.append(node_type_name)
        ast.NodeVisitor.generic_visit(self, node)

def get_ast_from_code(code_string):
    """Parse a code string and return its AST.

    The text is dedented first, so uniformly indented snippets (for example a
    method copied out of a class body) parse instead of raising
    ``IndentationError``.

    :param code_string: Python source text.
    :return: The ``ast.Module`` produced by ``ast.parse``.
    :raises SyntaxError: If the dedented text is not valid Python.
    """
    return ast.parse(textwrap.dedent(code_string))

def get_ast_node_list(tree):
    """Return the type names of all nodes in ``tree``, in ``ASTNodeCounter`` visiting order."""
    collector = ASTNodeCounter()
    collector.visit(tree)
    collected_names = collector.nodes
    return collected_names

def compute_similarity(nodes1, nodes2):
    """Jaccard index of the distinct items of two node-name lists.

    Duplicates are ignored: only the sets of names are compared. Two empty
    inputs count as identical and give 1.
    """
    distinct_a = set(nodes1)
    distinct_b = set(nodes2)

    every_name = distinct_a | distinct_b
    if not every_name:
        return 1

    shared_names = distinct_a & distinct_b
    return len(shared_names) / len(every_name)

def detect_ast_similarity(code1, code2):
    """Score two code snippets by the overlap of the AST node types they use.

    Each snippet is parsed with ``get_ast_from_code``, reduced to its list of
    node type names with ``get_ast_node_list``, and the two lists are compared
    with ``compute_similarity``.
    """
    node_lists = []
    for snippet in (code1, code2):
        parsed = get_ast_from_code(snippet)
        node_lists.append(get_ast_node_list(parsed))

    first_nodes, second_nodes = node_lists
    return compute_similarity(first_nodes, second_nodes)

from codebleu.codebleu import calc_codebleu


def compute_code_bleu(groundtruth_code, generated_code):
    """
    Score generated code against the ground truth with ``calc_codebleu``.

    The first line of each input is dropped before scoring. The scorer
    returns a dict shaped like
        {
        'codebleu': 0.5537,
        'ngram_match_score': 0.1041,
        'weighted_ngram_match_score': 0.1109,
        'syntax_match_score': 1.0,
        'dataflow_match_score': 1.0
        }
    of which three entries are returned as a tuple:
    ``(codebleu, syntax_match_score, dataflow_match_score)``.

    An input without a newline raises ``IndexError``.
    """
    # Everything after the first newline; the first line is treated as the signature.
    reference_body = groundtruth_code.split('\n', 1)[1]
    candidate_body = generated_code.split('\n', 1)[1]

    scores = calc_codebleu(
        [reference_body],
        [candidate_body],
        lang="python",
        weights=(0.25, 0.25, 0.25, 0.25),
        tokenizer=None,
    )

    return scores['codebleu'], scores['syntax_match_score'], scores['dataflow_match_score']



In [18]:
def plot_grouped_similarities_old(dict_of_similarities, title="", xlabel="", ylabel="", n_cols = 4 , width=25, height=5, model='gpt4', color_shades='Oranges'):
    """Draw one 40-bin histogram per combination on a grid of subplots.

    :param dict_of_similarities: Mapping of combination label to a list of scores.
    :param title: Prefix put in front of the combination label in each subplot title.
    :param xlabel: Label of every x axis.
    :param ylabel: Label of every y axis.
    :param n_cols: Number of subplot columns.
    :param width: Total figure width.
    :param height: Height of one subplot row.
    :param model: Not used by this function.
    :param color_shades: Name of the matplotlib colormap used to shade the bins.
    :return: None. The figure is shown.
    """
    n_combinations = len(dict_of_similarities)
    if n_combinations == 0:
        print("Nothing to plot: no similarity scores were provided.")
        return

    # Ceiling division: enough rows for every combination.
    n_rows = (n_combinations + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array, so flatten() also works for 1x1.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(width, height * n_rows), squeeze=False)
    axes = axes.flatten()

    cmap = plt.get_cmap(color_shades)

    for idx, (combination, code_bleus) in enumerate(dict_of_similarities.items()):
        _, _, patches = axes[idx].hist(code_bleus, bins=40, histtype='bar', rwidth=0.8)
        bin_positions = np.linspace(0, 1, len(patches))

        for patch, color_value in zip(patches, bin_positions):
            patch.set_facecolor(cmap(color_value))  # Assign color based on bin position

        axes[idx].set_title(f"{title}{combination}")
        axes[idx].set_xlabel(xlabel)
        axes[idx].set_ylabel(ylabel)

    # Remove the unused cells of the last row.
    for unused_ax in axes[n_combinations:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


def plot_grouped_similarities(dict_ranges_similarities, title="", xlabel="", ylabel="", n_cols = 4 , width=25, height=5, model='gpt4', color_shades='Oranges'):
    """Draw one bar chart of bucket sizes per combination on a grid of subplots.

    Expected input shape:
    ``{"combination1": {"0-10%": [...], "10-20%": [...]}, "combination2": {...}}``.
    Each bar shows how many scores fell into a bucket; all subplots share the
    same y range so they can be compared.

    :param dict_ranges_similarities: Mapping of combination label to a mapping
        of bucket label to the list of scores in that bucket.
    :param title: Prefix put in front of the combination label in each subplot title.
    :param xlabel: Label of every x axis.
    :param ylabel: Label of every y axis.
    :param n_cols: Number of subplot columns.
    :param width: Total figure width.
    :param height: Height of one subplot row.
    :param model: Not used by this function.
    :param color_shades: Name of the matplotlib colormap used to shade the bars.
    :return: None. The figure is shown.
    """
    n_combinations = len(dict_ranges_similarities)
    if n_combinations == 0:
        print("Nothing to plot: no similarity scores were provided.")
        return

    # Ceiling division: enough rows for every combination.
    n_rows = (n_combinations + n_cols - 1) // n_cols

    # Largest bucket over all combinations, used as the common y limit.
    max_y_value = 0
    for dict_ranges in dict_ranges_similarities.values():
        bucket_sizes = [len(scores) for scores in dict_ranges.values()]
        max_y_value = max(max_y_value, max(bucket_sizes, default=0))

    # squeeze=False always yields a 2-D array, so flatten() also works for 1x1.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(width, height * n_rows), squeeze=False)
    axes = axes.flatten()

    cmap = plt.get_cmap(color_shades)

    for idx, (combination, dict_ranges) in enumerate(dict_ranges_similarities.items()):
        x = list(dict_ranges.keys())
        y = [len(dict_ranges[key]) for key in x]

        bars = axes[idx].bar(x, y, color=cmap(np.linspace(0.3, 1, len(x))))

        axes[idx].set_title(f"{title}{combination}")
        axes[idx].set_xlabel(xlabel)
        axes[idx].set_ylabel(ylabel)
        # Identical lower and upper limits only trigger a matplotlib warning.
        if max_y_value > 0:
            axes[idx].set_ylim(0, max_y_value)
        axes[idx].bar_label(bars, labels=[f'{val}' for val in y], label_type='edge')

    # Remove the unused cells of the last row.
    for unused_ax in axes[n_combinations:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


In [19]:
def add_score_to_per_tech_dict(per_tech_dict, score, prompt_technique):
    """File ``score`` into the first bucket whose upper bound it does not exceed.

    The entry for ``prompt_technique`` is created with four empty buckets on
    first use. A ``None`` score, or a score above 1.0, is not stored.

    :return: ``per_tech_dict`` (modified in place).
    """
    # (bucket label, inclusive upper bound) in ascending order.
    buckets = (
        ("0 - 10%", 0.1),
        ("10 - 25%", 0.25),
        ("25 - 50%", 0.5),
        ("50 - 100%", 1.0),
    )

    if prompt_technique not in per_tech_dict:
        per_tech_dict[prompt_technique] = {label: [] for label, _ in buckets}

    if score is None:
        return per_tech_dict

    matching_label = next((label for label, upper in buckets if score <= upper), None)
    if matching_label is not None:
        per_tech_dict[prompt_technique][matching_label].append(score)

    return per_tech_dict

In [20]:
def get_error_patterns():
    """Return a fresh mapping of error-message marker to error category."""
    return dict([
        ("isT is not True", "AssertionError"),
        ("Result is not True", "AssertionError"),
        ("Module not found", "ModuleNotFoundError"),
        ("KeyError", "KeyError"),
        ("NameError", "NameError"),
        ("SyntaxError", "SyntaxError"),
        ("TypeError", "TypeError"),
        ("ValueError", "ValueError"),
        ("IndexError", "IndexError"),
        ("AttributeError", "AttributeError"),
        ("FileNotFoundError", "FileNotFoundError"),
        ("ImportError", "ImportError"),
        ("has no attribute", "AttributeError"),
        ("Result not True!!!", "AssertionError"),
        ("cannot import name", "ImportError"),
        ("cannot import module", "ImportError"),
    ])
    

In [21]:
def increment_value_in_dict(dictionary, key):
    """Add one to the counter stored under ``key``, starting at 1 for a new key."""
    dictionary[key] = dictionary.get(key, 0) + 1

In [None]:
# Run configuration: which result version and model folder to analyse, and
# whether to build and load the averaged results instead of a single run.
version = 14
model = "mistral"
can_average = False

results_dir = "../../results/" + model + "/evaluation_results/"
if can_average:
    data_path = results_dir + "test_results_codereval_AVERAGED_v" + str(version) + ".csv"
    # NOTE(review): called without a model, so average_results works on the
    # 'gpt4' results folder while data_path points at `model` - confirm.
    average_results(version)
    # The averaged file is written with '$' as separator, so read it the same way.
    data = read_csv(data_path, delimiter='$')
else:
    data_path = results_dir + "evaluation_codereval_v" + str(version) + ".csv"
    data = read_csv(data_path, delimiter=',')

# read_csv() returns None for a missing file; stop here with a clear error
# instead of failing on the first column access below.
if data is None:
    raise FileNotFoundError(data_path)

# Rows without an error message are treated as invalid generated code, and
# rows without a test result are counted as failed.
data['error_message'] = data['error_message'].fillna("SyntaxError: Invalid generated code")
data['test_result'] = data['test_result'].fillna("Failed")

# Number of failed rows (cell output).
x = data[data['test_result'] == 'Failed']
len(x)


In [None]:
# Re-compute the Jaccard similarity for an existing analysis file.
# NOTE(review): the file read here is the one written by the later
# "write_csv(data, output_analysis_file)" cell, so on a fresh kernel this cell
# only works if that file already exists from a previous run.
analysis_file = data_path.replace('.csv', '_analysis.csv')

# read in dataframe
df = pd.read_csv(analysis_file, delimiter=',')
# for each row, calculate the similarity between the groundtruth and generated code
for i, row in df.iterrows():
    if pd.isna(row["test_result"]):
        print("Skipping row with empty test result for id: ", row['task_id'])
        continue

    groundtruth_code = row['groundtruth_code']
    generated_code = row['generated_code']
    # Empty CSV cells are read back as NaN (a float), which cannot be split.
    if not isinstance(groundtruth_code, str) or not isinstance(generated_code, str):
        print("Skipping row with missing code for id: ", row['task_id'])
        continue

    # Remove the signature from both codes: keep what follows the first ':'.
    # [-1] keeps the whole text when there is no ':' instead of raising.
    # NOTE(review): a signature with type annotations or default dicts
    # contains earlier ':' characters, so the cut can land inside it.
    groundtruth_code = groundtruth_code.split(':', 1)[-1]
    generated_code = generated_code.split(':', 1)[-1]

    jaccard_similarity = get_jaccard_similarity(groundtruth_code, generated_code)
    df.at[i, 'jaccard_similarity'] = jaccard_similarity


In [72]:
# Write the updated frame next to the analysis file, with a "_trial" suffix.
new_analysis_file = analysis_file.replace('.csv', '_trial.csv')
df.to_csv(new_analysis_file, sep=',', index=False)


In [None]:
# Scratch check of the "drop the signature" step: two functions that differ
# only in their name are cut at the first ':' and the remainders are compared.
# Note that this rebinds the name `x`.
x = """
def xy (one,
       two,
       three):
    return one + two + three
""".split(':', 1)[1]

y = """
def x (one,
       two,
       three):
    return one + two + three
""".split(':', 1)[1]

# Show both remainders and their n-gram Jaccard similarity.
print(y)
print(x)
print(get_jaccard_similarity(x, y))

In [81]:
%%capture cap

# Accumulators filled while walking over the evaluation rows.
errors_types = {}                 # error type -> number of failed rows
stats_per_task = {}               # function name -> number of failed rows
stats_per_tech = {}               # prompt technique -> number of failed rows
error_messages = []               # raw messages of all failed rows
coverage_per_tech = {}
stmts_coverage_per_tech = {}
lexical_distance_per_tech = {}    # prompt technique -> syntax similarity scores
assert_failed_per_tech = {}
ast_similarity_per_tech = {}      # prompt technique -> flow similarity scores
semantic_similarity_per_tech = {}
code_bleu_per_tech = {}           # prompt technique -> CodeBLEU scores
test_output_per_comb = {}         # combination id -> {"Passed" / error type: count}
overall_similarity_histo = {}
flow_similarity_histo = {}
syntax_similarity_histo = {}


# Result columns written back into the frame row by row.
data["error_type"] = ""
data["codebleu"] = ""
data["syntax_similarity"] = ""
data["flow_similarity"] = ""
data["jaccard_similarity"] = ""
data["lint_generated"] = ""
data["lint_groundtruth"] = ""
data["cyclo_complexity_generated"] = ""
data["cyclo_complexity_groundtruth"] = ""


for index, row in data.iterrows():
    task_id, comb_text, prompt, project, file_path, groundtruth_code, tests, packages, function_name, test_name, level, test_file_path, original_prompt_length, is_zero_shot, is_few_shot, is_chain_of_thought, is_persona, is_packages, is_signature, zero_shot_prompt, test_result, error_message, prompt_technique, generated_code, complete_response = row["task_id"], row["combination_id"], row["prompt"], row["project"], row["file_path"], row["groundtruth_code"], row["tests"], row["packages"], row["function_name"], row["test_name"], row["level"], row["test_file_path"], row["original_prompt_length"], row["is_zero_shot"], row["is_few_shot"], row["is_chain_of_thought"], row["is_persona"], row["is_packages"], row["is_signature"], row["zero_shot_prompt"], row["test_result"], row["error_message"], row["prompt_techniques_applied"], row["generated_code"], row["complete_response"]
    if test_result == "":
        print(f"Skipping task {task_id} with combination {comb_text}: Generated code could not be extracted or compiled.")
        continue

    if generated_code == "":
        print("Empty generated code for task: ", task_id)
        continue

    # Per-row results; reset so nothing leaks over from the previous row.
    error_type = ""
    normalized_error_message = ""
    code_bleu = None
    syntax_similarity = None
    flow_similarity = None
    jaccard_similarity = None
    cyclo_complexity_generated = None
    cyclo_complexity_groundtruth = None
    cognitive_complexity_generated = None
    cognitive_complexity_groundtruth = None
    lint_generated = None
    lint_groundtruth = None

    if test_result == "Failed":
        error_messages.append(error_message)
        print("Error message: ", error_message)
        error_type = get_error_type(error_message)

        # Normalise to "<type>: <text after the last '<type>:'>".
        message_parts = error_message.split(error_type + ":")
        if len(message_parts) > 1:
            normalized_error_message = error_type + ": " + message_parts[-1].strip()
        elif error_type == "AssertionError":
            normalized_error_message = "AssertionError: Result is not True"
        else:
            normalized_error_message = error_type + ": " + error_message

        # most common error type
        increment_value_in_dict(errors_types, error_type)

        # the task that failed the most
        function_name = extract_function_name(groundtruth_code)
        increment_value_in_dict(stats_per_task, function_name)

        # the combination with the most failed tests
        increment_value_in_dict(stats_per_tech, prompt_technique)

        increment_value_in_dict(test_output_per_comb.setdefault(comb_text, {}), error_type)

    if test_result == "Passed":
        increment_value_in_dict(test_output_per_comb.setdefault(comb_text, {}), test_result)

        # Linting and complexity are only measured for code that passed its tests.
        lint_generated = check_lints(generated_code, code_type="Generated")
        lint_groundtruth = check_lints(groundtruth_code, code_type="Groundtruth")

        cyclo_complexity_generated = cyclomatic_complexity(generated_code)
        cyclo_complexity_groundtruth = cyclomatic_complexity(groundtruth_code)

        cognitive_complexity_generated = cognitive_complexity(generated_code)
        cognitive_complexity_groundtruth = cognitive_complexity(groundtruth_code)

    # Similarity measures. A failure for one row must not stop the loop, but
    # only ordinary exceptions are swallowed so the cell can still be interrupted.
    try:
        jaccard_similarity = get_jaccard_similarity(groundtruth_code, generated_code)
    except Exception:
        print("Jaccard failed for the codes: ", groundtruth_code, generated_code)
        jaccard_similarity = None

    try:
        code_bleu, syntax_similarity, flow_similarity = compute_code_bleu(groundtruth_code, generated_code)
    except Exception:
        print("CodeBLEU failed for the codes: ", groundtruth_code, generated_code)
        code_bleu = None
        syntax_similarity = None
        flow_similarity = None

    if generated_code is not None and generated_code != "null" and code_bleu is not None:

        overall_similarity_histo = add_score_to_per_tech_dict(overall_similarity_histo, code_bleu, prompt_technique)
        syntax_similarity_histo = add_score_to_per_tech_dict(syntax_similarity_histo, syntax_similarity, prompt_technique)
        flow_similarity_histo = add_score_to_per_tech_dict(flow_similarity_histo, flow_similarity, prompt_technique)

        code_bleu_per_tech.setdefault(prompt_technique, []).append(float(code_bleu))
        lexical_distance_per_tech.setdefault(prompt_technique, []).append(float(syntax_similarity))
        ast_similarity_per_tech.setdefault(prompt_technique, []).append(float(flow_similarity))

    # Write the per-row results back into the frame.
    data.at[index, 'error_type'] = error_type
    data.at[index, 'normalized_error_message'] = normalized_error_message
    data.at[index, 'codebleu'] = code_bleu
    data.at[index, 'syntax_similarity'] = syntax_similarity
    data.at[index, 'flow_similarity'] = flow_similarity
    data.at[index, 'jaccard_similarity'] = jaccard_similarity
    data.at[index, 'lint_generated'] = lint_generated
    data.at[index, 'lint_groundtruth'] = lint_groundtruth
    data.at[index, 'cyclo_complexity_generated'] = cyclo_complexity_generated
    data.at[index, 'cyclo_complexity_groundtruth'] = cyclo_complexity_groundtruth
    data.at[index, 'cognitive_complexity_generated'] = cognitive_complexity_generated
    data.at[index, 'cognitive_complexity_groundtruth'] = cognitive_complexity_groundtruth
    



In [82]:
# Persist `cap.stdout` to a per-version text file.
# (presumably `cap` is the %%capture object of the evaluation cell — confirm)
# The encoding is given explicitly so that non-ASCII characters in the captured
# text cannot fail under a platform-default codec.
with open(f'eval_output_{version}.txt', 'w', encoding='utf-8') as eval_log:
    eval_log.write(cap.stdout)

In [83]:

# Write `data` (including the analysis columns added by the evaluation loop)
# to "<data_path without extension>_analysis<extension>".
# os.path.splitext is used instead of str.replace('.csv', ...) so that the
# output path always differs from data_path (the source file is never
# overwritten when the extension is missing or spelled differently) and a
# '.csv' occurring inside a directory name is left alone.
analysis_root, analysis_ext = os.path.splitext(data_path)
output_analysis_file = f"{analysis_root}_analysis{analysis_ext}"
write_csv(data, output_analysis_file)

In [None]:

# Call plot_grouped_similarities once per similarity measure. Each entry holds
# the histogram data, the title prefix, the x-axis label and any extra keyword
# arguments (the first call passes no color_shades, as before).
similarity_plot_specs = [
    (overall_similarity_histo, "Code BLEU for ", "Code BLEU", {}),
    (syntax_similarity_histo, "Syntax similarity for ", "Syntax similarity", {'color_shades': 'Blues'}),
    (flow_similarity_histo, "Flow similarity for ", "Flow similarity", {'color_shades': 'Purples'}),
]
for histo, title_prefix, axis_label, extra_kwargs in similarity_plot_specs:
    plot_grouped_similarities(histo, title=title_prefix, xlabel=axis_label, ylabel="Frequency", model=model, **extra_kwargs)


# plot_grouped_similarities_old(code_bleu_per_tech, title="Code BLEU for " , xlabel="Code BLEU", ylabel="Frequency", model=model)
# plot_grouped_similarities_old(lexical_distance_per_tech, title="Syntax similarity for " , xlabel="Syntax similarity", ylabel="Frequency", model=model, color_shades='Blues')
# plot_grouped_similarities_old(ast_similarity_per_tech, title="Flow similarity for " , xlabel="Flow similarity", ylabel="Frequency", model=model, color_shades='Purples')


# Grouped bar chart: for every prompt technique, draw the mean of
# lexical_distance_per_tech, ast_similarity_per_tech and code_bleu_per_tech
# side by side, each with standard-deviation error bars.
def _mean_std_error_bars(scores_per_tech, techniques):
    """Return per-technique means, standard deviations and asymmetric error bars.

    Args:
        scores_per_tech: mapping from technique to a sequence of numeric scores.
        techniques: keys of ``scores_per_tech`` to summarise, in plot order.

    Returns:
        A ``(means, std_devs, error_bars)`` tuple. ``error_bars`` is
        ``[lower, upper]``, the two-row layout passed to ``yerr``. Each lower
        error is capped at the mean so that, for non-negative means, the
        error bar does not extend below zero.
    """
    means = [np.mean(scores_per_tech[tech]) for tech in techniques]
    std_devs = [np.std(scores_per_tech[tech]) for tech in techniques]
    lower_errors = [min(mean, std_dev) for mean, std_dev in zip(means, std_devs)]
    return means, std_devs, [lower_errors, std_devs]


plt.figure(figsize=(20, 7))
plt.xticks(rotation=90)
width = 0.25
keys = list(lexical_distance_per_tech.keys())

means_lexical, std_devs_lexical, error_bars_lexical = _mean_std_error_bars(lexical_distance_per_tech, keys)
means_ast, std_devs_ast, error_bars_ast = _mean_std_error_bars(ast_similarity_per_tech, keys)
means_semantic, std_devs_semantic, error_bars_semantic = _mean_std_error_bars(code_bleu_per_tech, keys)

# Keep the individual bounds available under their original names.
lower_errors_lexical, upper_errors_lexical = error_bars_lexical
lower_errors_ast, upper_errors_ast = error_bars_ast
lower_errors_semantic, upper_errors_semantic = error_bars_semantic

# Create bar plot with error bars: one group per technique, three bars per group.
# NOTE(review): the evaluation loop fills lexical_distance_per_tech from
# syntax_similarity and ast_similarity_per_tech from flow_similarity, so the
# 'Lexical Distance' / 'AST Similarity' legend labels may not name the metric
# that is actually plotted — confirm.
positions = np.arange(len(keys))
plt.bar(positions - width, means_lexical, yerr=error_bars_lexical, width=width, label='Lexical Distance', color='blue')
plt.bar(positions, means_ast, yerr=error_bars_ast, width=width, label='AST Similarity', color='orange')
plt.bar(positions + width, means_semantic, yerr=error_bars_semantic, width=width, label='Code Bleu', color='green')

# Display mean values on top of each bar
for i, (mean_lexical, mean_ast, mean_semantic) in enumerate(zip(means_lexical, means_ast, means_semantic)):
    plt.text(i - width, mean_lexical, f'{mean_lexical:.2f}', ha='center', va='bottom', color='black')
    plt.text(i, mean_ast, f'{mean_ast:.2f}', ha='center', va='bottom', color='black')
    plt.text(i + width, mean_semantic, f'{mean_semantic:.2f}', ha='center', va='bottom', color='black')

# Set up legend, title, and labels
plt.legend()
plt.title("Similarity Measures per Prompt Technique")
plt.xlabel("Prompt Technique")
plt.ylabel("Similarity")
plt.xticks(range(len(keys)), keys)

# Save and show the plot
file_name = "Similarity_Measures_per_Prompt_Technique.png"
path = "../../results/" + model + "/plots/" + file_name
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(path), exist_ok=True)
plt.savefig(path)
plt.show()



# Average each per-technique score collection and plot the averages.
# The averages are stored under their own names instead of overwriting the
# *_per_tech dictionaries: the grouped bar chart earlier in this cell reads the
# original dictionaries with np.mean/np.std, and rebinding them would make the
# cell non-repeatable (a second run would feed the already-averaged result
# back into average_dict).
# NOTE(review): the evaluation loop fills lexical_distance_per_tech from
# syntax_similarity, not from jaccard_similarity, so the "Avg. Jaccard
# distance" axis label may not match the plotted values — confirm.
avg_lexical_per_tech = average_dict(lexical_distance_per_tech)
plot_dict(avg_lexical_per_tech, "Avg. Lexical per technique v" + str(version), "Prompt technique", "Avg. Jaccard distance", width=20, height=7, same_color=False, color='maroon')

avg_ast_similarity_per_tech = average_dict(ast_similarity_per_tech)
plot_dict(avg_ast_similarity_per_tech, "AST similarity per technique v" + str(version), "Prompt technique", "AST similarity", width=20, height=7, same_color=False, color='maroon')

avg_code_bleu_per_tech = average_dict(code_bleu_per_tech)
plot_dict(avg_code_bleu_per_tech, "Code BLEU per technique v" + str(version), "Prompt technique", "Code BLEU", width=20, height=7, same_color=False, color='maroon')

# Plot the three count dictionaries with plot_dict_old. Each entry holds the
# counts, the title prefix (the version is appended), the x-axis label and any
# extra keyword arguments; the y-axis label is the same for all three.
error_plot_specs = (
    (errors_types, "Error types v", "Error type", {}),
    (stats_per_task, "Tasks that failed the most v", "Task ID", {}),
    (stats_per_tech, "Prompt techniques that failed the most v", "Prompt techniques", {'width': 20, 'height': 7}),
)
for counts, title_prefix, x_label, size_kwargs in error_plot_specs:
    plot_dict_old(counts, title_prefix + str(version), x_label, "Number of errors", **size_kwargs)

In [None]:
# Display the per-combination outcome counts (combination text -> {outcome: count}).
test_output_per_comb

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt

# Per-combination outcome counts: {combination: {outcome: count}}.
# A dedicated name is used instead of rebinding `data`: earlier cells use
# `data` for the results table (e.g. when writing the analysis CSV), and
# overwriting it here would break those cells if they are re-run afterwards.
results_per_comb = test_output_per_comb
num_plots = len(results_per_comb)

# Define consistent category order starting with "Passed"
# NOTE(review): outcomes that are not listed here are not drawn — confirm that
# this is intended.
consistent_categories = ['Passed', 'AssertionError', 'ImportError', 'AttributeError', 'KeyError', 'ValueError', 'NameError', 'FileNotFoundError', 'IndexError', 'SyntaxError']

# Plot dimensions
cols = 4
y_max = 130  # Fixed y-axis limit for all plots (taller bars are cut off)

# Green for 'Passed' and a distinct shade of red for every other category.
# The colours do not depend on the combination, so they are built once.
reds = plt.cm.Reds(np.linspace(0.4, 0.8, len(consistent_categories) - 1))
colors = ['green' if cat == 'Passed' else reds[j-1] for j, cat in enumerate(consistent_categories)]

if num_plots == 0:
    # A subplot grid needs at least one row, so there is nothing to draw or save.
    print("No test results to plot.")
else:
    rows = (num_plots + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(25, 5 * rows))
    axes = axes.flatten()

    # One bar chart per combination
    for ax, (combination, results) in zip(axes, results_per_comb.items()):
        # Ensure all categories are present in the plot (even if they don't appear in 'results')
        values = [results.get(category, 0) for category in consistent_categories]

        ax.bar(consistent_categories, values, color=colors)
        ax.set_title(combination, fontsize=10)
        ax.set_xlabel('Error Type')
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=45, labelsize=8)
        ax.set_ylim(0, y_max)  # Set fixed y-axis limit for all plots

    # Delete the axes of the grid that received no combination
    for unused_ax in axes[num_plots:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()

    # Save the plot
    file_name = "Error_types_per_combination.png"
    path = "../../results/" + model + "/plots/" + file_name
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    plt.savefig(path)

    plt.show()
