In [169]:
import pandas as pd
import glob
from pathlib import Path
import json
import numpy as np

In [2]:
generation_metrics_df = pd.read_parquet("../results/metrics_pipeline/generation_metrics_df.parquet")

In [3]:
generation_metrics_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12903 entries, 0 to 12902
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   index                            12903 non-null  int64  
 1   repo                             12903 non-null  object 
 2   tasks_x                          12903 non-null  object 
 3   true_tasks                       12903 non-null  object 
 4   generated_text                   12903 non-null  object 
 5   prompt_info                      12903 non-null  object 
 6   generation                       12903 non-null  int64  
 7   input_text                       12903 non-null  object 
 8   paper_urls                       12903 non-null  object 
 9   paper_titles                     12903 non-null  object 
 10  titles                           12903 non-null  object 
 11  arxiv_ids                        12903 non-null  object 
 12  authors                

In [4]:
pd.read_json("../github_search/pipelines/results/generation_pipeline/generation_metrics_df.json").shape

(12903, 25)

In [5]:
generation_metrics_df["true_tasks"].explode().value_counts()

true_tasks
['style transfer']                                                              585
['semantic segmentation']                                                       555
['time series']                                                                 435
['word embeddings']                                                             384
['data augmentation']                                                           276
                                                                               ... 
['sparse learning', 'time series']                                                3
['text generation', 'adversarial text', 'style transfer']                         3
['traffic sign recognition']                                                      3
['sentiment analysis', 'aspect based sentiment analysis', 'graph attention']      3
['instance segmentation', 'data augmentation', 'semantic segmentation']           3
Name: count, Length: 708, dtype: int64

In [6]:
ir_metrics_df = pd.read_csv("../output/code2doc/sample_per_task_5_repos/beir_results_codellama.csv")

In [7]:
per_query_metrics_df = pd.read_csv("../results/per_query_ir_results.csv")

In [8]:
per_query_metrics_df.shape

(33240, 12)

In [9]:
generation_metrics_df["true_tasks"].explode().value_counts()

true_tasks
['style transfer']                                                              585
['semantic segmentation']                                                       555
['time series']                                                                 435
['word embeddings']                                                             384
['data augmentation']                                                           276
                                                                               ... 
['sparse learning', 'time series']                                                3
['text generation', 'adversarial text', 'style transfer']                         3
['traffic sign recognition']                                                      3
['sentiment analysis', 'aspect based sentiment analysis', 'graph attention']      3
['instance segmentation', 'data augmentation', 'semantic segmentation']           3
Name: count, Length: 708, dtype: int64

In [10]:
generation_metrics_df["true_tasks"] = generation_metrics_df["true_tasks"].str.replace("'", '"').apply(json.loads)

In [11]:
generation_metric_names = ['edit_word',
 'jaccard_lst',
 'bleurt',
 'rouge1',
 'rouge2',
 'rougeL',
 'rougeLsum',
 'sentence_transformer_similarity']

In [12]:
per_query_generation_metrics_df = (
    generation_metrics_df
        .explode("true_tasks")
        .rename(columns={"true_tasks": "task"})
        .groupby("task")
        [generation_metric_names]
        .agg("mean")
)

In [13]:
per_query_metrics_df.head()

Unnamed: 0.1,Unnamed: 0,Hits@1,Hits@5,Hits@10,Hits@25,Accuracy@1,Accuracy@5,Accuracy@10,Accuracy@25,query,retriever,corpus
0,0,1.0,4.0,7.0,13.0,1.0,1.0,1.0,1.0,general classification,bm25,readme
1,1,0.0,2.0,6.0,18.0,0.0,1.0,1.0,1.0,semantic segmentation,bm25,readme
2,2,0.0,3.0,8.0,20.0,0.0,1.0,1.0,1.0,reinforcement learning,bm25,readme
3,3,1.0,4.0,8.0,23.0,1.0,1.0,1.0,1.0,translation,bm25,readme
4,4,1.0,2.0,3.0,4.0,1.0,1.0,1.0,1.0,image classification,bm25,readme


In [14]:
per_query_generation_metrics_df.shape

(306, 8)

In [15]:
retriever_name = "bm25"
corpus_name = "readme"


In [16]:
per_query_metrics_df["retriever"].unique()

array(['bm25', 'word2vec', 'sentence-transformers/all-mpnet-base-v2',
       'sentence-transformers/all-MiniLM-L12-v2',
       'flax-sentence-embeddings/st-codesearch-distilroberta-base'],
      dtype=object)

In [17]:
checked_corpora = [
    'generated_readme',
       'dependency_signature', 'repository_signature', 'generated_tasks'
]

checked_retrievers = ['bm25', 'sentence-transformers/all-mpnet-base-v2',
       'sentence-transformers/all-MiniLM-L12-v2',
       'flax-sentence-embeddings/st-codesearch-distilroberta-base'
]


In [18]:
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
shown_generation_metrics = ["rouge2", "rougeL", "sentence_transformer_similarity"]


def get_worst_queries(df, q=0.1, metrics=shown_generation_metrics):
    is_lowest = (df[metrics] < df[metrics].quantile(q)).sum(axis=1) > 0
    return df[is_lowest].index

In [34]:
worst_queries = get_worst_queries(per_query_generation_metrics_df, q=0.2)

In [35]:
worst_queries

Index(['acoustic unit discovery', 'action localization', 'argument mining',
       'automatic machine learning model selection', 'backdoor attack',
       'bilingual lexicon induction', 'causal inference',
       'click through rate prediction', 'counterfactual inference',
       'cross lingual transfer', 'cross modal retrieval',
       'cyber attack detection', 'data visualization',
       'deformable object manipulation', 'demosaicking',
       'distant speech recognition', 'dota 2', 'end to end dialogue modelling',
       'entity disambiguation', 'explainable artificial intelligence',
       'extract aspect', 'facial inpainting', 'fact verification',
       'formation energy', 'future prediction', 'gait recognition',
       'generalization bounds', 'gesture generation',
       'grammatical error detection', 'graph reconstruction', 'hard attention',
       'humor detection', 'imbalanced classification', 'information retrieval',
       'intrinsic image decomposition', 'joint entity an

In [36]:
for corpus_name in checked_corpora:
    for retriever_name in checked_retrievers:
        print(corpus_name, retriever_name)
        selected_metrics_df = per_query_metrics_df[
            (per_query_metrics_df["retriever"] == retriever_name) &
            (per_query_metrics_df["corpus"] == corpus_name)
        ][["query", "Accuracy@10", "Hits@10"]] 
        worst_queries_df = per_query_generation_metrics_df.loc[worst_queries]
        corr_df = (
            selected_metrics_df
                .merge(worst_queries_df, left_on="query", right_on="task")
                .select_dtypes("float")
                .corr().loc[shown_generation_metrics, ["Accuracy@10", "Hits@10"]]
        )
        sns.heatmap(corr_df, annot=True)
        plt.show()

generated_readme bm25


KeyError: "['Accuracy@10', 'Hits@10'] not in index"

In [46]:
per_query_metrics_df.columns = [c.lower() for c in per_query_metrics_df.columns]

In [144]:
worst_queries = get_worst_queries(per_query_generation_metrics_df, q=0.2)

In [145]:
def get_metric_diffs(per_query_metrics_df, worst_queries):
    dfs = []
    for name, gp_df in per_query_metrics_df.groupby(["corpus", "retriever"]):
        selected_df = gp_df[gp_df["query"].isin(worst_queries)]
        selected_metrics_df = selected_df[["hits@10", "accuracy@10"]].mean()
        all_metrics_df = gp_df[["hits@10", "accuracy@10"]].mean()
        exp_difference_df = (all_metrics_df - selected_metrics_df)
        diffs = pd.concat([all_metrics_df, selected_metrics_df, exp_difference_df, 100 * (exp_difference_df / all_metrics_df)])
        diffs.index = ["hits@10", "accuracy@10", "worst records hits@10", "worst records accuracy@10", "hits@10 difference", "accuracy@10 difference", "% hits@10 difference ", "% accuracy@10 difference"]
        diffs = pd.concat([pd.Series(dict(zip(["corpus", "retriever"], name))), diffs])
        dfs.append(diffs)
    return pd.concat(dfs, axis=1).T

In [183]:
def prettify_diffs_df(diff_df):
    for col in diff_df.columns:
        if "%" in col:
            diff_df[col] = diff_df[col].apply(lambda x: np.round(x,1)).astype(str) + "%"
        else:
            diff_df[col] = diff_df[col].apply(lambda x: np.round(x,3))
    return diff_df

In [184]:
diffs_df = get_metric_diffs(per_query_metrics_df, worst_queries)

In [185]:
aggregated_diffs_df = prettify_diffs_df(diffs_df.drop(columns="retriever").groupby("corpus").agg("mean").sort_values("% hits@10 difference ", ascending=False))

In [186]:
aggregated_diffs_df

Unnamed: 0_level_0,hits@10,accuracy@10,worst records hits@10,worst records accuracy@10,hits@10 difference,accuracy@10 difference,% hits@10 difference,% accuracy@10 difference
corpus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
generated_tasks,1.356,0.599,0.8,0.507,0.556,0.092,41.1%,15.7%
dependency_signature,1.149,0.508,0.813,0.461,0.335,0.048,38.9%,20.3%
repository_signature,1.398,0.607,0.885,0.526,0.513,0.081,36.8%,14.2%
generated_rationale,1.995,0.705,1.313,0.684,0.682,0.021,34.5%,3.7%
generation_context,1.648,0.649,1.15,0.595,0.497,0.054,33.7%,10.7%
selected_code,1.219,0.532,0.838,0.499,0.381,0.033,32.7%,10.2%
generated_readme,2.119,0.73,1.48,0.725,0.639,0.005,31.4%,1.2%
readme,3.093,0.807,2.467,0.778,0.627,0.029,21.2%,4.5%
