In [1]:
import glob
import pandas as pd
from util.file_helper import get_readme_with_repo_name
from util.readme_parser import ReadmeSectionParser
from e2e_system.reproder import ClassificationReproder, _ground_truth
import os
from tqdm.auto import tqdm
import json
from datetime import datetime
import shutil


import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings
# raised by any library are hidden too - consider narrowing it by category.
warnings.filterwarnings("ignore")


In [2]:
# Load the labelled ACL section data. Later cells read its `intersection`
# column and rename its `repo` column to `url`.
manual = pd.read_json(
    path_or_buf="data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/intersection.json",
)


In [3]:
# Score of a repository = number of entries in its `intersection` value.
manual["score"] = manual["intersection"].apply(len)
# Downstream code looks rows up by `url`, so expose `repo` under that name.
manual.rename(columns={"repo": "url"}, inplace=True)
# Tag every row with the dataset it came from.
manual["source"] = "acl"


In [4]:
# Expand `intersection` into one flag column per ground-truth label:
# True when the label occurs in the row's `intersection`, False otherwise.
# Each column is built in a single pass instead of writing one cell at a time
# inside an `iterrows` loop, which is faster and yields boolean columns.
for item in _ground_truth:
    manual[item] = manual["intersection"].apply(lambda found: item in found)


In [5]:
# Section fields passed to ClassificationReproder for every model.
keys = ["header", "parent_header", "content"]
model = "bert_base_uncased"
# Without `recursive=True`, `**` matches like `*`: one path per entry that
# sits directly under model/<model>/.
model_paths = glob.glob(f"model/{model}/**")

# NeurIPS checklist data: `score` counts the True values across the boolean
# columns of each row; `source` tags the dataset of origin.
neurips_df = pd.read_csv(
    'data/paperswithcode/neurips_checklist_manuel-100.csv'
).assign(
    score=lambda frame: frame.select_dtypes(bool).sum(axis=1),
    source="neurips",
)


In [6]:
# Stack the ACL and NeurIPS frames row-wise under a fresh 0..n-1 index.
eval_df = pd.concat((manual, neurips_df), axis=0, ignore_index=True)


In [7]:
# Remove the `intersection` and `stars` columns. Rebinding the name (instead
# of `inplace=True`) combined with `errors="ignore"` keeps this cell safe to
# re-run: a second execution no longer raises KeyError for columns that are
# already gone.
eval_df = eval_df.drop(columns=["intersection", "stars"], errors="ignore")


In [8]:
from functools import lru_cache


# @lru_cache(maxsize=400)
def get_readme_and_parse_sections(repo, readme, group_by_parent):
    """Parse a README into sections using ``ReadmeSectionParser``.

    Args:
        repo: Repository identifier handed to the parser constructor.
        readme: README content handed to the parser constructor.
        group_by_parent: Forwarded to ``parse_sections`` under the keyword
            of the same name.

    Returns:
        Whatever ``ReadmeSectionParser.parse_sections`` returns.
    """
    return ReadmeSectionParser(repo, readme).parse_sections(
        group_by_parent=group_by_parent)


In [9]:
def evaluate(reproder, m_df, group_by_parent, consecutive, coeff=0.16):
    """Score every row of ``m_df`` with ``reproder`` and return an annotated copy.

    For each row the README is fetched with ``get_readme_with_repo_name``
    using that row's own ``source`` and ``url``, split into sections,
    classified, and turned into a checklist plus a reproducibility score.

    Rows are visited directly rather than looked up again by URL, so a URL
    that occurs more than once is evaluated with the ``source`` of each of
    its own rows and only that row is written.

    Args:
        reproder: Object providing ``classify_sections``, ``checklist``,
            ``merge_consecutive`` and ``calculate_reproducibility``.
        m_df: DataFrame with at least ``url``, ``source`` and ``score``
            columns. It is copied; the caller's frame is not modified.
        group_by_parent: Forwarded to the README section parser.
        consecutive: When true, the labels are passed through
            ``reproder.merge_consecutive`` before the reproducibility score
            is computed.
        coeff: Factor that scales ``score`` into ``true_reprod_score`` and
            is forwarded as ``coeff`` to ``calculate_reproducibility``.

    Returns:
        A copy of ``m_df`` with ``true_reprod_score``, ``reprod_score`` and,
        for every checklist key ``k``, ``<k>_pred`` and ``<k>_score`` columns.
    """
    df = m_df.copy()
    # The ground-truth score does not depend on the model: compute it once.
    df["true_reprod_score"] = df["score"] * coeff

    rows = zip(df.index, df["url"].values, df["source"].values)
    # `zip` has no length, so give tqdm the total explicitly.
    for idx, repo, source in (pbar := tqdm(rows, total=len(df),
                                           desc="Evaluating")):
        pbar.set_postfix_str(f"Evaluating {repo}")
        readme = get_readme_with_repo_name(source, repo)
        sections = get_readme_and_parse_sections(
            repo, readme, group_by_parent)
        labels = reproder.classify_sections(sections)
        # NOTE(review): the checklist is built from the un-merged labels even
        # when `consecutive` is true - confirm that this is intended.
        checklist, scores, _, _ = reproder.checklist(labels)

        if consecutive:
            labels = reproder.merge_consecutive(pd.DataFrame(labels))

        reprod_score = reproder.calculate_reproducibility(
            pd.DataFrame(labels), coeff=coeff, punishment=False)

        # One-element list keeps the list-like row selection of the original
        # `.loc` assignments while targeting exactly this row.
        row = [idx]
        df.loc[row, "reprod_score"] = reprod_score
        for k, v in checklist.items():
            df.loc[row, f"{k}_pred"] = v
            # A checklist item that was not detected contributes no score.
            df.loc[row, f"{k}_score"] = scores[k] if v else 0
    return df


In [10]:
# Evaluate every model path under the four parsing/merging variants and write
# the resulting frames, plus an `info.json` describing the run, to a
# per-model directory.
# The output root is the same for every model, so it is built once.
save_main_dir = 'data/paperswithcode/evaluated/classification/bert_base_uncased/all'

# Note: the loop variable rebinds `model`, which held the model name before.
for model in (pbar := tqdm(model_paths)):
    pbar.set_postfix_str(f"Evaluating {model}")
    reproder = ClassificationReproder(model, keys)
    base_ev = evaluate(reproder, eval_df,
                       group_by_parent=False, consecutive=False)
    grouped_ev = evaluate(reproder, eval_df,
                          group_by_parent=True, consecutive=False)
    consecutive_ev = evaluate(reproder, eval_df,
                              group_by_parent=False, consecutive=True)
    grouped_consecutive_ev = evaluate(
        reproder, eval_df, group_by_parent=True, consecutive=True)

    # Name the output directory after the last component of the model path.
    # basename/normpath follow the platform's separator rules, whereas
    # splitting on "\\" only isolates the name on Windows.
    save_dir = os.path.join(
        save_main_dir, os.path.basename(os.path.normpath(model)))
    print(save_dir)
    # Start from an empty directory so stale results never survive a re-run.
    shutil.rmtree(save_dir, ignore_errors=True)
    os.makedirs(save_dir, exist_ok=True)
    info = {
        "model_dir": model,
        "keys": keys,
        "time": datetime.now().strftime("%Y%m%d-%H%M%S")
    }
    # Context manager guarantees the handle is flushed and closed.
    with open(os.path.join(save_dir, "info.json"), 'w') as info_file:
        json.dump(info, info_file, indent=4)

    base_ev.to_csv(f'{save_dir}/eval_base.csv', index=False)
    grouped_ev.to_csv(f'{save_dir}/eval_grouped.csv', index=False)
    consecutive_ev.to_csv(
        f'{save_dir}/eval_consecutive.csv', index=False)
    grouped_consecutive_ev.to_csv(
        f'{save_dir}/eval_grouped_consecutive.csv', index=False)


  0%|          | 0/12 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\content_textsim


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\content_zeroshot


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\grouped_textsim


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\grouped_zeroshot


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\header_content_textsim


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\header_content_zeroshot


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\header_plus_content_textsim


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\header_plus_content_zeroshot


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\header_plus_textsim


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\header_plus_zeroshot


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\header_textsim


Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/307 [00:00<?, ?it/s]

data/paperswithcode/evaluated/classification/bert_base_uncased/all\header_zeroshot
