In [1]:
import pandas as pd
import cmarkgfm
from util.github_helper import get_readme
from util.readme_parser import ReadmeSectionParser
from util.string_helper import clear_text

In [2]:
# Checklist annotations loaded from CSV. The "url" column is renamed to "repo"
# because later cells select and match rows on "repo".
neurips = pd.read_csv(
    "data/paperswithcode/neurips_checklist_manuel-100.csv"
).rename(columns={"url": "repo"})

# Labelled README-section data loaded from JSON; later cells read its
# "intersection" column.
manuel = pd.read_json(
    "data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/intersection.json"
)


In [3]:
# Score values that select each group. Rows whose score is in neither list are
# dropped by the isin() filters applied in the following cells.
reprod_score = [5, 6]      # kept as the "reproducible" group (label 1)
not_reprod_score = [0]     # kept as the "not reproducible" group (label 0)

In [4]:
# Score = number of True values across the boolean columns of each row.
neurips["score"] = neurips.select_dtypes(include="bool").sum(axis="columns")

# Keep only the rows at the two extremes defined by the score lists.
neurips_reprod = neurips.loc[neurips["score"].isin(reprod_score)]
neurips_not_reprod = neurips.loc[neurips["score"].isin(not_reprod_score)]


In [5]:
# Score = number of entries in each row's "intersection" value.
manuel["score"] = manuel["intersection"].apply(len)

# Keep only the rows at the two extremes defined by the score lists.
manuel_reprod = manuel.loc[manuel["score"].isin(reprod_score)]
manuel_not_reprod = manuel.loc[manuel["score"].isin(not_reprod_score)]


In [6]:
# Stack the two sources row-wise, one frame per group. The original row labels
# of each source are kept (no index reset).
reprod = pd.concat(objs=[neurips_reprod, manuel_reprod], axis=0)
not_reprod = pd.concat(objs=[neurips_not_reprod, manuel_not_reprod], axis=0)


In [7]:
# Display (rows, columns) for the not-reproducible and reproducible groups.
tuple(frame.shape for frame in (not_reprod, reprod))


((58, 10), (34, 10))

In [8]:
# Combine both groups and keep only the repository name and its score.
data = pd.concat([not_reprod, reprod]).loc[:, ["repo", "score"]]

# Binary target: 1 when the score is one of reprod_score, otherwise 0.
# Every row here was selected by one of the two score lists, so "otherwise"
# means the score is in not_reprod_score.
data["label"] = data["score"].isin(reprod_score).astype(int)

# Show a random selection of 10 rows as a sanity check.
data.sample(n=10)


Unnamed: 0,repo,score,label
30,VinAIResearch/BERTweet,0,0
22,Philip-Bachman/amdim-public,5,1
57,baidu/Senta,0,0
45,allenai/natural-instructions,0,0
91,brjathu/RPSnet,6,1
62,brmson/dataset-sts,0,0
134,jiesutd/YEDDA,0,0
47,allenai/scibert,0,0
4,facebookresearch/XLM,6,1
142,maszhongming/MatchSum,5,1


In [9]:
import tqdm


# Build the "content" column: for each repository, get its README, render the
# markdown to HTML, parse it into sections and merge header + content text.
#
# Iterate over the *unique* repository names so that a repo appearing in more
# than one row is processed only once (get_readme is presumably a remote call
# -- TODO confirm). The boolean mask in the final assignment still writes the
# result to every row of that repo.
for repo in (pbar := tqdm.tqdm(data.repo.unique())):
    pbar.set_postfix_str(f"Processing {repo}")
    # get_readme returns a (content, url) pair; the url is not used here.
    readme_content, _url = get_readme(repo)
    html = cmarkgfm.github_flavored_markdown_to_html(readme_content)
    readme_parser = ReadmeSectionParser(repo, html)
    sections = readme_parser.parse_sections(
        with_mask=True, group_by_parent=True)
    content = ReadmeSectionParser.merge_sections(pd.DataFrame(
        sections), keys=["header", "content"], with_newline=True)
    data.loc[data.repo == repo, "content"] = content


100%|██████████| 92/92 [00:38<00:00,  2.41it/s, Processing maszhongming/MatchSum]                                          


In [12]:
# The merged README content is exported under the column name "text".
data = data.rename(columns={"content": "text"})


In [13]:
# Run the project's clear_text helper over every README text.
data["text"] = data["text"].apply(clear_text)

In [None]:
# Number of whitespace-separated tokens per README.
# str.split() with no argument splits on any run of whitespace (spaces, tabs,
# newlines) and drops empty tokens. The previous split(" ") did not split on
# newlines and counted an empty token for every repeated, leading or trailing
# space, which skews the count.
data["text_length"] = data["text"].apply(lambda text: len(text.split()))

In [14]:
# Write only the text and its binary label; the row index is not exported.
data.to_csv(
    "data/mixed/hierarchical_binary.csv",
    columns=["text", "label"],
    index=False,
)
