In [None]:
import os

import pandas as pd
from tqdm import tqdm
from tqdm.auto import tqdm

from util.labelling_helper import ZeroShotLabellingHelper
from util.readme_parser import ReadmeSectionParser
# Register tqdm with pandas; the labelling cells below call `progress_apply`,
# which is only available after this registration.
# `tqdm` here is the `tqdm.auto` variant, because that import comes last and
# rebinds the name.
tqdm.pandas()


In [None]:
# Load the cleaned README sections; the first CSV column is used as the index.
df = pd.read_csv("data/acl/sections/sections_clean.csv", index_col=0)

# Replace missing parent headers with "" so later cells can drop them via
# `filter(None, ...)` before joining text parts (NaN is a truthy float and
# would make `" ".join` fail).
# The filled column is assigned back instead of calling
# `df.parent_header.fillna("", inplace=True)`: that form is chained assignment,
# which pandas warns about and which no longer updates `df` under Copy-on-Write.
df["parent_header"] = df["parent_header"].fillna("")


In [None]:
# Build the labelling helper once; every labelling cell below reuses this
# instance through `labelling_helper.get_most_likely_label(...)`.
# NOTE(review): construction presumably loads the zero-shot model and may be
# slow — confirm in util.labelling_helper.
labelling_helper = ZeroShotLabellingHelper()


In [None]:
# Quick sanity check on a single hand-written example before labelling the
# whole dataset. The call is the cell's last expression, so its result is
# shown as the cell output.
sentence = "Installation Installing <LINK text='Jigg parser'> (for Japanese)"
labelling_helper.get_most_likely_label(sentence)


### Base


In [None]:
# Variant "header": label every section from its header text alone.
header_labelled_df = df.copy()
header_labelled_df[["label", "score"]] = header_labelled_df["header"].progress_apply(
    # Wrapping the helper's result in a Series makes `apply` expand it into
    # separate columns, which are stored as "label" and "score".
    lambda header_text: pd.Series(
        labelling_helper.get_most_likely_label(header_text)
    )
)


In [None]:
# Variant "header_plus_content": label every section from its parent header,
# its own header and its content, joined into one space-separated string.
header_plus_content_labelled_df = df.copy()
header_plus_content_labelled_df[["label", "score"]] = (
    header_plus_content_labelled_df[["parent_header", "header", "content"]]
    .progress_apply(
        # Falsy parts (e.g. an empty parent header) are left out of the text.
        lambda row: pd.Series(
            labelling_helper.get_most_likely_label(
                " ".join(part for part in row if part)
            )
        ),
        axis=1,
    )
)


In [None]:
# Variant "header_content": label every section from its own header and its
# content, joined into one space-separated string.
header_content_labelled_df = df.copy()
header_content_labelled_df[["label", "score"]] = (
    header_content_labelled_df[["header", "content"]]
    .progress_apply(
        # Falsy parts (e.g. empty strings) are left out of the joined text.
        lambda row: pd.Series(
            labelling_helper.get_most_likely_label(
                " ".join(part for part in row if part)
            )
        ),
        axis=1,
    )
)
# Timing note kept from the original notebook: 4:39
# (presumably the elapsed time of this cell — unit not recorded).


In [None]:
# Variant "content": label every section from its content text alone.
content_labelled_df = df.copy()
content_labelled_df[["label", "score"]] = content_labelled_df["content"].progress_apply(
    # The Series wrapper makes `apply` expand the helper's result into
    # separate columns, which are stored as "label" and "score".
    lambda content_text: pd.Series(
        labelling_helper.get_most_likely_label(content_text)
    )
)
# Timing note kept from the original notebook: 4:33
# (presumably the elapsed time of this cell — unit not recorded).


In [None]:
# Variant "header_plus": label every section from its parent header and its
# own header, joined into one space-separated string.
header_plus_labelled_df = df.copy()
header_plus_labelled_df[["label", "score"]] = (
    header_plus_labelled_df[["parent_header", "header"]]
    .progress_apply(
        # Falsy parts (e.g. an empty parent header) are left out of the text.
        lambda row: pd.Series(
            labelling_helper.get_most_likely_label(
                " ".join(part for part in row if part)
            )
        ),
        axis=1,
    )
)
# Timing note kept from the original notebook: 2:32
# (presumably the elapsed time of this cell — unit not recorded).


### Grouped


In [None]:
# Group each repository's sections by parent header and collect the results
# of all repositories into a single DataFrame.

# NOTE(review): `grouped_df` is not read by any visible cell; it is kept in
# case a cell outside this view relies on it.
grouped_df = df.copy()

# `groupby` splits the frame in a single pass instead of re-filtering the whole
# frame once per repository. `sort=False` keeps repositories in order of first
# appearance, matching the previous `df.repo.unique()` iteration order.
# Rows with a missing repo are skipped (groupby drops NaN keys); the previous
# equality filter never matched such rows either.
grouped_records = []
for _, repo_sections in df.groupby("repo", sort=False):
    # NOTE(review): assumes group_by_parent_header returns an iterable of row
    # records (it was list-concatenated and passed to pd.DataFrame before) —
    # confirm in util.readme_parser.
    grouped_records.extend(ReadmeSectionParser.group_by_parent_header(repo_sections))

grouped_sections = pd.DataFrame(grouped_records)


In [None]:
# Variant "grouped": label every grouped section from its header and its
# content, joined into one space-separated string.
grouped_labelled_df = grouped_sections.copy()
grouped_labelled_df[["label", "score"]] = (
    grouped_sections[["header", "content"]]
    .progress_apply(
        # Falsy parts (e.g. empty strings) are left out of the joined text.
        lambda row: pd.Series(
            labelling_helper.get_most_likely_label(
                " ".join(part for part in row if part)
            )
        ),
        axis=1,
    )
)


### Saving


In [None]:
# Persist every labelled variant as a CSV file inside one output directory.
main_dir = "data/acl/sections/labelled/zeroshot/"

# Create the output directory up front: `to_csv` does not create missing
# parent directories, so a fresh checkout would otherwise fail here after all
# the labelling work has already been done.
os.makedirs(main_dir, exist_ok=True)

# Output file stem -> labelled frame; written in the same order as before.
labelled_frames = {
    "header": header_labelled_df,
    "header_content": header_content_labelled_df,
    "content": content_labelled_df,
    "header_plus": header_plus_labelled_df,
    "header_plus_content": header_plus_content_labelled_df,
    "grouped": grouped_labelled_df,
}

for file_stem, labelled_frame in labelled_frames.items():
    labelled_frame.to_csv(f"{main_dir}{file_stem}.csv")
