In [1]:
from util.readme_parser import ReadmeSectionParser
import pandas as pd
from tqdm.auto import tqdm
from util.labelling_helper import GroundTruthLabellingHelper
# Register tqdm's pandas integration; the labelling cells below call
# `.progress_apply(...)`, which is only available after this call.
tqdm.pandas()


In [2]:
# Load the cleaned README sections; the first CSV column becomes the index.
df = pd.read_csv("data/acl/sections/sections_clean.csv", index_col=0)

# Replace missing parent headers with "" so the `" ".join(filter(None, ...))`
# calls in the labelling cells drop them instead of receiving NaN.
# The filled column is assigned back explicitly: calling
# `fillna(..., inplace=True)` on the `df.parent_header` accessor is chained
# assignment, which warns on recent pandas and leaves `df` untouched once
# Copy-on-Write is enabled.
df["parent_header"] = df["parent_header"].fillna("")


In [3]:
# Single helper instance shared by every labelling cell below, each of which
# calls its `get_most_similar_group` method.
labelling_helper = GroundTruthLabellingHelper()


In [4]:
# Smoke test on one hand-written example before labelling the whole frame.
# The call returns a (group, score) pair, as shown in the cell output.
sentence = "Installation Installing <LINK text='Jigg parser'> (for Japanese)"
labelling_helper.get_most_similar_group(sentence)


('requirements', 0.24110528826713562)

### Base


In [5]:
def _label_header(header_text):
    """Look up the closest ground-truth group for a single section header.

    The helper's result is wrapped in a Series so that ``progress_apply``
    spreads its elements over separate columns.
    """
    closest_group = labelling_helper.get_most_similar_group(header_text)
    return pd.Series(closest_group)


# Label every section from its own header text only.
header_labelled_df = df.copy()
header_label_scores = df["header"].progress_apply(_label_header)
header_labelled_df[["label", "score"]] = header_label_scores


  0%|          | 0/35463 [00:00<?, ?it/s]

In [None]:
def _label_content(content_text):
    """Look up the closest ground-truth group for a single section body.

    The helper's result is wrapped in a Series so that ``progress_apply``
    spreads its elements over separate columns.
    """
    closest_group = labelling_helper.get_most_similar_group(content_text)
    return pd.Series(closest_group)


# Label every section from its content text only.
content_labelled_df = df.copy()
content_label_scores = df["content"].progress_apply(_label_content)
content_labelled_df[["label", "score"]] = content_label_scores


In [None]:
def _label_parent_and_header(row):
    """Label one section from its parent header and header, space-joined.

    Falsy parts (such as an empty parent header) are left out of the text
    handed to the labelling helper.
    """
    text = " ".join(part for part in row if part)
    return pd.Series(labelling_helper.get_most_similar_group(text))


# Label every section from "<parent header> <header>".
header_plus_labelled_df = df.copy()
parent_and_header = df[["parent_header", "header"]]
header_plus_labelled_df[["label", "score"]] = parent_and_header.progress_apply(
    _label_parent_and_header, axis=1)


In [None]:
def _label_header_and_content(row):
    """Label one section from its header and content, space-joined.

    Falsy parts are left out of the text handed to the labelling helper.
    """
    text = " ".join(part for part in row if part)
    return pd.Series(labelling_helper.get_most_similar_group(text))


# Label every section from "<header> <content>".
header_content_labelled_df = df.copy()
header_and_content = header_content_labelled_df[["header", "content"]]
header_content_labelled_df[["label", "score"]] = header_and_content.progress_apply(
    _label_header_and_content, axis=1)


In [None]:
def _label_parent_header_and_content(row):
    """Label one section from parent header, header and content, space-joined.

    Falsy parts (such as an empty parent header) are left out of the text
    handed to the labelling helper.
    """
    text = " ".join(part for part in row if part)
    return pd.Series(labelling_helper.get_most_similar_group(text))


# Label every section from "<parent header> <header> <content>".
header_plus_content_labelled_df = df.copy()
parent_header_and_content = header_plus_content_labelled_df[
    ["parent_header", "header", "content"]]
header_plus_content_labelled_df[["label", "score"]] = (
    parent_header_and_content.progress_apply(
        _label_parent_header_and_content, axis=1))


### Grouped


In [None]:
# NOTE(review): no visible cell reads `grouped_df`; it is kept only so that
# any cell outside this view that relies on it keeps working. Confirm it is
# unused before dropping this copy.
grouped_df = df.copy()

# Collect the grouped sections of every repository in one flat list.
# A single groupby pass replaces building a full-frame boolean mask per
# repository. sort=False yields repositories in order of first appearance,
# the same order `df.repo.unique()` produced.
# NOTE(review): groupby skips rows whose repo is NaN, whereas the previous
# loop handed the parser an empty frame for a NaN repo -- confirm that the
# repo column has no missing values.
section_records = []
for repo, repo_sections in df.groupby("repo", sort=False):
    grouped = ReadmeSectionParser.group_by_parent_header(repo_sections)
    section_records.extend(grouped)

# Build the frame once; `grouped_sections` is now only ever a DataFrame
# instead of starting life as a list and being rebound afterwards.
grouped_sections = pd.DataFrame(section_records)


In [None]:
def _label_grouped_section(row):
    """Label one grouped section from its header and content, space-joined.

    Falsy parts are left out of the text handed to the labelling helper.
    """
    text = " ".join(part for part in row if part)
    return pd.Series(labelling_helper.get_most_similar_group(text))


# Label every grouped section from "<header> <content>".
grouped_labelled_df = grouped_sections.copy()
grouped_header_and_content = grouped_sections[["header", "content"]]
grouped_labelled_df[["label", "score"]] = grouped_header_and_content.progress_apply(
    _label_grouped_section, axis=1)


  0%|          | 0/22032 [00:00<?, ?it/s]

### Saving


In [None]:
# Writes each labelled variant to its own CSV under one shared directory.
# The whole cell is commented out, so running the notebook writes nothing;
# uncomment the lines below to save the results.
# main_dir = "data/acl/sections/labelled/text_sim_ground_truth/"

# header_labelled_df.to_csv(f"{main_dir}header.csv")
# header_plus_labelled_df.to_csv(f"{main_dir}header_plus.csv")
# content_labelled_df.to_csv(f"{main_dir}content.csv")
# header_content_labelled_df.to_csv(f"{main_dir}header_content.csv")
# header_plus_content_labelled_df.to_csv(
#     f"{main_dir}header_plus_content.csv")
# grouped_labelled_df.to_csv(
#     f"{main_dir}grouped.csv")
