In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm
tqdm.pandas()


In [None]:
# Directory holding every section export; change it here to point the
# notebook at another dataset location.
SECTIONS_DIR = "data/acl/sections"

# Four variants of the section data, named after the files they are read from.
masked = pd.read_csv(f"{SECTIONS_DIR}/sections.csv")
unmasked = pd.read_csv(f"{SECTIONS_DIR}/sections_unmasked.csv")
grouped_masked = pd.read_csv(f"{SECTIONS_DIR}/sections_grouped.csv")
grouped_unmasked = pd.read_csv(f"{SECTIONS_DIR}/sections_grouped_unmasked.csv")


### Section Uzunluk Analizi


In [None]:
def _tokenize_cell(value):
    """Tokenize one cell value, treating a missing value as empty text."""
    # A missing cell must yield no tokens: converting NaN to str gives the
    # literal "nan", which would otherwise be counted as one token.
    if pd.isna(value):
        return []
    return word_tokenize(str(value))


def get_token_describe(df, typename):
    """Tokenize the text columns of ``df`` and summarize the total token count.

    The ``content`` and ``header`` columns are always tokenized;
    ``parent_header`` is tokenized only when the frame has that column.
    Missing cells contribute zero tokens.

    Args:
        df: DataFrame with ``content`` and ``header`` columns and, optionally,
            a ``parent_header`` column. It is not modified.
        typename: Name given to the single column of the returned summary.

    Returns:
        A ``(frame, summary)`` tuple. ``frame`` is a copy of ``df`` extended
        with ``tokens`` / ``header_tokens`` (/ ``parent_header_tokens``), the
        matching ``*_token_count`` columns and ``total_token_count``.
        ``summary`` is ``describe()`` of ``total_token_count`` with its column
        renamed to ``typename``.
    """
    df = df.copy()

    # (source column, token-list column, token-count column)
    fields = [
        ("content", "tokens", "token_count"),
        ("header", "header_tokens", "header_token_count"),
    ]
    if "parent_header" in df.columns:
        fields.append(
            ("parent_header", "parent_header_tokens", "parent_header_token_count"))

    # Two passes keep the added columns in the same order as before:
    # every token-list column first, then every count column.
    for source_col, tokens_col, _ in fields:
        df[tokens_col] = df[source_col].progress_apply(_tokenize_cell)
    for _, tokens_col, count_col in fields:
        df[count_col] = df[tokens_col].progress_apply(len)

    total = df[fields[0][2]]
    for _, _, count_col in fields[1:]:
        total = total + df[count_col]
    df["total_token_count"] = total

    desc = df[["total_token_count"]].describe().rename(
        columns={"total_token_count": typename})
    return df, desc


In [None]:
# Run the token analysis once per dataset variant; each label also becomes
# the column name of that variant's summary.
token_results = {
    label: get_token_describe(frame, label)
    for label, frame in (
        ("masked", masked),
        ("unmasked", unmasked),
        ("grouped_masked", grouped_masked),
        ("grouped_unmasked", grouped_unmasked),
    )
}
masked_df, masked_desc = token_results["masked"]
unmasked_df, unmasked_desc = token_results["unmasked"]
grouped_masked_df, grouped_masked_desc = token_results["grouped_masked"]
grouped_unmasked_df, grouped_unmasked_desc = token_results["grouped_unmasked"]


In [None]:
# Put the four summaries side by side, then transpose so that each dataset
# variant is one row and each statistic one column.
summary_frames = [
    masked_desc,
    unmasked_desc,
    grouped_masked_desc,
    grouped_unmasked_desc,
]
analysis = pd.concat(summary_frames, axis="columns").transpose()
# The exported LaTeX table holds the statistics cast to integers.
analysis.astype(int).to_latex("x.tex")


In [None]:
def filter_by_token_count(df, limit, column="token_count"):
    """Count the rows whose token count is strictly greater than ``limit``.

    Args:
        df: DataFrame holding a numeric token-count column.
        limit: Threshold; only rows with a value above it are counted.
        column: Name of the column to compare against ``limit``. Defaults to
            ``"token_count"``.

    Returns:
        A ``(count, percentage)`` tuple: the number of rows above the limit
        and that number as a percentage of all rows, rounded to two decimals.
        An empty frame yields ``(0, 0.0)``.
    """
    total = df.shape[0]
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0, 0.0
    count = int((df[column] > limit).sum())
    # Scale to percent before rounding so the result carries no binary
    # floating-point residue (e.g. 7.000000000000001 instead of 7.0).
    return count, round(100 * count / total, 2)


In [None]:
# Report, for each dataset variant, how many sections exceed 512 tokens.
for label, frame in (
    ("Masked sections(>512 tokens):", masked_df),
    ("Unmasked sections(>512 tokens):", unmasked_df),
    ("Grouped masked sections(>512 tokens):", grouped_masked_df),
    ("Grouped unmasked sections(>512 tokens):", grouped_unmasked_df),
):
    print(label, filter_by_token_count(frame, 512))


### Maskeleme Analizi


In [None]:
from util.readme_parser import encodings
import re


def extract_word(text):
    """Return, in order, every "<" followed by word characters in ``text``."""
    return [match.group(1) for match in re.finditer(r"(<\w+)", text)]


# Rebuild the mapping as {mask name: "<word" prefix}: the mask name is the
# original key with every "_c" removed, the prefix is the first "<word"
# match found in the encoding string.
encodings = {
    key.replace("_c", ""): extract_word(encoding)[0]
    for key, encoding in encodings.items()
}

# Work on a copy so the loaded frame stays untouched.
masked_cp = masked.copy()
for mask, prefix in encodings.items():
    # One column per mask: occurrences of its prefix in each section's content.
    masked_cp[mask] = masked_cp["content"].str.count(prefix)

mask_columns = list(encodings)
masked_cp["mask_total"] = masked_cp[mask_columns].sum(axis=1)


In [None]:
# Sum every numeric column per repository, then restore "repo" as a column.
repo_totals = masked_cp.groupby("repo").sum(numeric_only=True)
group_by_repo = repo_totals.reset_index()
# Show the per-repository statistics without the first describe() row
# ("count") and without the first two described columns.
# NOTE(review): which columns those two are depends on the CSV schema - confirm.
group_by_repo.describe().iloc[1:, 2:].transpose()
