In [1]:
import pandas as pd
from e2e_system.reproder import _ground_truth
import numpy as np


In [2]:
# Read the intersection, min_intersection and union JSON files from the
# processed label directory into one DataFrame each.
intersection_df = pd.read_json(
    path_or_buf="data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/intersection.json"
)
min_intersection_df = pd.read_json(
    path_or_buf="data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/min_intersection.json"
)
union_df = pd.read_json(
    path_or_buf="data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/union.json"
)


In [3]:
def process(df, label_column_name):
    """Expand a column of label lists into boolean flag columns and rank rows.

    For every label in ``_ground_truth`` a boolean column of that name is
    added; it is True on the rows whose ``label_column_name`` cell contains
    that label. A ``score`` column holds the number of flags set per row and
    the result is sorted by ``score``, highest first.

    Args:
        df: Input frame. It is copied, never modified.
        label_column_name: Name of the column whose cells hold an iterable of
            label names. ``None``/NaN cells are treated as "no labels".

    Returns:
        A new DataFrame with the original columns, one boolean column per
        ground-truth label and a ``score`` column, ordered by descending
        ``score``.

    Notes:
        Labels that are not in ``_ground_truth`` never counted towards
        ``score``. They are reported with a warning and skipped instead of
        silently creating extra, partially filled columns.
    """
    import warnings

    truth_labels = list(_ground_truth)
    label_df = df.copy()

    # Normalise every cell to a plain list once, so a missing cell (None/NaN)
    # means "no labels" instead of raising while being iterated.
    per_row_labels = [
        [] if cell is None or (isinstance(cell, float) and cell != cell) else list(cell)
        for cell in label_df[label_column_name]
    ]

    unknown = sorted({
        str(label)
        for labels in per_row_labels
        for label in labels
        if label not in truth_labels
    })
    if unknown:
        warnings.warn(
            "Ignoring labels missing from _ground_truth: {}".format(unknown),
            stacklevel=2,
        )

    # Whole-column assignment is positional, so rows that share an index value
    # keep their own flags (a per-row ``.loc[idx, label]`` write would set the
    # flag on every row carrying that index value).
    for label in truth_labels:
        label_df[label] = [label in labels for labels in per_row_labels]

    label_df["score"] = label_df[truth_labels].sum(axis=1)
    return label_df.sort_values("score", ascending=False)


In [4]:
# Build the flag/score table for each loaded frame; the keyword argument names
# the column that holds the labels of each row.
intersection_df_processed = process(
    intersection_df, label_column_name="intersection")
min_intersection_df_processed = process(
    min_intersection_df, label_column_name="min_intersection")
union_df_processed = process(union_df, label_column_name="union")


In [5]:
# Distinct values of the `repo` column of the union table.
repositories = union_df["repo"].unique()


In [6]:
# Sections table; the training-frame builder below filters it by its `repo`
# column.
sections = pd.read_csv(
    filepath_or_buffer="data/acl/sections/sections_grouped.csv")


In [7]:
from util.readme_parser import ReadmeSectionParser


def process(df):
    """Turn a scored repository table into a two-column text/label frame.

    For each row of ``df`` the rows of the module-level ``sections`` frame
    with the same ``repo`` are merged into one string by
    ``ReadmeSectionParser.merge_sections`` (keys ``header`` and ``content``,
    ``with_newline=True``). That string becomes ``text`` and the row's
    ``score`` becomes ``label``.

    NOTE: this definition reuses the name ``process`` of the two-argument
    labelling function defined in an earlier cell, so after this cell runs
    the earlier function can no longer be called under that name.

    Args:
        df: Frame with at least ``repo`` and ``score`` columns.

    Returns:
        A new DataFrame with columns ``text`` and ``label``, one row per row
        of ``df`` in the same order.
    """
    records = []
    for _, entry in df.iterrows():
        # All section rows that belong to this entry's repository.
        repo_sections = sections[sections.repo == entry.repo]
        merged_text = ReadmeSectionParser.merge_sections(
            pd.DataFrame(repo_sections),
            keys=["header", "content"],
            with_newline=True,
        )
        records.append([merged_text, entry.score])

    return pd.DataFrame(records, columns=["text", "label"])


In [8]:
# Convert each processed table into its text/label training frame, in the
# order intersection, min_intersection, union.
intersection_df_train, min_intersection_df_train, union_df_train = [
    process(processed_frame)
    for processed_frame in (
        intersection_df_processed,
        min_intersection_df_processed,
        union_df_processed,
    )
]


In [9]:
# Add a `content_length` column to every training frame: the number of pieces
# obtained by splitting the text on single space characters.
# NOTE: `split(" ")` breaks on " " only (not on newlines or tabs) and yields
# empty pieces for consecutive spaces, so this is a rough length measure.
for train_frame in (union_df_train, intersection_df_train, min_intersection_df_train):
    train_frame["content_length"] = train_frame["text"].apply(
        lambda text: len(text.split(" ")))

# Summary statistics of the lengths in the union frame.
union_df_train["content_length"].describe()


count      207.000000
mean      1152.053140
std       1389.043934
min         23.000000
25%        401.000000
50%        856.000000
75%       1254.500000
max      11425.000000
Name: content_length, dtype: float64

In [10]:
# Keep only the rows whose content_length is at most 4096 in each frame.
union_df_train = union_df_train[
    union_df_train["content_length"].le(4096)]
intersection_df_train = intersection_df_train[
    intersection_df_train["content_length"].le(4096)]
min_intersection_df_train = min_intersection_df_train[
    min_intersection_df_train["content_length"].le(4096)]


In [11]:
# Summary statistics of the union frame's lengths after the filter.
union_df_train["content_length"].describe()


count     200.00000
mean      948.66000
std       737.69451
min        23.00000
25%       394.50000
50%       806.00000
75%      1224.50000
max      3704.00000
Name: content_length, dtype: float64

In [12]:
# Write each training frame to its CSV file, without the index column.
intersection_df_train.to_csv(
    path_or_buf="data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/intersection_train.csv",
    index=False,
)
min_intersection_df_train.to_csv(
    path_or_buf="data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/min_intersection_train.csv",
    index=False,
)
union_df_train.to_csv(
    path_or_buf="data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed/union_train.csv",
    index=False,
)


In [13]:
# Bundle the three training CSV files into a single zip archive.
import zipfile

processed_dir = "data/acl/sections/labelled/manuel/sections_grouped_orderby-stars/processed"
train_csv_names = (
    "intersection_train.csv",
    "min_intersection_train.csv",
    "union_train.csv",
)

# ZIP_DEFLATED compresses the members; the ZipFile default (ZIP_STORED) would
# only store the text files uncompressed.
with zipfile.ZipFile(processed_dir + "/train_data.zip", "w",
                     compression=zipfile.ZIP_DEFLATED) as zipf:
    for csv_name in train_csv_names:
        # arcname stores each file under its bare name, so the archive
        # contains no directory tree.
        zipf.write(processed_dir + "/" + csv_name, arcname=csv_name)