In [1]:
import itertools as it
import random
from collections import defaultdict
from typing import Iterator

import pandas as pd
import seaborn as sns
from entitybert.selection import prepare_file_ranker_df
from tqdm import tqdm

In [14]:
class ItemLookup[T]:
    def __init__(self):
        self._items: dict[int, list[T]] = defaultdict(list)

    def add_item(self, value: int, item: T):
        self._items[value].append(item)

    def within(self, value_range: range) -> Iterator[T]:
        for value in value_range:
            yield from self._items[value]


class File:
    """Record describing one file by its id and two integer measurements.

    Attributes are stored exactly as given. Equality and hashing are the
    default identity-based ones, so two records with the same field values
    are still distinct set members.
    """

    def __init__(self, id: int, lloc: int, entities: int):
        # lloc: presumably logical lines of code -- confirm with the data source.
        # entities: presumably the number of code entities in the file -- confirm.
        self.id, self.lloc, self.entities = id, lloc, entities

    def __repr__(self) -> str:
        """Return a constructor-style string listing the three fields."""
        return f"File(id={self.id}, lloc={self.lloc}, entities={self.entities})"


class FileLookup:
    """Set of files with unique ids, searchable by lloc and entities ranges."""

    def __init__(self):
        self._files: dict[int, File] = dict()
        # The same files as ``_files.values()``, in insertion order. Kept as a
        # list so ``rand_file`` can pick one without copying the whole dict on
        # every call.
        self._file_list: list[File] = []
        self._by_lloc: ItemLookup[File] = ItemLookup()
        self._by_entities: ItemLookup[File] = ItemLookup()

    def add_file(self, file: File):
        """Add ``file`` to all indexes.

        Raises:
            ValueError: if a file with the same id was already added.
        """
        if file.id in self._files:
            raise ValueError("duplicate file id")
        self._files[file.id] = file
        self._file_list.append(file)
        self._by_lloc.add_item(file.lloc, file)
        self._by_entities.add_item(file.entities, file)

    def rand_file(self) -> File:
        """Return one stored file chosen with the global ``random`` generator.

        Raises:
            IndexError: if no file has been added (raised by ``random.choice``).
        """
        return random.choice(self._file_list)

    def within(self, lloc_range: range, entities_range: range) -> set[File]:
        """Return the files whose lloc is in ``lloc_range`` and whose entities
        value is in ``entities_range``."""
        lloc = self._by_lloc.within(lloc_range)
        entities = self._by_entities.within(entities_range)
        # A file qualifies only if it shows up under both criteria.
        return set(lloc) & set(entities)


class ProjectLookup:
    """Per-project file indexes plus random sampling of size-matched file pairs.

    All randomness comes from the global ``random`` module, so results are
    repeatable when the caller seeds it.
    """

    def __init__(self):
        # A FileLookup is created automatically the first time a project
        # name is used as a key.
        self._projects: dict[str, FileLookup] = defaultdict(FileLookup)

    def add_file(self, project: str, file: File):
        """Add ``file`` to the index of ``project``.

        Raises:
            ValueError: if ``project`` already holds a file with the same id.
        """
        self._projects[project].add_file(file)

    def rand_project(self) -> str:
        """Return the name of one known project, chosen at random."""
        return random.choice(list(self._projects))

    def rand_file(self, project: str) -> File:
        """Return a random file of ``project``."""
        return self._projects[project].rand_file()

    def rand_file_within_range(
        self, project: str, lloc_range: range, entities_range: range
    ) -> File | None:
        """Return a random file of ``project`` matching both ranges.

        Returns ``None`` when no file of the project has its lloc in
        ``lloc_range`` and its entities value in ``entities_range``.
        """
        files = self._projects[project].within(lloc_range, entities_range)
        if not files:
            return None
        # File hashes by object identity, so the iteration order of this set
        # changes from run to run. Sort by id so that a seeded RNG always
        # picks the same file.
        return random.choice(sorted(files, key=lambda file: file.id))

    def rand_file_pair(
        self, lloc_tol: int, entities_tol: int
    ) -> tuple[File, File] | None:
        """Try once to draw a pair of distinct files of similar size.

        Two projects are drawn independently (they may be the same one). A
        file is drawn from the first, then a file from the second whose lloc
        is within ``lloc_tol`` and whose entities value is within
        ``entities_tol`` of the first file's values (lower bounds are clamped
        at 0).

        Returns ``None`` when the second project has no matching file or when
        both draws have the same id.
        """
        a_project = self.rand_project()
        b_project = self.rand_project()
        a_file = self.rand_file(a_project)
        lloc_range = range(max(0, a_file.lloc - lloc_tol), a_file.lloc + lloc_tol + 1)
        entities_range = range(
            max(0, a_file.entities - entities_tol), a_file.entities + entities_tol + 1
        )
        b_file = self.rand_file_within_range(b_project, lloc_range, entities_range)
        if b_file is None:
            return None
        if a_file.id == b_file.id:
            return None
        return (a_file, b_file)

    def sample_n_pairs(
        self,
        lloc_tol: int,
        entities_tol: int,
        n: int,
        *,
        max_attempts: int | None = None,
    ) -> list[tuple[File, File]]:
        """Draw ``n`` pairs of similar-sized files with no file id used twice.

        Pairs are returned in the order they were accepted, so the result is
        repeatable under a seeded RNG.

        Args:
            lloc_tol: allowed absolute difference in lloc within a pair.
            entities_tol: allowed absolute difference in entities within a pair.
            n: number of pairs to return.
            max_attempts: optional cap on the number of draws. ``None`` (the
                default) retries without limit, which never terminates if
                ``n`` pairs cannot be formed from the indexed files.

        Raises:
            RuntimeError: if ``max_attempts`` draws did not yield ``n`` pairs.
        """
        used_ids: set[int] = set()
        # A list, not a set: ids are never reused, so no duplicate pair can
        # occur, and a set of identity-hashed Files has an unstable order.
        pairs: list[tuple[File, File]] = []
        attempts = 0
        while len(pairs) < n:
            if max_attempts is not None and attempts >= max_attempts:
                raise RuntimeError(
                    f"only {len(pairs)} of {n} pairs found in {max_attempts} attempts"
                )
            attempts += 1
            pair = self.rand_file_pair(lloc_tol, entities_tol)
            if pair is None:
                continue
            first, second = pair
            if first.id in used_ids or second.id in used_ids:
                continue
            used_ids.update((first.id, second.id))
            pairs.append(pair)
        return pairs

In [3]:
# Read one database path per line (trailing whitespace stripped) and keep the
# paths in sorted order.
with open(".data/dbs_test.txt") as f:
    db_paths = sorted(path_line.rstrip() for path_line in f)

In [4]:
# Seed the global RNG before shuffling so the in-place shuffle of the sorted
# path list gives the same order on every run.
random.seed(0)
random.shuffle(db_paths)

In [5]:
# Build one frame per database, tag each with the database path it came from,
# and stack them into a single frame with a fresh 0..N-1 index.
dfs = []

for db_path in tqdm(db_paths):
    project_df = prepare_file_ranker_df(db_path)
    if project_df is not None:
        # The database path is used as the project name (first column).
        project_df.insert(0, "project", db_path)
        dfs.append(project_df)

df = pd.concat(dfs, ignore_index=True)
df

100%|██████████| 645/645 [05:35<00:00,  1.92it/s]


Unnamed: 0,project,filename,loc,lloc,entities,content
0,.data/dbs/AigeStudio/WheelPicker.db,Demo/src/main/java/com/aigestudio/wheelpicker/...,69,53,9,package com.aigestudio.wheelpicker.demo;\n\nim...
1,.data/dbs/AigeStudio/WheelPicker.db,WheelPicker/src/main/java/com/aigestudio/wheel...,28,19,6,package com.aigestudio.wheelpicker.model;\n\ni...
2,.data/dbs/AigeStudio/WheelPicker.db,WheelPicker/src/main/java/com/aigestudio/wheel...,29,19,6,package com.aigestudio.wheelpicker.model;\n\n\...
3,.data/dbs/AigeStudio/WheelPicker.db,WheelPicker/src/main/java/com/aigestudio/wheel...,592,497,88,package com.aigestudio.wheelpicker.widgets;\n\...
4,.data/dbs/AigeStudio/WheelPicker.db,WheelPicker/src/main/java/com/aigestudio/wheel...,117,84,17,package com.aigestudio.wheelpicker.widgets;\n\...
...,...,...,...,...,...,...
101918,.data/dbs/locationtech/geogig.db,src/storage/temporary-rocksdb/src/main/java/or...,152,117,22,/* Copyright (c) 2016 Boundless and others.\n ...
101919,.data/dbs/locationtech/geogig.db,src/storage/temporary-rocksdb/src/main/java/or...,30,13,2,/* Copyright (c) 2019 Boundless and others.\n ...
101920,.data/dbs/locationtech/geogig.db,src/storage/temporary-rocksdb/src/main/java/or...,238,201,23,/* Copyright (c) 2016 Boundless and others.\n ...
101921,.data/dbs/locationtech/geogig.db,src/storage/temporary-rocksdb/src/main/java/or...,107,70,9,/* Copyright (c) 2016 Boundless and others.\n ...


In [6]:
# Persist the combined frame as CSV; index=False leaves the row index out of
# the file.
df.to_csv("fileranker.csv", index=False)

In [7]:
# Write the distinct project names, sorted, one per line (no trailing newline).
with open("project_names.txt", "w") as f:
    unique_projects = sorted(set(df["project"]))
    f.write("\n".join(unique_projects))

In [20]:
# One sixty-fourth of the standard deviation of the "entities" column.
df["entities"].std() / 64

0.3585340088384863

In [21]:
# One sixty-fourth of the standard deviation of the "lloc" column.
df["lloc"].std() / 64

2.881388745153522

In [22]:
# Index every row of df as a File under its project. The row's index label is
# used as the file id, so a sampled File maps back to its row via df.loc.
project_lookup = ProjectLookup()

# Walk only the columns that are needed instead of df.iterrows(): iterrows
# builds a full object Series per row (including the large "content" column),
# which is slow on a frame of this size. The int() calls make sure File always
# holds plain Python ints.
for ix, project, lloc, entities in zip(
    df.index, df["project"], df["lloc"], df["entities"]
):
    project_lookup.add_file(project, File(int(ix), int(lloc), int(entities)))

In [23]:
# Sample 800 file pairs whose lloc differs by at most 3 and whose entities
# value is identical.
# NOTE(review): the tolerances presumably come from the std/64 values computed
# earlier in the notebook, rounded to integers -- confirm.
pairs = project_lookup.sample_n_pairs(lloc_tol=3, entities_tol=0, n=800)

In [24]:
# Take the tenth sampled pair and display the two df rows its file ids refer to.
pair = pairs[9]
df.loc[[member.id for member in pair]]

Unnamed: 0,project,filename,loc,lloc,entities,content
33265,.data/dbs/classgraph/classgraph.db,src/main/java/io/github/classgraph/MappableInf...,121,42,7,/*\n * This file is part of ClassGraph.\n *\n ...
27502,.data/dbs/mezz/JustEnoughItems.db,Forge/src/main/java/mezz/jei/forge/plugins/for...,55,45,7,package mezz.jei.forge.plugins.forge;\n\nimpor...


In [25]:
# Print the "content" cell of the row belonging to the pair's first file.
print(df.loc[pair[0].id, "content"])

/*
 * This file is part of ClassGraph.
 *
 * Author: Luke Hutchison
 *
 * Hosted at: https://github.com/classgraph/classgraph
 *
 * --
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2019 Luke Hutchison
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without
 * limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
 * LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.

In [26]:
# Print the "content" cell of the row belonging to the pair's second file.
print(df.loc[pair[1].id, "content"])

package mezz.jei.forge.plugins.forge;

import mezz.jei.api.IModPlugin;
import mezz.jei.api.JeiPlugin;
import mezz.jei.api.constants.ModIds;
import mezz.jei.api.registration.IRuntimeRegistration;
import mezz.jei.forge.events.RuntimeEventSubscriptions;
import mezz.jei.forge.startup.EventRegistration;
import mezz.jei.gui.startup.JeiEventHandlers;
import mezz.jei.gui.startup.JeiGuiStarter;
import mezz.jei.gui.startup.ResourceReloadHandler;
import net.minecraft.resources.ResourceLocation;
import net.minecraftforge.common.MinecraftForge;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import javax.annotation.Nullable;
import java.util.Optional;

@JeiPlugin
public class ForgeGuiPlugin implements IModPlugin {
	private static final Logger LOGGER = LogManager.getLogger();
	private static @Nullable ResourceReloadHandler resourceReloadHandler;

	private final RuntimeEventSubscriptions runtimeSubscriptions = new RuntimeEventSubscriptions(MinecraftForge.EVENT_BUS