In [47]:
import json
import os


In [48]:
# Root folder that is walked recursively for the raw per-submission JSON files.
raw_dataset_dir = r"D:\Code\DataSet\CodeForceDataSet-python-raw"
# Output folder into which the assembled dataset is saved.
# NOTE(review): both paths are machine-specific absolute Windows paths; adjust before running elsewhere.
file_dataset_dir = r"D:\Code\DataSet\CodeForceDataSet-python-file"

In [49]:
# Make sure the output folder exists before anything is written into it.
# `exist_ok=True` already makes this a no-op for an existing directory, so a
# separate `os.path.exists` pre-check is redundant (and racy). Without the
# pre-check, a path that exists but is a regular file now fails right here
# with FileExistsError instead of being silently accepted.
os.makedirs(file_dataset_dir, exist_ok=True)


In [50]:
# Accumulator for extracted records; each entry is a dict with "submission_id" and "code".
code_list = []

In [51]:
# 遍历源文件夹的所有子文件夹
# Walk every sub-folder of the raw dataset and collect the source code of each submission.
# The list is rebuilt from scratch in this cell so that re-running it does not
# append every record a second time.
code_list = []
for root, dirs, files in os.walk(raw_dataset_dir):
    for file in files:
        # Only numerically named JSON files (e.g. "150671782.json") are treated as
        # submissions; the part before the first dot is used as the submission id.
        submission_id = file.split('.')[0]
        if not (file.endswith('.json') and submission_id.isdigit()):
            continue
        # Parse the file, then release the handle before processing the record.
        with open(os.path.join(root, file), 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        # Extract the code; records with a missing, None or empty "code" are skipped.
        code = data.get('code', None)
        if code:
            code_list.append({
                "submission_id": submission_id,
                # Convert Windows "\r\n" line endings to "\n".
                "code": code.replace("\r\n", "\n")
            })

In [52]:
# Number of records collected in code_list.
len(code_list)


192719

In [53]:
# Show the first record to inspect its structure.
code_list[0]


{'submission_id': '150671782',
 'code': 'import math\nn, m, a = map(int, input().split())\nprint(math.ceil(n / a) * math.ceil(m / a))'}

In [54]:
from datasets import Dataset

In [55]:
# Turn the list of records into a column-oriented mapping (one list per column,
# in the order "submission_id", "code") and build the Dataset from it.
dataset = Dataset.from_dict({
    column: [record[column] for record in code_list]
    for column in ("submission_id", "code")
})


In [56]:
# Write the dataset into the output folder.
dataset.save_to_disk(file_dataset_dir)


Saving the dataset (0/1 shards):   0%|          | 0/192719 [00:00<?, ? examples/s]

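Deduplicate the saved dataset on its `code` column with `text_dedup`'s MinHash module by running the following command in a terminal:

python -m text_dedup.minhash --local --path "D:\Code\DataSet\CodeForceDataSet-python-file" --cache_dir "./cache" --output "output/minhash/cf_code_dedup" --column "code" --batch_size 10000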


In [58]:
def generate_trigrams(sentence):
    """Return every run of three consecutive whitespace-separated tokens.

    The text is split on any whitespace (newlines included), and each trigram
    is the three tokens joined by single spaces, in order of appearance.
    Input with fewer than three tokens yields an empty list.
    """
    tokens = sentence.split()
    return [' '.join(tokens[start:start + 3]) for start in range(len(tokens) - 2)]


In [62]:
# Try the trigram generator on a two-line snippet; tokens are split on any
# whitespace, so trigrams run across the line break.
string = """import math
n, m, a = map(int, input().split())"""
print(generate_trigrams(string))


['import math n,', 'math n, m,', 'n, m, a', 'm, a =', 'a = map(int,', '= map(int, input().split())']


In [68]:
import hashlib

def generate_hash(tuple_text):
    """Hash a piece of text to an integer.

    The text is encoded as UTF-8 and hashed with SHA-256; the 32-byte digest
    is interpreted as a big-endian unsigned integer.
    """
    digest = hashlib.sha256(tuple_text.encode('utf-8')).digest()
    return int.from_bytes(digest, 'big')

def generate_hashes_for_ngram(ngram_list, num_hashes=5):
    """Compute `num_hashes` hash values for every n-gram.

    Each n-gram is hashed once per seed index 0..num_hashes-1 by appending
    the index (as text) to the n-gram before hashing. Returns one list of
    hash values per n-gram, in input order.
    """
    hash_rows = []
    for ngram in ngram_list:
        row = []
        for seed in range(num_hashes):
            row.append(generate_hash(ngram + str(seed)))
        hash_rows.append(row)
    return hash_rows

def minhash(hash_matrix):
    """Compute a MinHash signature: the minimum of each column of `hash_matrix`.

    `hash_matrix` is a sequence of rows (one per n-gram); the result holds one
    minimum per column. An empty matrix yields an empty list.
    """
    columns = zip(*hash_matrix)
    return list(map(min, columns))

In [71]:
# Walk through MinHash on a hard-coded list of trigrams.
ngrams = ['import math n,', 'math n, m,', 'n, m, a', 'm, a =', 'a = map(int,', '= map(int, input().split())']
# One row of hash values per trigram (default number of hashes per trigram).
hashes = generate_hashes_for_ngram(ngrams)
for ngram, hash_vals in zip(ngrams, hashes):
    print(f"{ngram}: {hash_vals}")
# Column-wise minimum over all rows gives the signature.
min_hashes = minhash(hashes)
print(f"MinHash values: {min_hashes}")


import math n,: [78149134553365072019066775144341277670111421087739512054818099953271490902912, 59802554008788181017501849161104303166835184258835941971123538498749717839410, 3769228439970723341477483693524990186945019192842147305650759239360087487732, 53232963759652475888720587332114382818059579000212916713009454504634250394057, 59397435971425653888946667088231756636550816685042897983825988182294850498956]
math n, m,: [600793418204153550189536066512918301608424099041150134033652330543484825481, 97806190521931182457751456888272392718089203216732451690288889093504386853580, 38370078065514351124219195395634183032320375195922083111383374364717637694618, 64069454102526755705207035566195984050734407232165754232664927544526488439070, 104651007050237937740874437825452753794947150616383621776437477144652308200367]
n, m, a: [18913708237792113415860115878944559017424004974950826964919516411217895150353, 38051928376668428940142715993718585868142810136774274995896967437768184890350, 78057380996533