In [None]:
import torch
from datasets import Dataset, DatasetDict, IterableDataset, load_dataset,load_from_disk
import hashlib
import os
import tqdm
# Path to the torch-serialized (.pt) co-occurrence data.
file_path = "/data/changye/data/Align-Anything/Align-Anything_cooccur.pt"

# `datasets` is already imported above and resolves HF_DATASETS_CACHE when it
# is imported, so exporting the variable at this point does not redirect the
# cache by itself. The export is kept (child processes still inherit it) and
# the directory is additionally passed to load_dataset() via cache_dir.
hf_cache_dir = '/data/changye/tmp'
os.environ['HF_DATASETS_CACHE'] = hf_cache_dir

# Read the .pt file.
# NOTE(review): torch.load can unpickle arbitrary Python objects depending on
# the torch version / weights_only setting; only load files from a trusted source.
data = torch.load(file_path)

dataset_path = "/data/changye/data/Align-Anything-TI2T-Instruction-100K"
train_dataset = load_dataset(
    dataset_path,
    split="train",
    trust_remote_code=True,
    cache_dir=hf_cache_dir,
)


In [None]:
# Inspect the first row of the train split.
print(train_dataset[0])

In [None]:

# Flatten `data` (an iterable of mappings) into a single list of (key, value)
# pairs, preserving the order of the outer iterable and of each mapping's keys.
data_list = [
    (entry_key, entry[entry_key])
    for entry in data
    for entry_key in list(entry.keys())
]


In [None]:
# Number of (key, value) pairs collected in data_list.
print(len(data_list))

In [None]:
def generate_text_hash(text: str) -> str:
    """
    Return the SHA-256 fingerprint of a string.

    Args:
        text (str): Text to fingerprint; it is encoded as UTF-8 before hashing.

    Returns:
        str: Hexadecimal SHA-256 digest of the encoded text.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()

In [None]:
# 1. Index train_dataset: map hash(prompt + response) -> list of row indices
#    (in dataset order) whose text produces that hash.
dataset_dict = {}
# tqdm wraps the dataset itself (not the enumerate object) so it can read the
# length and display a total / ETA.
for j, item in enumerate(tqdm.tqdm(train_dataset, desc="Indexing train_dataset", unit="item")):
    key = generate_text_hash(item['prompt'] + item['response'])
    dataset_dict.setdefault(key, []).append(j)  # record every row index that shares this hash

# 2. Build formatted_dataset, matching entries in data_list order.
formatted_dataset = []
index_set = set()  # train_dataset indices that have already been matched

for key_datalist, value_datalist in tqdm.tqdm(data_list, desc="Processing data_list", unit="item"):
    # Keys without any matching row yield an empty candidate sequence and are skipped.
    for j in dataset_dict.get(key_datalist, ()):
        if j not in index_set:  # first candidate row that has not been used yet
            # Copy the row and attach the data_list value to it.
            # The "Coocurr" spelling is kept because later cells read this exact key.
            new_item = train_dataset[j].copy()
            new_item["Coocurr"] = value_datalist
            formatted_dataset.append(new_item)

            # Mark the row as consumed so a duplicate key maps to the next row.
            index_set.add(j)
            break  # one match per data_list entry


In [None]:
# Show how many entries were matched and inspect the last one.
print(len(formatted_dataset))
print(formatted_dataset[-1])
# print(len(dataset_dict))

In [None]:
# Path to the text file holding one "key,value" pair per line.
cosi_file_path = "/data/changye/data/Align-Anything/Align-Anything_cosi_weight/cosi_feature_list.txt"  # replace with your own file path

# Parse the file into a dict of {int key: float value}.
osi_dict = {}
with open(cosi_file_path, "r") as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line (e.g. trailing newline padding) would otherwise make
            # the two-way unpacking below raise ValueError.
            continue
        key, value = line.split(",")  # split each line on the comma
        osi_dict[int(key)] = float(value)    # key as int, value as float

# Show the parsed mapping.
print(osi_dict)

In [None]:
# NOTE(review): cosi_coocur_data is created empty here and is not populated in
# any of the visible cells, although later cells read from it — confirm.
cosi_coocur_data = {}
set_cosi_key = set(osi_dict.keys())

# For every entry, add up the osi_dict weights of the values listed under
# "Coocurr" that have a weight, and store the total under "Cooccur_score".
for f_data in tqdm.tqdm(formatted_dataset):
    total = 0
    weighted_values = (v for v in f_data['Coocurr'] if v in set_cosi_key)
    for v in weighted_values:
        total += osi_dict[v]
    f_data["Cooccur_score"] = total

In [None]:
# Inspect the last entry of formatted_dataset.
print(formatted_dataset[-1])

In [14]:
# Convert the list of dict entries into a Hugging Face Dataset and save it to disk.
dataset = Dataset.from_list(formatted_dataset)
dataset.save_to_disk("/data/changye/data/Align-Anything-cosi-full")

Saving the dataset (0/13 shards):   0%|          | 0/99160 [00:00<?, ? examples/s]

In [None]:
# 1. Sort the entries by "Cooccur_score", highest score first.
formatted_dataset_sorted = sorted(formatted_dataset, key=lambda x: x["Cooccur_score"], reverse=True)

# 2. Compute the quartile boundaries (as list indices).
num_samples = len(formatted_dataset_sorted)
q1 = int(num_samples * 0.25)
q2 = int(num_samples * 0.5)
q3 = int(num_samples * 0.75)

# Slice the sorted list into four consecutive quartiles; "q0_25" holds the
# highest-scoring quarter.
split_datasets = {
    "q0_25": formatted_dataset_sorted[:q1],
    "q25_50": formatted_dataset_sorted[q1:q2],
    "q50_75": formatted_dataset_sorted[q2:q3],
    "q75_100": formatted_dataset_sorted[q3:]
}

# 3. Convert each quartile to a Hugging Face Dataset and save it, with a progress bar.
hf_datasets = {}
for split_name, split_data in tqdm.tqdm(split_datasets.items(), desc="Processing splits", unit="split"):
    # Dataset.from_list builds the same columns from a list of dicts and, unlike
    # indexing split_data[0], does not raise IndexError when a quartile is empty
    # (which happens when there are fewer than four entries).
    hf_datasets[split_name] = Dataset.from_list(split_data)

    # Save this quartile to disk.
    hf_datasets[split_name].save_to_disk(f"./{split_name}_dataset")

# Print a summary of every split. The loop variable is deliberately not called
# `dataset`, so it does not overwrite the notebook-level `dataset` variable.
for split_name, split_dataset in hf_datasets.items():
    print(f"{split_name} dataset:")
    print(split_dataset)

In [None]:
# Last entry of the descending sort, i.e. the lowest Cooccur_score.
print(formatted_dataset_sorted[-1])

In [None]:
# First entry of the descending sort, i.e. the highest Cooccur_score.
print(formatted_dataset_sorted[0])

In [None]:
# Look up a single entry by key.
# NOTE(review): in the visible cells cosi_coocur_data is only ever assigned an
# empty dict, so this lookup would raise KeyError unless it is populated
# elsewhere — confirm.
print(cosi_coocur_data['15557.jpg'])

In [None]:
# Original task description (translated): I have a dict keyed by 'image_name'
# whose values are floats, and I want to sort it by value. I also have a dataset
# indexed by 'image_name', with the structure Dataset({
#     features: ['image', 'question', 'chosen', 'rejected', 'image_name'],
#     num_rows: 93258
# })
# I want to filter and save this dataset following that ordering, storing the
# top 25%, 25%-50%, and so on as Hugging Face datasets.
#
# NOTE(review): in the visible cells cosi_coocur_data is only ever assigned an
# empty dict; if it stays empty, every range below is (0, 0) and four empty
# datasets are saved — confirm where it is populated.
# NOTE(review): this assumes train_dataset has an 'image_name' column — confirm.

# Step 1: Sort image_names by their score in cosi_coocur_data, highest first
sorted_image_names = sorted(cosi_coocur_data, key=cosi_coocur_data.get, reverse=True)

# Step 2: Compute indices for the percentiles
total_images = len(sorted_image_names)
percentiles = [0.25, 0.5, 0.75, 1.0]
indices = [int(total_images * p) for p in percentiles]

# Step 3: Create (start, end) index ranges for each percentile bucket
start_indices = [0] + indices[:-1]
end_indices = indices
ranges = list(zip(start_indices, end_indices))

# Step 4: Filter and save datasets for each percentile range
for idx, (start, end) in enumerate(ranges):
    # Get the image_names for the current percentile range
    image_name_set = set(sorted_image_names[start:end])

    # Filter on the 'image_name' column only: input_columns passes just that
    # column to the predicate, so the remaining columns do not have to be
    # loaded/decoded for every row.
    percentile_dataset = train_dataset.filter(
        lambda image_name: image_name in image_name_set,
        input_columns=["image_name"],
    )

    # Save the filtered dataset; the label is the upper bound of the bucket
    # (e.g. "50%" is the 25%-50% slice).
    percentile_label = f"{int(percentiles[idx]*100)}%"
    percentile_dataset.save_to_disk(f'percentile_{percentile_label}_dataset')

    print(f"Saved {percentile_label} dataset with {len(percentile_dataset)} samples.")

In [None]:
# Show the first value stored in cosi_coocur_data.
# NOTE(review): in the visible cells cosi_coocur_data is only ever assigned an
# empty dict, in which case indexing [0] raises IndexError — confirm it is
# populated elsewhere.
print(list(cosi_coocur_data.values())[0])

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline
from collections import Counter

# Collect the per-entry co-occurrence scores to plot.
values_list = [entry["Cooccur_score"] for entry in formatted_dataset_sorted]

# Number of histogram bins.
num_bins = 10

# Build the histogram.
frequencies, bin_edges = np.histogram(values_list, bins=num_bins)

# Center of each bin (used as the x positions for interpolation).
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# Turn the histogram into a smooth curve with spline interpolation.
# NOTE: a spline through the bin counts can overshoot, so the curve may dip
# slightly below zero between bins.
bin_centers_smooth = np.linspace(bin_centers[0], bin_centers[-1], 300)  # interpolation points
frequencies_smooth = make_interp_spline(bin_centers, frequencies)(bin_centers_smooth)

# Plot. The curve is given a label so that plt.legend() below has an artist to
# show; without one matplotlib warns and draws an empty legend.
plt.figure(figsize=(8, 6))
plt.plot(bin_centers_smooth, frequencies_smooth, color='orange', lw=2, label='Smoothed histogram')


# Styling.
plt.grid(which='both', linestyle='--', linewidth=0.5, alpha=0.7)
plt.xlabel('Cooccuring score of data ', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.title('The distribution of Align-Anything data based on cooccuring score', fontsize=16)
plt.legend(fontsize=12)
plt.show()
