In [7]:
import torch
from pathlib import Path
import h5py
import json
import numpy as np

In [8]:
def hdf5_to_dict(hdf5_file):
    """Read an HDF5 file into a nested dict.

    Groups become dicts keyed by member name; datasets are read completely
    into memory with ``dataset[()]``.

    Args:
        hdf5_file: Location of the HDF5 file, passed straight to ``h5py.File``.

    Returns:
        dict: Nested dict mirroring the group hierarchy of the file.

    Raises:
        TypeError: If a member is neither an ``h5py.Group`` nor an
            ``h5py.Dataset``.
    """
    def recursively_convert(h5_obj):
        if isinstance(h5_obj, h5py.Group):
            return {key: recursively_convert(h5_obj[key]) for key in h5_obj.keys()}
        if isinstance(h5_obj, h5py.Dataset):
            # ``[()]`` reads the whole dataset eagerly, so the returned value
            # does not depend on the file staying open.
            return h5_obj[()]
        raise TypeError("Unsupported h5py object type")

    # The context manager closes the file handle even if conversion raises.
    with h5py.File(hdf5_file, "r") as h5_file:
        return recursively_convert(h5_file)

def get_clips_turn(picks, clip_length=5):
    """Cut ``picks`` into consecutive, non-overlapping clips.

    Every clip holds exactly ``clip_length`` elements; trailing elements that
    cannot fill a whole clip are left out.

    Args:
        picks: Sliceable sequence of frame indices.
        clip_length: Number of elements per clip.

    Returns:
        tuple: ``(clips, leftover)`` where ``clips`` is a list of slices of
        ``picks`` and ``leftover`` is the number of trailing elements ignored.
    """
    full_clips, leftover = divmod(len(picks), clip_length)
    clips = [
        picks[start:start + clip_length]
        for start in range(0, full_clips * clip_length, clip_length)
    ]
    return clips, leftover

def get_clips_jump(picks, num_seg=5):
    """Build strided clips by sampling one element from each of ``num_seg`` segments.

    ``picks`` is treated as ``num_seg`` back-to-back segments of
    ``len(picks) // num_seg`` elements each. Clip ``i`` consists of the
    ``i``-th element of every segment, so consecutive elements of a clip are
    ``len(picks) // num_seg`` positions apart.

    Args:
        picks: Frame indices. Must support indexing with a list of integers
            (e.g. a NumPy array); a plain Python list does not.
        num_seg: Number of segments, which is also the number of elements
            per clip.

    Returns:
        tuple: ``(clips, remainder)`` where ``clips`` is a list of
        ``picks[indices]`` selections and ``remainder`` is the number of
        trailing elements that belong to no clip.

    Raises:
        ValueError: If ``num_seg`` is smaller than 1.
    """
    if num_seg < 1:
        raise ValueError("num_seg must be a positive integer")

    num_samples = len(picks) // num_seg
    # Only the first num_samples * num_seg elements are ever indexed; the rest
    # are ignored. (Taking ``len(picks) % num_samples`` here would be wrong for
    # short inputs and would divide by zero when len(picks) < num_seg.)
    remainder = len(picks) - num_samples * num_seg

    clips = []
    for i in range(num_samples):
        indices = [i + j * num_samples for j in range(num_seg)]
        clips.append(picks[indices])
    return clips, remainder

def id_generator(dataset_name, video_name, clip_type, sample_id, remainder):
    """Build the id string of a sample.

    Layout: ``<dataset_name>_<video_name>_<8 digits>``, for example
    ``SumMe_video_1_00000000``. The trailing digits are:

    * 2 digits: clip type, ``00`` for ``"turn"`` and ``01`` for ``"jump"``;
    * 4 digits: the sample index, zero-padded;
    * 2 digits: the remainder (number of ignored frames), zero-padded.

    Padding never truncates, so the fixed width only holds while the sample
    index is at most 9999 and the remainder at most 99.

    Args:
        dataset_name: Dataset prefix, e.g. ``"SumMe"``.
        video_name: Video key, e.g. ``"video_1"``.
        clip_type: ``"turn"`` or ``"jump"``. Any other value is inserted into
            the id as given.
        sample_id: Sample index, as ``int`` or ``str``.
        remainder: Number of ignored frames, as ``int`` or ``str``.

    Returns:
        str: The assembled id.
    """
    clip_codes = {"turn": "00", "jump": "01"}
    clip_code = clip_codes.get(clip_type, clip_type)
    # str() lets callers pass plain (or NumPy) integers as well as strings.
    sample_part = str(sample_id).zfill(4)
    remainder_part = str(remainder).zfill(2)
    return f"{dataset_name}_{video_name}_{clip_code}{sample_part}{remainder_part}"

def apply_conversation_template(conversation_template, video_name):
    """Look up ``video_name`` in ``conversation_template``.

    Original author's note: different large language models have different
    requirements for the conversation format.

    Args:
        conversation_template: Container supporting ``in`` and ``[]`` lookup.
        video_name: Key to look up.

    Returns:
        The entry stored under ``video_name`` when it is present, otherwise
        ``video_name`` itself, unchanged.
    """
    if video_name not in conversation_template:
        return video_name
    return conversation_template[video_name]

In [9]:
# 1. Load the basic information of both datasets
origin_dataset_dir = "/root/autodl-tmp/data"

# Per-dataset locations: HDF5 file, name-mapping JSON and frame directory.
summe_h5_path = Path(origin_dataset_dir) / "SumMe" / "summe.h5"
tvsum_h5_path = Path(origin_dataset_dir) / "TVSum" / "tvsum.h5"
summe_json_path = Path(origin_dataset_dir) / "SumMe" / "video_name_dict.json"
tvsum_json_path = Path(origin_dataset_dir) / "TVSum" / "video_name_dict.json"
summe_frame_dir = Path(origin_dataset_dir) / "SumMe" / "frames"
tvsum_frame_dir = Path(origin_dataset_dir) / "TVSum" / "frames"

# Nested dicts holding the full contents of each HDF5 file.
summe_dict = hdf5_to_dict(summe_h5_path)
tvsum_dict = hdf5_to_dict(tvsum_h5_path)

with summe_json_path.open("r") as f:
    summe_name_dict = json.load(f)
with tvsum_json_path.open("r") as f:
    tvsum_name_dict = json.load(f)

# Inverted name mappings (value -> key). If a value occurs more than once,
# the last key seen wins.
summe_name_dict_revers = dict(zip(summe_name_dict.values(), summe_name_dict.keys()))
tvsum_name_dict_revers = dict(zip(tvsum_name_dict.values(), tvsum_name_dict.keys()))
# For inspection: summe_dict.keys(), tvsum_dict.keys(), summe_name_dict, tvsum_name_dict

In [10]:
# An example of one sample in the dataset:
# sample = {
#     "id":"SumMe_video_1_01000003",
#     "images": ["path1", "path2", "path3", "path4", "path5"],
#     "conversations":{
#       "role": "user",
#       "content": [
#           {"type": "text", "text": "What is shown in this image?"},
#           {"type": "image"},
#           {"type": "image"},
#           {"type": "image"},
#           {"type": "image"},
#           {"type": "image"},
#         ],
#     },
# }

In [22]:
# 2. Build the dataset samples
dataset_samples_turn = []
dataset_samples_jump = []

summe_videos = summe_dict.keys()
tvsum_videos = tvsum_dict.keys()

for video_name in summe_videos:
    # Frames are stored in a directory named after the inverted-mapping entry
    # for this HDF5 key.
    video_name_real = summe_name_dict_revers[video_name]
    frames_dir = summe_frame_dir / video_name_real

    video_dict = summe_dict[video_name]
    picks = video_dict["picks"]

    clips_turn, remainder_turn = get_clips_turn(picks, clip_length=5)
    clips_jump, remainder_jump = get_clips_jump(picks, num_seg=5)

    # Consecutive ("turn") clips.
    for i, clip in enumerate(clips_turn):
        sample_id = id_generator("SumMe", video_name, "turn", str(i), str(remainder_turn))
        sample = {
            "id": sample_id,
            # Frame files are named by the zero-padded 6-digit frame number.
            "images": [frames_dir / (str(frame).zfill(6) + ".jpg") for frame in clip],
        }
        dataset_samples_turn.append(sample)

    # Strided ("jump") clips.
    for i, clip in enumerate(clips_jump):
        sample_id = id_generator("SumMe", video_name, "jump", str(i), str(remainder_jump))
        sample = {
            "id": sample_id,
            "images": [frames_dir / (str(frame).zfill(6) + ".jpg") for frame in clip],
        }
        dataset_samples_jump.append(sample)

# Display a small slice of the jump samples as a sanity check.
dataset_samples_jump[-100:-90]


[{'id': 'SumMe_video_7_01010304',
  'images': [PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/001545.jpg'),
   PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/003345.jpg'),
   PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/005145.jpg'),
   PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/006945.jpg'),
   PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/008745.jpg')]},
 {'id': 'SumMe_video_7_01010404',
  'images': [PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/001560.jpg'),
   PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/003360.jpg'),
   PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/005160.jpg'),
   PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/006960.jpg'),
   PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/008760.jpg')]},
 {'id': 'SumMe_video_7_01010504',
  'images': [PosixPath('/root/autodl-tmp/data/SumMe/frames/Cockpit_Landing/001575.jp