In [1]:
import torch
from pathlib import Path
import h5py
import json
import numpy as np

In [2]:
def hdf5_to_dict(hdf5_file):
    """Read an HDF5 file into a nested Python dict.

    Groups become dicts keyed by their member names and datasets are read
    fully into memory via ``dataset[()]``.

    Args:
        hdf5_file: Path (or any value accepted by ``h5py.File``) of the
            HDF5 file to open in read-only mode.

    Returns:
        A nested dict mirroring the group/dataset hierarchy of the file.

    Raises:
        TypeError: If the file contains an object that is neither a group
            nor a dataset.
    """
    def recursively_convert(h5_obj):
        if isinstance(h5_obj, h5py.Group):
            return {key: recursively_convert(h5_obj[key]) for key in h5_obj.keys()}
        elif isinstance(h5_obj, h5py.Dataset):
            # `[()]` loads the whole dataset into memory, so the returned value
            # stays valid after the file is closed.
            return h5_obj[()]
        else:
            raise TypeError("Unsupported h5py object type")

    # The context manager closes the file handle even if conversion fails.
    with h5py.File(hdf5_file, "r") as h5_root:
        return recursively_convert(h5_root)

def get_clips_turn(picks, clip_length=5):
    """Split ``picks`` into consecutive clips of ``clip_length`` frames.

    Trailing frames that cannot fill a complete clip are dropped.

    Args:
        picks: Sliceable sequence of frame indices.
        clip_length: Number of frames per clip.

    Returns:
        A tuple ``(clips, reminder)`` where ``clips`` is the list of slices
        and ``reminder`` is the number of trailing frames that were dropped.
    """
    total = len(picks)
    reminder = total % clip_length
    usable = total - reminder
    clips = [
        picks[start:start + clip_length]
        for start in range(0, usable, clip_length)
    ]
    return clips, reminder

def get_clips_jump(picks, num_seg=5):
    """Split ``picks`` into ``num_seg`` segments and build strided clips.

    ``picks`` is viewed as ``num_seg`` equally long segments of
    ``num_samples = len(picks) // num_seg`` frames. Clip ``i`` takes the
    ``i``-th frame of every segment, so each clip holds ``num_seg`` frames
    spaced ``num_samples`` apart.

    Args:
        picks: Frame indices. Must support indexing with a list of integers
            (e.g. a NumPy array), because clips are taken as
            ``picks[indices]``.
        num_seg: Number of segments, which is also the number of frames per
            clip.

    Returns:
        A tuple ``(clips, reminder)`` where ``clips`` is a list of
        ``num_samples`` clips and ``reminder`` is the number of trailing
        frames that belong to no clip.
    """
    num_samples = len(picks) // num_seg
    # num_samples * num_seg frames are consumed, so the leftover is
    # len(picks) % num_seg. Taking the modulo by num_samples would report a
    # wrong count and divide by zero when there are fewer than num_seg frames.
    reminder = len(picks) % num_seg
    clips = []
    for i in range(num_samples):
        indices = [i + j * num_samples for j in range(num_seg)]
        clips.append(picks[indices])
    return clips, reminder

def id_generator(dataset_name, video_name, clip_type, sample_id, remainder):
    """Build the identifier string of one sample.

    Example: ``SumMe_video_1_00000000``. The trailing digits are laid out as:

    * two digits for the clip type: ``00`` for "turn", ``01`` for "jump"
      (any other ``clip_type`` value is inserted unchanged);
    * ``sample_id`` zero-padded to four digits;
    * ``remainder`` (number of ignored frames) zero-padded to two digits.

    The layout assumes fewer than 9999 samples and a remainder below 99;
    longer values are not truncated, they simply widen the id.

    Args:
        dataset_name: Dataset prefix, e.g. ``"SumMe"``.
        video_name: Video key inside the dataset.
        clip_type: ``"turn"`` or ``"jump"``.
        sample_id: Index of the sample, as a string.
        remainder: Number of ignored frames, as a string.

    Returns:
        The assembled id string.
    """
    type_code = "00" if clip_type == "turn" else "01" if clip_type == "jump" else clip_type
    suffix = type_code + sample_id.zfill(4) + remainder.zfill(2)
    return f"{dataset_name}_{video_name}_" + suffix

def apply_conversation_template(llm_name, num_images, prompt):
    """Build the conversation structure expected by a given LLM.

    Different large language models require different conversation layouts;
    only ``"llava-next"`` is supported here.

    Args:
        llm_name: Name of the target model.
        num_images: Number of ``{"type": "image"}`` placeholders to append
            after the text prompt.
        prompt: Text of the user message.

    Returns:
        A list with a single user message whose ``content`` holds the text
        entry followed by ``num_images`` image placeholders.

    Raises:
        ValueError: If ``llm_name`` is not a supported model.
    """
    if llm_name == "llava-next":
        return [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    # One placeholder per image, repeated num_images times.
                    *[{"type": "image"} for _ in range(num_images)],
                ],
            },
        ]
    # Fail explicitly instead of hitting an unbound local variable.
    raise ValueError(f"Unsupported llm_name: {llm_name!r}")

In [3]:
# 1 Load the basic information
origin_dataset_dir = "/root/autodl-tmp/data"

# Per-dataset root folders; every input file lives below one of them.
_summe_root = Path(origin_dataset_dir) / "SumMe"
_tvsum_root = Path(origin_dataset_dir) / "TVSum"

summe_h5_path = _summe_root / "summe.h5"
tvsum_h5_path = _tvsum_root / "tvsum.h5"
summe_json_path = _summe_root / "video_name_dict.json"
tvsum_json_path = _tvsum_root / "video_name_dict.json"
summe_frame_dir = _summe_root / "frames"
tvsum_frame_dir = _tvsum_root / "frames"

# Nested dicts mirroring the content of the two h5 files.
summe_dict = hdf5_to_dict(summe_h5_path)
tvsum_dict = hdf5_to_dict(tvsum_h5_path)

# Name mappings stored next to each h5 file.
summe_name_dict = json.loads(summe_json_path.read_text())
tvsum_name_dict = json.loads(tvsum_json_path.read_text())

# Invert summe_name_dict and tvsum_name_dict (value -> key).
summe_name_dict_revers = dict(zip(summe_name_dict.values(), summe_name_dict.keys()))
tvsum_name_dict_revers = dict(zip(tvsum_name_dict.values(), tvsum_name_dict.keys()))

In [4]:
# An example of one dataset sample:
# sample = {
#     "id": "SumMe_video_1_01000003",
#     "images": ["path1", "path2", "path3", "path4", "path5"],
#     "conversation": [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "text", "text": "What is shown in this image?"},
#                 {"type": "image"},
#                 {"type": "image"},
#                 {"type": "image"},
#                 {"type": "image"},
#                 {"type": "image"},
#             ],
#         },
#     ],
# }

In [5]:
# 2 Build the datasets
# context_prompt = "If you were a law enforcement agency, how would you rate the scene described on a scale from 0 to 1, with 0 representing a standard scene and 1 denoting a scene with suspicious activities?"
context_prompt = "You are a professional short film editor and director. Now you need to do some video summarization work. I will give you some frames of a short video. Please rate the frames based on their representativeness, diversity, and interest. You may need to refer to the context for rating. The rating is from 0 to 1, where 0 means not to be kept and 1 means to be kept."

# NOTE(review): the prompt below contains the typo "framse". It is kept
# byte-identical because changing it would alter the generated dataset.
format_prompt = "Please provide the response in the form of a Python list based on the framse and respond with just only one number for one frame in the provided list below [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] without any textual explanation. It should begin with '[' and end with ']'. Apart from the score, do not reply anything else."

finaly_prompt = context_prompt + " " + format_prompt


def build_dataset_samples(dataset_name, videos, dataset_dict, real_names,
                          frame_root, prompt, frames_per_clip=5):
    """Build the "turn" and "jump" samples for every video of one dataset.

    Args:
        dataset_name: Prefix used in the sample ids, e.g. ``"SumMe"``.
        videos: Iterable of video keys to process.
        dataset_dict: Mapping from video key to a dict holding ``"picks"``.
        real_names: Mapping from video key to the name of its frame folder.
        frame_root: Directory containing one frame folder per video.
        prompt: Text prompt stored in every sample's conversation.
        frames_per_clip: Frames per clip, used both as the clip length for
            "turn" clips and as the segment count for "jump" clips.

    Returns:
        A tuple ``(turn_samples, jump_samples)`` of sample-dict lists.
    """
    turn_samples, jump_samples = [], []
    for video_name in videos:
        frames_dir = Path(frame_root, real_names[video_name])
        picks = dataset_dict[video_name]["picks"]

        clips_turn, remainder_turn = get_clips_turn(picks, clip_length=frames_per_clip)
        clips_jump, remainder_jump = get_clips_jump(picks, num_seg=frames_per_clip)

        for clip_type, clips, remainder, bucket in (
            ("turn", clips_turn, remainder_turn, turn_samples),
            ("jump", clips_jump, remainder_jump, jump_samples),
        ):
            for i, clip in enumerate(clips):
                bucket.append({
                    "id": id_generator(dataset_name, video_name, clip_type, str(i), str(remainder)),
                    # Frame files are named by the zero-padded six-digit frame index.
                    "images": [str(Path(frames_dir, f"{str(frame).zfill(6)}.jpg")) for frame in clip],
                    # One image placeholder per frame actually present in the clip.
                    "conversation": apply_conversation_template("llava-next", len(clip), prompt),
                })
    return turn_samples, jump_samples


summe_videos = summe_dict.keys()
tvsum_videos = tvsum_dict.keys()

# Build the SumMe dataset
dataset_samples_turn_summe, dataset_samples_jump_summe = build_dataset_samples(
    "SumMe", summe_videos, summe_dict, summe_name_dict_revers, summe_frame_dir, finaly_prompt
)

# Build the TVSum dataset
dataset_samples_turn_tvsum, dataset_samples_jump_tvsum = build_dataset_samples(
    "TVSum", tvsum_videos, tvsum_dict, tvsum_name_dict_revers, tvsum_frame_dir, finaly_prompt
)

In [6]:
# 3 Save the datasets
out_dir_summe = Path("/root/TFVSN/dataset/SumMe")
out_dir_tvsum = Path("/root/TFVSN/dataset/TVSum")
# parents=True creates missing intermediate folders; exist_ok=True makes the
# cell safe to re-run without a separate exists() check.
for out_dir in (out_dir_summe, out_dir_tvsum):
    out_dir.mkdir(parents=True, exist_ok=True)

json_turn_summe = Path(out_dir_summe, "summe_dataset_turn.json")
json_jump_summe = Path(out_dir_summe, "summe_dataset_jump.json")

json_turn_tvsum = Path(out_dir_tvsum, "tvsum_dataset_turn.json")
json_jump_tvsum = Path(out_dir_tvsum, "tvsum_dataset_jump.json")

# Write each sample list to its json file
for json_path, samples in (
    (json_turn_summe, dataset_samples_turn_summe),
    (json_jump_summe, dataset_samples_jump_summe),
    (json_turn_tvsum, dataset_samples_turn_tvsum),
    (json_jump_tvsum, dataset_samples_jump_tvsum),
):
    with open(json_path, "w") as f:
        json.dump(samples, f, indent=4)
    

