# Data Process Pipeline

First, you should add hosts in your ~/.ssh/config file

In [105]:
import os
import paramiko

HOSTS = ["h800-80", "h800-81", "h800-82", "h800-83", "h800-84", "h800-85", "h800-86", "h800-170", "h800-171"]

# load from ~/.ssh/config
ssh_config = paramiko.SSHConfig()
user_config_file = os.path.expanduser("~/.ssh/config")
if os.path.exists(user_config_file):
    with open(user_config_file) as f:
        ssh_config.parse(f)


def get_ssh_config(hostname):
    # get the configuration for the host
    user_config = ssh_config.lookup(hostname)
    user_config
    cfg = {
        "hostname": user_config["hostname"],
        "username": user_config["user"],
        "port": int(user_config["port"]),
        "key_filename": user_config["identityfile"],
    }
    return cfg


def connect(hostname):
    cfg = get_ssh_config(hostname)
    # connect
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(**cfg)
    return client


def run_command(command, hostname, get_pty=True, log=True, nohup=False, log_file=None):
    client = connect(hostname)
    command = f'bash -ic "{command}"' if get_pty else command
    if log_file:
        command = f'{command} > {log_file} 2>&1'
    if nohup:
        command = f'nohup {command} &'
    stdin, stdout, stderr = client.exec_command(command, get_pty=get_pty)
    stdout_str = stdout.read().decode()
    stderr_str = stderr.read().decode()
    if log:
        print("HOST:", hostname)
        if stdout_str:
            print("==== STDOUT ====\n", stdout_str)
        if stderr_str:
            print("==== STDERR ====\n", stderr_str)
    client.close()


def run_command_all_hosts(command, hosts=HOSTS):
    for hostname in hosts:
        run_command(command, hostname)

In [96]:
def nvidia_smi(host):
    if host:
        run_command("nvidia-smi", host)
    else:
        run_command_all_hosts("nvidia-smi")


def nvitop(host=None):
    if host:
        run_command(f"/home/zhaowangbo/.local/bin/nvitop -1", host)
    else:
        run_command_all_hosts("/home/zhaowangbo/.local/bin/nvitop -1")


def ps(host=None, interest="python|sleep|torchrun|colossal"):
    if host:
        if interest is None:
            run_command("ps ux | cat", host)
        else:
            run_command(f"ps ux | cat | grep --color=never -E '{interest}'", host)
    else:
        if interest is None:
            run_command_all_hosts("ps ux | cat")
        else:
            run_command_all_hosts(f"ps ux | cat | grep --color=never -E '{interest}'")

In [106]:
OPEN_SORA_HOME = "/home/zhaowangbo/open-sora"
def convert_dataset_cmd(input_dir, output_file, datatype="video"):
    commands = []
    commands.append(f'cd {OPEN_SORA_HOME}')
    # makedirs
    output_dir = os.path.dirname(output_file)
    commands.append(f'mkdir -p {output_dir}')
    commands.append(f'python -m tools.datasets.convert {datatype} {input_dir} --output {output_dir}')
    return " && ".join(commands), output_file

In [107]:
host = "h800-83"
log_file = "./logs/data-panda-16-split.log"
cmd, output_file = convert_dataset_cmd("/mnt/disk1/data-panda/16", "/mnt/hdd/data/panda70m_by/raw/meta/split-16/meta.csv")

In [109]:
run_command(cmd, host, nohup=True, log_file=log_file)

HOST: h800-83
==== STDOUT ====
 


In [108]:
cmd

'cd /home/zhaowangbo/open-sora && mkdir -p /mnt/hdd/data/panda70m_by/raw/meta/split-16 && python -m tools.datasets.convert video /mnt/disk1/data-panda/16 --output /mnt/hdd/data/panda70m_by/raw/meta/split-16'

In [102]:
ps(host, interest="convert")

HOST: h800-83
==== STDOUT ====
 zhaowan+ 2928070 10.0  0.0  14216  5556 pts/13   Ss   15:25   0:00 bash -ic ps ux | cat | grep --color=never -E 'convert'
zhaowan+ 2929492  0.0  0.0  12116   664 pts/13   S+   15:25   0:00 grep --color=auto --color=never -E convert



In [1819]:
RUN_CONVERT =               False
RUN_ZERO_FRAME =            False
RUN_CORRUPTED =             True
RUN_ZERO_FRAME_CORRUPTED =  False
RUN_COPY_OVER_CAPTION =     False

RUN_AESTHETICS =            False
RUN_COLLATE_AESTHETICS =    False
RUN_OPTICAL_FLOW =          False
RUN_AES_FLOW =              False

captioning_input_file = 'meta_remove_corrupted_aes_flow.csv'; RUN_CAPTIONING =            False
RUN_COLLATE_CAPTIONING =    False; 
RUN_COLLATE_CAPTIONING_INTERRUPTED =    False

RUN_INTERSECT_INFO =        False

RUN_CLEAN_REMOVE_LLAVA =    False
RUN_CLEAN_REMOVE =          False
clean_remove_input_path = 'meta_remove_corrupted_aes_flow_caption.csv'
clean_remove_output_path = 'meta_remove_corrupted_aes_flow_caption_cleaned_and_removed.csv'

### Convert to meta and collate

In [1821]:
if RUN_CONVERT:
    nohup_filename = "outs/" + NAME + "_convert_video.out"

    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/opensora/bin/python convert_videos.py ' + \
        ' ' + DATA_PATH + ' ' + \
        ' ' + PATH + ' > ' + nohup_filename + ' 2>&1 &'
    print(command)
    output = client.exec_command(command)

### Filter out zero-frame and corrupted

In [1822]:
if RUN_CORRUPTED:
    nohup_filename = "outs/" + NAME + "_filter_corrupted.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta.csv ' + \
        '--remove-corrupted' + ' > ' + nohup_filename + ' 2>&1 &'
    output = client.exec_command(command)


In [1823]:
if RUN_ZERO_FRAME:
    nohup_filename = "outs/" + NAME + "_filter_zero_frame.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta_remove_corrupted.csv ' + \
        '--info --fmin 1 --output ' + \
        PATH + 'meta_remove_corrupted.csv' + ' > ' + nohup_filename + ' 2>&1 &'
    output = client.exec_command(command)


In [1824]:
if RUN_ZERO_FRAME_CORRUPTED:
    out_filename = "outs/" + NAME + "_filter_zero_frame_corrupted.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta.csv ' + \
        '--info --fmin 1 --output ' + \
        PATH + 'meta.csv ' + '> ' + out_filename + ' 2>&1 && ' + \
        'echo "[DONE]: filter zero frame" >> ' + out_filename + ' && ' + \
        '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta.csv ' + \
        '--remove-corrupted' + ' >> ' + out_filename + ' 2>&1 && ' + \
        'echo "[DONE]: filter corrupted" >> ' + out_filename
    print(command)
    output = client.exec_command(command)

### Run copy over caption

In [1825]:
if RUN_COPY_OVER_CAPTION:
    nohup_filename = "outs/" + NAME + "_copy_over_caption.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python copy_over_caption.py' + \
        ' ' + PATH + 'meta.csv > ' + nohup_filename + ' 2>&1 &'
    output = client.exec_command(command)

### Run aesthetics and collate

In [1826]:
if RUN_AESTHETICS:
    nohup_filename = "outs/" + NAME + "_aes.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        'nohup /home/zhaowangbo/.conda/envs/opensora/bin/torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference ' + \
        PATH + 'meta_remove_corrupted.csv ' + \
        '--bs 1024 --num_workers 16 > ' + \
        nohup_filename + ' 2>&1 &'
    output = client.exec_command(command)


In [1827]:
if RUN_COLLATE_AESTHETICS:
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta_remove_corrupted_aes_part*.csv ' + \
        '--output ' + \
        PATH + 'meta_remove_corrupted_aes.csv '
    output = client.exec_command(command)
    # this takes priority! delete all meta_remove_corrupted_aes_part*.csv
    # output = client.exec_command("rm " + PATH + "meta_remove_corrupted_aes_part*.csv")

### Run optical flow


In [1828]:
if RUN_OPTICAL_FLOW:
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        'nohup /home/zhaowangbo/.conda/envs/llava2/bin/torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference_parallel ' + \
        PATH + 'meta_remove_corrupted_aes.csv > ' + \
        "outs/" + NAME + "_flow.out 2>&1 &"
    print(command)
    output = client.exec_command(command)


### Run aesthetics and optical flow


In [1829]:
if RUN_AES_FLOW:
    out_filename = "outs/" + NAME + "_aes_flow.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/opensora/bin/torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference ' + \
        PATH + 'meta_remove_corrupted.csv ' + \
        '--bs 1024 --num_workers 16 > ' + \
        out_filename + ' 2>&1 && ' + \
        'echo "[DONE]: aesthetic" >> ' + out_filename + ' && ' + \
        '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta_remove_corrupted_aes_part*.csv ' + \
        '--output ' + \
        PATH + 'meta_remove_corrupted_aes.csv ' + ' >> ' + out_filename + ' 2>&1 && ' + \
        'echo "[DONE]: collate aesthetic" >> ' + out_filename + ' && ' + \
        '/home/zhaowangbo/.conda/envs/llava2/bin/torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference_parallel ' + \
        PATH + 'meta_remove_corrupted_aes.csv >> ' + \
        out_filename + " 2>&1 && " + \
        'echo "[DONE]: flow" >> ' + out_filename
        # CONTINUE 
    output = client.exec_command(command)

### Run captioning and collate


In [1830]:
if RUN_CAPTIONING:
    out_filename = "outs/" + NAME + "_caption.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/llava2/bin/torchrun --nproc_per_node 8 ' + \
        '--standalone ' + \
        '-m tools.caption.caption_llava ' + \
        PATH + captioning_input_file + ' ' + \
        '--tp-size 2 ' + \
        '--dp-size 4 ' + \
        '--model-path liuhaotian/llava-v1.6-mistral-7b ' + \
        '--bs 16 ' + \
        '--prompt video > ' + \
        out_filename + " 2>&1 && " + \
        '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta_remove_corrupted_aes_flow_caption_part*.csv ' + \
        '--output ' + \
        PATH + 'meta_remove_corrupted_aes_flow_caption.csv ' + ' >> ' + out_filename + ' 2>&1 '
    print(command)
    output = client.exec_command(command)

In [1831]:
# cd /home/tom/Open-Sora-dev/ && /home/zhaowangbo/.conda/envs/llava2/bin/torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow.csv --tp-size 2 --dp-size 4 --model-path liuhaotian/llava-v1.6-mistral-7b --bs 16 --prompt video > outs/split-18_caption.out 2>&1 && /home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow_caption_part*.csv --output /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow_caption.csv >> outs/split-18_caption.out 2>&1 && 

In [1832]:
if RUN_COLLATE_CAPTIONING_INTERRUPTED:
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta_remove_corrupted_aes_flow_remaining_caption_part*.csv ' + \
        '--output ' + \
        PATH + 'meta_remove_corrupted_aes_flow_remaining_caption.csv '
    output = client.exec_command(command)
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta_remove_corrupted_aes_flow_remaining_caption.csv ' + \
        PATH + 'meta_remove_corrupted_aes_flow_caption_partial.csv ' + \
        '--output ' + \
        PATH + 'meta_remove_corrupted_aes_flow_caption.csv '
    output = client.exec_command(command)
    

### Clean and remove

In [1833]:
if RUN_CLEAN_REMOVE_LLAVA:
    out_filename = "outs/" + NAME + "_clean_remove_llava.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + clean_remove_input_path + ' ' + \
        '--clean-caption --remove-caption-prefix --remove-empty-caption ' + \
        '--output ' + \
        PATH + clean_remove_output_path + ' > ' + out_filename + ' 2>&1 && ' + \
        'echo "[DONE]: RUN_CLEAN_REMOVE_LLAVA" >> ' + out_filename
    output = client.exec_command(command)

In [1834]:
if RUN_CLEAN_REMOVE:
    out_filename = "outs/" + NAME + "_clean_remove.out"
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + clean_remove_input_path + ' ' + \
        '--clean-caption --remove-caption-prefix --remove-empty-caption ' + \
        '--output ' + \
        PATH + clean_remove_output_path + ' > ' + out_filename + ' 2>&1 && ' + \
        'echo "[DONE]: collate RUN_CLEAN_REMOVE" >> ' + out_filename
    output = client.exec_command(command)

### Intersect

In [1835]:
# python -m tools.datasets.csvutil ~/dataset/HD-VG-130M/meta_remove_corrupted_aes.csv --intersect ~/dataset/HD-VG-130M/meta_remove_corrupted_flow.csv --output ~/dataset/HD-VG-130M/meta_remove_corrupted_aes_flow.csv
if RUN_INTERSECT_INFO:
    command = \
        'cd /home/tom/Open-Sora-dev/ && ' + \
        '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \
        PATH + 'meta_remove_corrupted.csv ' + \
        '--intersect ' + PATH + 'meta_remove_corrupted_aes_flow_caption.csv ' + \
        '--output ' + PATH + 'meta_remove_corrupted_aes_flow_caption.csv '
    output = client.exec_command(command)

In [1836]:
# # # remove empty captions and process captions (may need to re-caption lost ones)

# # --remove-caption-prefix: llava has a prefix, remove it

# # --clean-caption makes it T5 friendly

# python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv

# python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv

# # # 4. Sanity check & prepare for training
# # # sanity check
# # python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --info --output ~/dataset/meta_ready.csv


In [1837]:
# command1 = 'conda activate /home/zhaowangbo/.conda/envs/llava2'
# command2 = ' cd ~/Open-Sora-dev'
# command3 = 'nohup torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava /mnt/hdd/data/unsplash-full/resize_4k/meta/meta_machine_1.csv --tp-size 2 --dp-size 4 --bs 8 > llava_unsplash-full_machine_1.out &'
# stdin, stdout, stderr = client.exec_command(command1)
    # stdin, stdout, stderr = client.exec_command(command2)
    # Execute the command
    # stdin, stdout, stderr = client.exec_command(command3)
    # output = stdout.read().decode()
    # print(output)
    # error = stderr.read().decode()
    # print(error)
