# 좌표를 모델 인풋으로 바꾸기

In [1]:
import os
import re
import pandas as pd
import numpy as np
from pathlib import Path

from csv_to_npz import save_npz_sample_from_segment

In [2]:
# Path setup: root directory of the project; later cells build their paths from it
project_path = '/home/j-j13a602/make_dataset'

In [3]:
# word_id, person_id로 추출할 csv 파일 찾기

def find_target_csv_file(file_name, target_word_ids, target_person_ids, video_type):
    """Report whether a CSV file belongs to the requested words and persons.

    The word and person numbers are read from fixed-width fields that follow
    a marker in the name: ``FS``/``CROWD`` for ``"crowd"``, ``WORD``/``REAL``
    for ``"real"`` and ``WORD``/``SYN`` for ``"syn"``. The text after the
    last occurrence of each marker is used, so a full path works as well as
    a bare file name.

    Args:
        file_name: File name or path to inspect.
        target_word_ids: Container of accepted word numbers.
        target_person_ids: Container of accepted person numbers.
        video_type: One of ``"crowd"``, ``"real"`` or ``"syn"``.

    Returns:
        True when both parsed numbers are in their target containers,
        False otherwise. A status line is printed either way.

    Raises:
        ValueError: If ``video_type`` is not supported, or if the characters
            following a marker cannot be parsed as an integer.
    """
    # (word marker, person marker) for each supported video type
    markers_by_type = {
        "crowd": ("FS", "CROWD"),
        "real": ("WORD", "REAL"),
        "syn": ("WORD", "SYN"),
    }
    if video_type not in markers_by_type:
        # Fail loudly here instead of hitting an unbound local further down.
        raise ValueError(
            f"Unsupported video_type {video_type!r}; "
            f"expected one of {sorted(markers_by_type)}"
        )
    word_marker, person_marker = markers_by_type[video_type]

    # Word number: first 4 characters after the last word marker
    current_word_id = int(file_name.split(word_marker)[-1][:4])
    # Person number: first 2 characters after the last person marker
    current_person_id = int(file_name.split(person_marker)[-1][:2])

    # The file can be converted only if both numbers are among the targets
    if current_word_id in target_word_ids and current_person_id in target_person_ids:
        print(f"파일명 '{file_name}' 변환 가능")
        return True

    print(f"파일명 '{file_name}' 에러")
    return False


def get_word_id_and_person_id(file_name, video_type):
    """Parse the word number and person number out of a file name.

    The numbers are read from fixed-width fields that follow a marker in the
    name: ``FS``/``CROWD`` for ``"crowd"``, ``WORD``/``REAL`` for ``"real"``
    and ``WORD``/``SYN`` for ``"syn"``. The text after the last occurrence of
    each marker is used, so a full path works as well as a bare file name.

    Args:
        file_name: File name or path to inspect.
        video_type: One of ``"crowd"``, ``"real"`` or ``"syn"``.

    Returns:
        A ``(word_id, person_id)`` tuple of ints.

    Raises:
        ValueError: If ``video_type`` is not supported, or if the characters
            following a marker cannot be parsed as an integer.
    """
    # (word marker, person marker) for each supported video type
    markers_by_type = {
        "crowd": ("FS", "CROWD"),
        "real": ("WORD", "REAL"),
        "syn": ("WORD", "SYN"),
    }
    if video_type not in markers_by_type:
        # Fail loudly here instead of hitting an unbound local at the return.
        raise ValueError(
            f"Unsupported video_type {video_type!r}; "
            f"expected one of {sorted(markers_by_type)}"
        )
    word_marker, person_marker = markers_by_type[video_type]

    # Word number: first 4 characters after the last word marker
    current_word_id = int(file_name.split(word_marker)[-1][:4])
    # Person number: first 2 characters after the last person marker
    current_person_id = int(file_name.split(person_marker)[-1][:2])

    return current_word_id, current_person_id

In [4]:
# Collect the coordinate CSV files whose name contains "real" (any casing)
annotation_dir = project_path + "/annotation"

annotation_dir_files = []
for entry in os.listdir(annotation_dir):
    if not entry.lower().endswith(".csv"):
        continue
    if re.search(r"real", entry, re.IGNORECASE):
        annotation_dir_files.append(os.path.join(annotation_dir, entry))

# Report how many were found and preview the first few paths
print("총 좌표 csv 개수:", len(annotation_dir_files))
print(annotation_dir_files[:3])

총 좌표 csv 개수: 1500
['/home/j-j13a602/make_dataset/annotation/NIA_SL_WORD0749_REAL09_F.csv', '/home/j-j13a602/make_dataset/annotation/NIA_SL_WORD0798_REAL09_F.csv', '/home/j-j13a602/make_dataset/annotation/NIA_SL_WORD0572_REAL09_F.csv']


In [5]:
# Load the metadata table for the "real" word videos
real_detail_metadata = pd.read_csv(project_path + "/real_word_detail_metadata.csv", encoding="utf-8")

# Show the table size and a two-row preview
print(real_detail_metadata.shape)
real_detail_metadata.head(2)

(48000, 11)


Unnamed: 0,basename,mp4_url,mp4_name,duration,exported_on,word_start,word_end,word_name,word_id,person_id,video_annotated
0,NIA_SL_WORD0001_REAL01,https://blackolivevideo.blob.core.windows.net/...,NIA_SL_WORD0001_REAL01_F.mp4,4.867,2020/12/10,1.743,3.103,고민,1,1,True
1,NIA_SL_WORD0002_REAL01,https://blackolivevideo.blob.core.windows.net/...,NIA_SL_WORD0002_REAL01_F.mp4,4.434,2020/12/10,1.831,3.361,뻔뻔,2,1,True


In [6]:
# @title Convert the word-level CSV files to NPZ files.

# Word / person numbers that are eligible for conversion
target_word_ids = list(range(1, 3001))
target_person_ids = list(range(1, 11))

completed_word_count = 0  # converted now or already present on disk
error_word_files = []     # every CSV whose conversion raised
real_npz_folder = project_path + "/npz"  # output folder for the NPZ files

for annotation_file in annotation_dir_files:
    # An NPZ named after the CSV stem means the file was converted earlier.
    base_name = Path(annotation_file).stem
    expected_npz_path = os.path.join(real_npz_folder, f"{base_name}.npz")

    if os.path.exists(expected_npz_path):
        print(f"NPZ 파일이 이미 존재합니다. 스킵: {expected_npz_path}")
        completed_word_count += 1  # existing files count as completed
        continue

    # Skip files outside the requested word / person ranges.
    if not find_target_csv_file(annotation_file, target_word_ids, target_person_ids, "real"):
        print(f"파일명 '{annotation_file}' 는 타겟 word_id 또는 person_id에 해당하지 않습니다. 스킵.")
        continue

    # annotation_dir_files already holds paths joined onto annotation_dir,
    # so the entry is used as-is instead of being joined a second time.
    target_ann_file = annotation_file
    target_word_id, target_person_id = get_word_id_and_person_id(annotation_file, "real")
    target_type = os.path.splitext(target_ann_file)[1]

    try:
        out_file = save_npz_sample_from_segment(
            target_path=target_ann_file,
            target_type=target_type.strip("."),
            metadata=real_detail_metadata,
            out_dir=real_npz_folder,
            label_id=target_word_id,
            use_buckets=True,
            FPS=30,
        )
    except Exception as e:
        # Best-effort batch: one bad file must not stop the run, but every
        # failure is recorded. The previous `'out_file' in locals()` test was
        # true after the first success, so later failures printed a stale
        # output path and were never added to error_word_files.
        print(f"[{e}] Error processing {target_ann_file}")
        error_word_files.append(target_ann_file)
    else:
        print("[SAVED]", out_file)
        completed_word_count += 1


print(f"에러 파일 개수: {len(error_word_files)}")
print(f"변환 성공 파일 개수 (기존 파일 포함): {completed_word_count}")

파일명 '/home/j-j13a602/make_dataset/annotation/NIA_SL_WORD0749_REAL09_F.csv' 변환 가능
Word start time: 1.316s -> Start frame: 39
Word end time: 2.905s -> End frame: 87
[SAVED] /home/j-j13a602/make_dataset/npz/NIA_SL_WORD0749_REAL09_F.npz
파일명 '/home/j-j13a602/make_dataset/annotation/NIA_SL_WORD0798_REAL09_F.csv' 변환 가능
Word start time: 1.992s -> Start frame: 60
Word end time: 3.527s -> End frame: 106
[SAVED] /home/j-j13a602/make_dataset/npz/NIA_SL_WORD0798_REAL09_F.npz
파일명 '/home/j-j13a602/make_dataset/annotation/NIA_SL_WORD0572_REAL09_F.csv' 변환 가능
Word start time: 1.371s -> Start frame: 41
Word end time: 3.74s -> End frame: 112
[SAVED] /home/j-j13a602/make_dataset/npz/NIA_SL_WORD0572_REAL09_F.npz
파일명 '/home/j-j13a602/make_dataset/annotation/NIA_SL_WORD0963_REAL09_F.csv' 변환 가능
Word start time: 2.192s -> Start frame: 66
Word end time: 3.981s -> End frame: 119
[SAVED] /home/j-j13a602/make_dataset/npz/NIA_SL_WORD0963_REAL09_F.npz
파일명 '/home/j-j13a602/make_dataset/annotation/NIA_SL_WORD1465_REAL0

In [7]:
# Collect the converted NPZ files whose name contains "real" (any casing)
npz_dir = project_path + "/npz"

npz_dir_files = []
for entry in os.listdir(npz_dir):
    if not entry.lower().endswith(".npz"):
        continue
    if re.search(r"real", entry, re.IGNORECASE):
        npz_dir_files.append(os.path.join(npz_dir, entry))

# NOTE(review): the printed label still says "csv" although .npz files are counted here.
print("총 좌표 csv 개수:", len(npz_dir_files))
print(npz_dir_files[:3])

총 좌표 csv 개수: 1498
['/home/j-j13a602/make_dataset/npz/NIA_SL_WORD1453_REAL09_F.npz', '/home/j-j13a602/make_dataset/npz/NIA_SL_WORD0317_REAL09_F.npz', '/home/j-j13a602/make_dataset/npz/NIA_SL_WORD1388_REAL09_F.npz']


# 훈련하려는 데이터 모으기

In [8]:
# Load the existing dataset metadata CSV
total_df = pd.read_csv(project_path + "/dataset_metadata_real1494_person7.csv")

# Show the table size and a two-row preview
print(total_df.shape)
total_df.head(2)

(10458, 4)


Unnamed: 0,landmarks_file,word_id,person_id,word_gloss
0,05-1__NIA_SL_WORD0624_REAL05_F.npz,624,5,새집
1,05-1__NIA_SL_WORD1272_REAL05_F.npz,1272,5,뽀뽀


In [16]:
# Number of distinct glosses, followed by the distinct gloss values
print(total_df.word_gloss.nunique())
total_df.word_gloss.unique()

1431


array(['새집', '뽀뽀', '일어나다', ..., '전기기기', '호텔대학교', '어제밤'],
      shape=(1431,), dtype=object)

In [17]:
# Number of distinct word ids, followed by the distinct id values
print(total_df.word_id.nunique())
total_df.word_id.unique()

1494


array([ 624, 1272, 1273, ..., 1498, 1499, 1500], shape=(1494,))

In [9]:
# Build a table of the files that were converted to npz
completed_npz_df = pd.DataFrame({"file_name": npz_dir_files})

# The ids are the first 4 / 2 characters after the last "WORD" / "REAL" marker
# of the part of the name that follows the last "__" (if any).
completed_npz_df["word_id"] = completed_npz_df["file_name"].map(
    lambda path: int(path.split("__")[-1].split("WORD")[-1][:4])
)
completed_npz_df["person_id"] = completed_npz_df["file_name"].map(
    lambda path: int(path.split("__")[-1].split("REAL")[-1][:2])
)

completed_npz_df.head(2)

Unnamed: 0,file_name,word_id,person_id
0,/home/j-j13a602/make_dataset/npz/NIA_SL_WORD14...,1453,9
1,/home/j-j13a602/make_dataset/npz/NIA_SL_WORD03...,317,9


In [11]:
# Keep only the last "/"-separated component of each path (the bare file name)
completed_npz_df["file_name"] = completed_npz_df["file_name"].apply(lambda x: x.split("/")[-1])

completed_npz_df.head(2)

Unnamed: 0,file_name,word_id,person_id
0,NIA_SL_WORD1453_REAL09_F.npz,1453,9
1,NIA_SL_WORD0317_REAL09_F.npz,317,9


In [13]:
# Attach the word name from real_detail_metadata to every converted file.
# Rows are matched on mp4_name, which is derived from the npz file name.

# Derive the join key: same stem as the npz file, with an ".mp4" extension
completed_npz_df['mp4_name'] = [
    Path(name).stem + ".mp4" for name in completed_npz_df['file_name']
]

# Left join so that files without a metadata match are kept
real_completed_with_name = completed_npz_df.merge(
    real_detail_metadata[['mp4_name', 'word_name']],
    how='left',
    on='mp4_name',
)

# The join key is no longer needed once the names are attached
completed_with_name = real_completed_with_name.drop(columns=['mp4_name'])

# Continue working under the original variable name
completed_npz_df = completed_with_name

print("completed_npz_df with word_name:")
display(completed_npz_df.head())

completed_npz_df with word_name:


Unnamed: 0,file_name,word_id,person_id,word_name_x,word_name_y
0,NIA_SL_WORD1453_REAL09_F.npz,1453,9,,신용산
1,NIA_SL_WORD0317_REAL09_F.npz,317,9,,급행
2,NIA_SL_WORD1388_REAL09_F.npz,1388,9,,초대3
3,NIA_SL_WORD0171_REAL09_F.npz,171,9,,코치
4,NIA_SL_WORD0155_REAL09_F.npz,155,9,,여행사


In [14]:
# Rename the merged table's columns to the dataset-metadata names.
# If the frame already had a `word_name` column before the merge, pandas
# emits the suffixed pair `word_name_x` / `word_name_y`; otherwise it emits a
# plain `word_name`. Both layouts are handled so the cell also works on a
# clean top-to-bottom run instead of raising KeyError on the missing column.
completed_npz_df.drop(columns=["word_name_x"], errors="ignore", inplace=True)
completed_npz_df.rename(
    columns={
        "file_name": "landmarks_file",
        "word_name": "word_gloss",
        "word_name_y": "word_gloss",
    },
    inplace=True,
)

print(completed_npz_df.shape)
completed_npz_df.head(2)

(1498, 4)


Unnamed: 0,landmarks_file,word_id,person_id,word_gloss
0,NIA_SL_WORD1453_REAL09_F.npz,1453,9,신용산
1,NIA_SL_WORD0317_REAL09_F.npz,317,9,급행


In [19]:
# Keep only the converted files whose word id also occurs in total_df
person_9 = completed_npz_df.loc[
    completed_npz_df["word_id"].isin(total_df["word_id"].unique())
].copy()

print(person_9.shape)
person_9.head(2)

(1492, 4)


Unnamed: 0,landmarks_file,word_id,person_id,word_gloss
0,NIA_SL_WORD1453_REAL09_F.npz,1453,9,신용산
1,NIA_SL_WORD0317_REAL09_F.npz,317,9,급행


In [21]:
# Distinct word ids that remain in person_9
person_9.word_id.unique()

array([1453,  317, 1388, ...,  354,  892, 1239], shape=(1492,))

In [22]:
# Keep only the rows of total_df whose word id also occurs in person_9
person_13456810 = total_df.loc[
    total_df["word_id"].isin(person_9["word_id"].unique())
].copy()

# NOTE(review): this prints the whole frame, while sibling cells print `.shape` — confirm intent.
print(person_13456810)
person_13456810.head(2)

                           landmarks_file  word_id  person_id word_gloss
0      05-1__NIA_SL_WORD0624_REAL05_F.npz      624          5         새집
1      05-1__NIA_SL_WORD1272_REAL05_F.npz     1272          5         뽀뽀
2      05-1__NIA_SL_WORD1273_REAL05_F.npz     1273          5       일어나다
3      05-1__NIA_SL_WORD0001_REAL05_F.npz        1          5         고민
4      05-1__NIA_SL_WORD0002_REAL05_F.npz        2          5         뻔뻔
...                                   ...      ...        ...        ...
10453  08-1__NIA_SL_WORD0464_REAL08_F.npz      464          8         단명
10454  08-1__NIA_SL_WORD0465_REAL08_F.npz      465          8         단문
10455  08-1__NIA_SL_WORD0466_REAL08_F.npz      466          8        능력자
10456  08-1__NIA_SL_WORD0467_REAL08_F.npz      467          8        곰방대
10457  08-1__NIA_SL_WORD0468_REAL08_F.npz      468          8         담당

[10444 rows x 4 columns]


Unnamed: 0,landmarks_file,word_id,person_id,word_gloss
0,05-1__NIA_SL_WORD0624_REAL05_F.npz,624,5,새집
1,05-1__NIA_SL_WORD1272_REAL05_F.npz,1272,5,뽀뽀


In [26]:
# Stack both subsets into the final metadata table.
# ignore_index=True renumbers the rows 0..n-1; without it both frames keep
# their own row labels, which overlap, and the exported index column would
# contain duplicate values.
final_df = pd.concat([person_13456810, person_9], ignore_index=True)
print(final_df.shape)

# The index is still written as the first (unnamed) column, as before.
final_df.to_csv(project_path + "/dataset_metadata.csv", index=True, encoding="utf-8")

(11936, 4)


In [28]:
import zipfile

# Where the archive is written, and the metadata CSV that goes into it
output_zip_path = os.path.join(project_path, "dataset_final.zip")
output_csv_path = os.path.join(project_path, "dataset_metadata.csv")

# Write a fresh deflate-compressed archive
with zipfile.ZipFile(output_zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    # The metadata CSV is stored at the archive root under its bare file name
    zipf.write(output_csv_path, arcname=os.path.basename(output_csv_path))

    # Each NPZ listed in person_9 is read from <project>/npz and is also
    # stored at the archive root (the arcname is the bare file name).
    for npz_file_path in person_9["landmarks_file"]:
        zipf.write(os.path.join(project_path, "npz", npz_file_path), arcname=npz_file_path)

print(f"Zip file created successfully at: {output_zip_path}")

Zip file created successfully at: /home/j-j13a602/make_dataset/dataset_final.zip


In [1]:
# IPython magic: show the current working directory
%pwd

'/home/j-j13a602/make_dataset'

In [8]:
import os

# Count the entries in the dataset_final directory
print("총 데이터셋 개수:", len(os.listdir('/home/j-j13a602/dataset_final')))

총 데이터셋 개수: 11938


In [9]:
# Directory entry count divided by 8 — presumably a per-person sanity check; TODO confirm what 8 stands for
11938 / 8

1492.25