In [1]:
import os
import pandas as pd
from tqdm import tqdm
import warnings

# FutureWarning 무시
warnings.simplefilter("ignore", FutureWarning)

In [2]:
def make_annotation_csv_file(
        root_path: str, 
        annotation_filename:str, 
        annotation_filename_sa:str, 
        csv_filename: str,
        save: bool = False
    ) -> None:

    columns = [
        'video_name', 
        'youtube_id', 
        'ethnicity',
        'ethnicity_label',
        'gender',
        'gender_label',
        'openness',
        'conscientiousness', 
        'extraversion',
        'agreeableness',
        'neuroticism',
        'interview', 
    ]
    df = pd.DataFrame(columns=columns)
    
    annotation_dict = pd.read_pickle(os.path.join(root_path, annotation_filename))
    gender_ethnicity_df = pd.read_csv(os.path.join(root_path, annotation_filename_sa), sep=";")

    for key in tqdm(annotation_dict["interview"].keys()):
        VideoName = key
        YouTubeID = gender_ethnicity_df[gender_ethnicity_df["VideoName"] == key]["YouTubeID"].item()
        # ocean score
        openness = annotation_dict["openness"][key]
        conscientiousness = annotation_dict["conscientiousness"][key]
        extraversion = annotation_dict["extraversion"][key]
        agreeableness = annotation_dict["agreeableness"][key]
        neuroticism = annotation_dict["neuroticism"][key]

        # interview score
        interview = annotation_dict["interview"][key]

        # 0 Asian 1 Caucasian 2 African-American
        ethnicity = gender_ethnicity_df[gender_ethnicity_df["VideoName"] == key]["Ethnicity"].item() - 1
        if ethnicity == 0:
            ethnicity_label = "Asian"
        elif ethnicity == 1:
            ethnicity_label = "Caucasian"
        elif ethnicity == 2:
            ethnicity_label = "African-American"
        else:
            raise ValueError

        # 0 Male 1 Female
        gender = gender_ethnicity_df[gender_ethnicity_df["VideoName"] == key]["Gender"].item() - 1
        if gender == 0:
            gender_label = "Male"
        elif gender == 1:
            gender_label = "Female"
        else:
            raise ValueError

        new_item = {
            'video_name': VideoName,
            'youtube_id': YouTubeID,
            'ethnicity': ethnicity,
            'ethnicity_label': ethnicity_label,
            'gender': gender,
            'gender_label': gender_label,
            'openness': openness, 
            'conscientiousness': conscientiousness, 
            'extraversion': extraversion,
            'agreeableness': agreeableness,
            'neuroticism': neuroticism,
            'interview': interview,
        }
        df = pd.concat([df, pd.DataFrame([new_item])], ignore_index=True)
        
    df = df.sort_values("video_name").reset_index(drop=True)
    if save:
        df.to_csv(csv_filename, index=False)

In [3]:
root_path = "{workspace}/my_project/src/data_origin/annotation"

In [4]:
# make training annotation file
annotation_filename = "annotation_training.pkl"
annotation_filename_sa = "eth_gender_annotations_dev.csv"
csv_filename = "annotation_training.csv"
save = True
make_annotation_csv_file(root_path, annotation_filename, annotation_filename_sa, csv_filename, save)

100%|██████████| 6000/6000 [00:16<00:00, 368.68it/s]


In [5]:
# make validation annotation file
annotation_filename = "annotation_validation.pkl"
annotation_filename_sa = "eth_gender_annotations_dev.csv"
csv_filename = "annotation_validation.csv"
save = True
make_annotation_csv_file(root_path, annotation_filename, annotation_filename_sa, csv_filename, save)

100%|██████████| 2000/2000 [00:05<00:00, 378.25it/s]


In [6]:
# make test annotation file
annotation_filename = "annotation_test.pkl"
annotation_filename_sa = "eth_gender_annotations_test.csv"
csv_filename = "annotation_test.csv"
save = True
make_annotation_csv_file(root_path, annotation_filename, annotation_filename_sa, csv_filename, save)

100%|██████████| 2000/2000 [00:03<00:00, 588.59it/s]
