In [7]:
import pandas as pd
import pyarrow.parquet as pq

# Load both metadata tables: the main training index and the supplemental one.
dataset_train_df, dataset_supplemental_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "supplemental_metadata.csv")
)

# Stack them row-wise into a single table with a fresh 0..n-1 index.
dataset_df = pd.concat(
    (dataset_train_df, dataset_supplemental_df),
    ignore_index=True,
)

# Persist the merged metadata (without the index column) for later reuse.
dataset_df.to_csv("train_full.csv", index=False)

In [8]:
# Take the first metadata row and unpack the fields needed to locate its data.
first_row = dataset_df.iloc[0]
path, sequence_id, file_id, phrase = first_row[
    ["path", "sequence_id", "file_id", "phrase"]
]
print(f"path: {path}, sequence_id: {sequence_id}, file_id: {file_id}, phrase: {phrase}")

# Read only the rows of that parquet file whose sequence_id matches the sample.
sequence_filter = [
    [("sequence_id", "=", sequence_id)],
]
sample_table = pq.read_table(str(path), filters=sequence_filter)
sample_sequence_df = sample_table.to_pandas()
print("Full sequence dataset shape is {}".format(sample_sequence_df.shape))

path: train_landmarks/5414471.parquet, sequence_id: 1816796431, file_id: 5414471, phrase: 3 creekhouse
Full sequence dataset shape is (123, 1630)


In [9]:
# Count the distinct landmark files referenced by the metadata table.
unique_paths = dataset_df["path"].unique()

# Stored as `num_files` rather than `sum`: assigning to `sum` would shadow the
# built-in sum() for every cell executed afterwards in this kernel.
num_files = len(unique_paths)

print("Total number of files: {}".format(num_files))

Total number of files: 121


In [10]:
# Face landmark indices to keep (the lip subset, going by the name).
# NOTE(review): presumably MediaPipe face-mesh indices -- confirm against the data source.
LIP = [
    61, 185, 40, 39, 37, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
]

# Column names are grouped by axis: all x columns first, then all y, then all z.
FACE = [f"{axis}_face_{i}" for axis in "xyz" for i in LIP]
LHAND = [f"{axis}_left_hand_{i}" for axis in "xyz" for i in range(21)]
RHAND = [f"{axis}_right_hand_{i}" for axis in "xyz" for i in range(21)]
POSE = [f"{axis}_pose_{i}" for axis in "xyz" for i in range(33)]

# Every landmark column that is read from the parquet files, in a fixed order.
SEL_COLS = [*FACE, *LHAND, *RHAND, *POSE]
FRAME_LEN = 128

In [11]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from skimage.transform import resize

# Convert each landmark parquet file into one TFRecord file that holds the
# selected landmark columns plus the target phrase for every kept sequence.

# The writer below opens its target file directly, so make sure the output
# directory exists; otherwise a fresh run fails on the very first file.
output_dir = Path("preprocessed")
output_dir.mkdir(parents=True, exist_ok=True)

pbar = tqdm(dataset_df.file_id.unique())

for file_id in pbar:
    file_df = dataset_df.loc[dataset_df["file_id"] == file_id]

    # Every metadata row of one file_id is expected to carry the same path,
    # so the first value is used.
    path = file_df["path"].values[0]

    parquet_df = pq.read_table(
        path,
        columns=["sequence_id"] + SEL_COLS,
    ).to_pandas()

    tf_file = str(output_dir / f"{file_id}.tfrecord")

    parquet_numpy = parquet_df.to_numpy()
    col_to_index = {col: i for i, col in enumerate(parquet_df.columns)}

    # Column positions of the hand landmarks and of every exported column,
    # resolved once per file instead of once per sequence.
    LHAND_indices = [col_to_index[col] for col in LHAND]
    RHAND_indices = [col_to_index[col] for col in RHAND]
    sel_col_indices = [(col, col_to_index[col]) for col in SEL_COLS]

    with tf.io.TFRecordWriter(tf_file) as file_writer:
        for seq_id, phrase in zip(file_df.sequence_id, file_df.phrase):
            # Rows are selected through the DataFrame index.
            # NOTE(review): this relies on to_pandas() restoring sequence_id as
            # the index -- confirm the parquet files store it that way.
            frames = parquet_numpy[parquet_df.index == seq_id]

            # Count the frames in which a hand is fully observed, i.e. none of
            # its coordinates is NaN, and keep the better of the two hands.
            r_nonan = np.sum(np.sum(np.isnan(frames[:, RHAND_indices]), axis=1) == 0)
            l_nonan = np.sum(np.sum(np.isnan(frames[:, LHAND_indices]), axis=1) == 0)
            no_nan = max(r_nonan, l_nonan)

            # Skip sequences with too few usable hand frames: more than two
            # fully observed frames per phrase character are required.
            if 2 * len(phrase) < no_nan:
                # One float feature per landmark column, holding that column's
                # value for every frame of the sequence.
                features = {
                    col: tf.train.Feature(
                        float_list=tf.train.FloatList(value=frames[:, col_idx])
                    )
                    for col, col_idx in sel_col_indices
                }

                features["phrase"] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(
                        value=[bytes(phrase, "utf-8")],
                    )
                )

                # Wrap the features in a tf.train.Example and serialize it.
                example = tf.train.Example(features=tf.train.Features(feature=features))
                record_bytes = example.SerializeToString()

                # Show progress, then append the record to this file's TFRecord.
                pbar.set_postfix_str(
                    f"Writing {file_id}.tfrecord, sequence_id: {seq_id}"
                )
                file_writer.write(record_bytes)

100%|█| 121/121 [11:21<00:00,  5.63s/it, Writing 2100073719.tfrecord, sequence_id: 109101155
