# SWDance dataset pipeline
From video playlist to dataset in HumanML3D format. Created and run in Google Colaboratory, using its free GPU.

**NOTE:**

Before running, download the [SMPL neutral v1.0.0 model for Python](https://smpl.is.tue.mpg.de/download.php) (you have to register for access). Save it as `{data_dir}/smpl_models/basicModel_neutral_lbs_10_207_0_v1.0.0.pkl`, where `data_dir` is the folder chosen in the Settings cell.


If you want to test faster, download lower-quality videos. This results in lower-quality pose estimation, but runs faster. To do this, remove `.desc()` from `stream = yt.streams.filter(file_extension='mp4').order_by('resolution').desc().first()` inside `DataProcessor.download_yt_video()`, so that the lowest resolution is picked instead of the highest.

# Setup

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# @title pip installs
# Installs every third-party package the notebook imports later on.

# VIBE pose estimation and YouTube download (pytube comes straight from its git repository)
!pip install git+https://github.com/pytube/pytube
!pip install tqdm yacs
!pip install smplx
!pip install trimesh pyrender progress filterpy
!pip install opencv-python llvmlite
!pip install git+https://github.com/mattloper/chumpy.git
!pip install transforms3d
!pip install smplpytorch

# whisper (audio extraction + transcription)
!pip install ffmpeg moviepy
!pip install -U stable-ts # stable whisper with better timestamps

# for animation:
!pip install matplotlib==3.3.4

# for emotion detection
!pip install --no-cache-dir transformers sentencepiece

# for downloading stuff
!pip install --upgrade --no-cache-dir gdown

In [None]:
#@title install VIBE and yolov7
# Clone VIBE into `vibe_path` and run its data preparation script.
vibe_path = '/content' # @param {type:"string"}
%cd {vibe_path}
!git clone https://github.com/mkocabas/VIBE.git
%cd VIBE
!source scripts/prepare_data.sh


# Clone yolov7 into `yolo_path` and fetch the pretrained yolov7.pt weights
# (load_yolo_model expects them at {yolo_path}/yolov7/yolov7.pt).
yolo_path = '/content' # @param {type:"string"}
%cd {yolo_path}
!git clone https://github.com/WongKinYiu/yolov7
%cd yolov7
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt

# Return to the working directory used by the rest of the notebook.
%cd /content/

In [None]:
#@title fix pytube bug
import os
import inspect
import pytube

package_path = os.path.dirname(inspect.getfile(pytube))


def _patch_pytube_line(path, line_idx, marker, replacements):
    """Apply string replacements to one line of an installed pytube source file.

    Args:
        path: File to patch in place.
        line_idx: Zero-based index where the target line is expected.
        marker: Text that identifies the target line. pytube is installed from
            git without a pinned version, so the line may have moved; when the
            expected index does not hold the marker, the first line containing
            it is patched instead.
        replacements: Sequence of ``(old, new)`` pairs applied in order.

    Prints the line before and after patching. If the marker is not found at
    all, the file is left untouched and a message is printed.
    """
    with open(path, 'r', encoding='utf-8') as file:
        data = file.readlines()

    if line_idx >= len(data) or marker not in data[line_idx]:
        line_idx = next((i for i, line in enumerate(data) if marker in line), None)
        if line_idx is None:
            print(f'Could not find "{marker}" in {path}; file left unchanged.')
            return

    print(data[line_idx])
    for old, new in replacements:
        data[line_idx] = data[line_idx].replace(old, new)

    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(data)

    # test: re-read the file to confirm what is now on disk
    with open(path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    print(data[line_idx])


# Switch the default InnerTube client to IOS. The replacements are chained so
# that both an 'ANDROID_MUSIC' and an 'ANDROID' default end up as 'IOS'.
pytube_path = f'{package_path}/innertube.py'
_patch_pytube_line(
    pytube_path, 222, 'def __init__(self, client=',
    [('ANDROID_MUSIC', 'ANDROID'), ('ANDROID', 'IOS')])

# Second patch: make the cipher use the raw js for the transform plan.
pytube_path = f'{package_path}/cipher.py'
_patch_pytube_line(
    pytube_path, 410, 'transform_plan_raw =',
    [('find_object_from_startpoint(raw_code, match.span()[1] - 1)', 'js')])

    def __init__(self, client='IOS', use_oauth=False, allow_cache=True):

    def __init__(self, client='IOS', use_oauth=False, allow_cache=True):

    transform_plan_raw = js

    transform_plan_raw = js



In [None]:
#@title fix libcuda error
# NOTE(review): each `!` line runs in its own subshell, so these `export`s presumably do not
# persist into the notebook kernel; only the `ldconfig` call has a lasting effect — confirm.
!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

# The Pipeline

In [None]:
# @title Settings
import os

data_dir = '/content/SWDance' # @param {type:"string"}
video_path = '/tmp/videos' # @param {type:"string"}

# Create the dataset output folder and the video download folder if they are missing.
for required_dir in (data_dir, video_path):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)


In [None]:
#@title Imports
%cd /content
# %cd /content/drive/MyDrive/

import torch
import numpy as np
import pandas as pd
from pandas.io import json

import time
import joblib
import os, sys
import os.path as osp
from pathlib import Path
import spacy
from tqdm import tqdm
import codecs as cs
import shutil

# captions
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import stable_whisper
import nltk

# videos
import cv2
import moviepy as mopy
from moviepy import editor
from pytube import Playlist, YouTube
from torch.utils.data import DataLoader
from smplx import SMPL as SMPL_native
from scipy.signal import savgol_filter

# loading models...
# Transcription model; DataProcessor uses it in get_whisper_captions.
print('Loading whisper model...')
WHISPER_MODEL = stable_whisper.load_model('base')

# GloVe vectors and the T5 emotion classifier; both are used by DataProcessor.augment_captions.
print('Loading models for caption augmentation...')
import gensim.downloader as gensim_api
from transformers import AutoTokenizer, AutoModelWithLMHead
GLOVE_MODEL = gensim_api.load("glove-wiki-gigaword-100")
EMOTION_TOKENIZER = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
EMOTION_MODEL = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-emotion")

# The SMPL layer reads its model files from {data_dir}/smpl_models (see the note at the top).
print("Loading SMPL layer...") # for transform to smpl24
import transforms3d
from smplpytorch.display_utils import display_model
from smplpytorch.pytorch.smpl_layer import SMPL_Layer
SMPL_LAYER = SMPL_Layer(center_idx=0,gender='neutral',model_root=osp.join(data_dir, 'smpl_models'))

# spaCy pipeline for POS tagging in process_captions; VADER lexicon for SentimentIntensityAnalyzer.
NLP = spacy.load('en_core_web_sm')
nltk.downloader.download('vader_lexicon')

print('Loading VIBE model...')
sys.path.append(osp.join(vibe_path, 'VIBE'))
# sys.path.append(osp.join(vibe_path, 'VIBE/data/vibe_data'))
# sys.path.append('/content/drive/MyDrive/THESIS/VIBE/')
# print(sys.path)

%cd VIBE
from lib.models.vibe import VIBE_Demo
from lib.utils.demo_utils import download_ckpt, video_to_images
from lib.dataset.inference import Inference
from lib.models.smpl import SMPL_MODEL_DIR

# Build the VIBE network on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
VIBE_MODEL = VIBE_Demo(seqlen=16, n_layers=2, hidden_size=1024,
                                add_linear=True, use_residual=True).to(DEVICE)
pretrained_file = download_ckpt(use_3dpw=False)
# map_location places the checkpoint tensors on DEVICE, so loading does not
# depend on the device the checkpoint was saved from (relevant for the CPU fallback).
ckpt = torch.load(pretrained_file, map_location=DEVICE)
print(f'Performance of pretrained VIBE model on 3DPW: {ckpt["performance"]}')

# Only the generator weights are loaded; strict=False tolerates key mismatches.
ckpt = ckpt['gen_state_dict']
VIBE_MODEL.load_state_dict(ckpt, strict=False)
VIBE_MODEL.eval()
print(f'Loaded pretrained VIBE weights from \"{pretrained_file}\"')


# utils
def get_filename(file_idx):
  """Return ``file_idx`` as text, left-padded with zeros to at least six characters."""
  return str(file_idx).rjust(6, '0')

# credit: HumanML3D
def swap_left_right(joints):
  """Return a mirrored copy of a joint sequence.

  The x coordinate of every joint is negated and the joints listed in the
  right chain trade places with their counterparts in the left chain.
  ``joints`` must be three-dimensional with a last axis of size 3; the input
  itself is left untouched.
  """
  assert len(joints.shape) == 3 and joints.shape[-1] == 3

  mirrored = joints.copy()
  mirrored[..., 0] *= -1

  right_side = [2, 5, 8, 11, 14, 17, 19, 21]
  left_side = [1, 4, 7, 10, 13, 16, 18, 20]

  # Indexing with a list on the right-hand side produces a copy, so both
  # sides can be exchanged in a single assignment.
  mirrored[:, right_side + left_side] = mirrored[:, left_side + right_side]
  return mirrored

%cd /content

/content
Loading whisper model...
Loading models for caption augmentation...



  torch.Tensor(smpl_data['betas'].r).unsqueeze(0))



Loading SMPL layer...


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Loading VIBE model...
/content/VIBE






=> loaded pretrained model from 'data/vibe_data/spin_model_checkpoint.pth.tar'
Performance of pretrained VIBE model on 3DPW: 56.56075477600098
Loaded pretrained VIBE weights from "data/vibe_data/vibe_model_wo_3dpw.pth.tar"
/content


In [None]:
#@title Single Person Detector (based on yolov7)
# NOTE: I changed line 21 in utils/google_utils to `file = Path(str(file))` (in Drive the uppercase matters...)

from pathlib import Path
import torch.backends.cudnn as cudnn
from numpy import random

print('yolo imports...')
sys.path.append(yolo_path+'/yolov7')
from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages
from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression, apply_classifier, \
    scale_coords, xyxy2xywh, strip_optimizer, set_logging, increment_path
from utils.plots import plot_one_box
from utils.torch_utils import select_device, load_classifier, time_synchronized, TracedModel

def load_yolo_model(weights=None, img_size=640, trace=True, device=''):
  """Load the YOLOv7 detector.

  Args:
    weights: List of weight files passed to ``attempt_load``. ``None`` (the
      default) resolves to ``[yolo_path + '/yolov7/yolov7.pt']``; using ``None``
      avoids a mutable list as default argument.
    img_size: Image size handed to ``TracedModel`` when tracing.
    trace: Wrap the model in ``TracedModel`` when True.
    device: Device string forwarded to ``select_device``.

  Returns:
    The loaded model, converted to half precision unless the selected device
    is the CPU.
  """
  if weights is None:
    weights = [yolo_path+'/yolov7/yolov7.pt']

  device = select_device(device)
  half = device.type != 'cpu'  # half precision is only used off the CPU

  # Load model
  model = attempt_load(weights, map_location=device)  # load FP32 model
  if trace:
      model = TracedModel(model, device, img_size)
  if half:
      model.half()  # to FP16

  return model


# Load the detector once; DataProcessor.get_vibe_poses hands it to SinglePersonDetector.
YOLO_MODEL = load_yolo_model()

def SinglePersonDetector(
    model, source, img_size=640, conf_thres=0.25,
    iou_thres=0.45, device='', view_img=False, save_txt=True, save_conf=False,
    nosave=False, classes=None, agnostic_nms=False, augment=False,
    update=False, project='runs/detect', name='exp', exist_ok=False, save_bboxes=True):
  """Detect people with YOLOv7 and keep the largest person box of every frame.

  Args:
    model: Detector as returned by ``load_yolo_model``.
    source: Image folder / video path, a ``.txt`` list, a numeric webcam id or
      a stream URL.
    img_size: Inference size; adjusted to the model stride.
    conf_thres, iou_thres, classes, agnostic_nms: Forwarded to
      ``non_max_suppression``.
    device: Device string forwarded to ``select_device``.
    view_img: Show annotated frames.
    save_txt: Write one label line per frame with a person under
      ``<save_dir>/labels``.
    save_conf: Append the confidence to each label line.
    nosave: Do not save annotated images / videos.
    augment: Forwarded to the model call.
    update, save_bboxes: Accepted for interface compatibility; not used here.
    project, name, exist_ok: Define the (incremented) output directory.

  Returns:
    dict with two parallel lists: ``'frames'`` (frame index of every frame in
    which a person was found) and ``'bbox'`` (``[c_x, c_y, w, h]`` arrays where
    ``w == h`` is the longer side of the largest person box in that frame).

  Frames whose detections contain no person are skipped: they contribute no
  entry (the previous frame's box is not repeated), and label / plot output
  uses the class and confidence of the selected person box.
  """
  imgsz = img_size

  save_img = not nosave and not source.endswith('.txt')  # save inference images
  webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
      ('rtsp://', 'rtmp://', 'http://', 'https://'))

  # Directories
  save_dir = Path(increment_path(Path(project) / name, exist_ok=exist_ok))  # increment run
  (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

  # Initialize
  set_logging()
  device = select_device(device)
  half = device.type != 'cpu'

  # Model attributes
  stride = int(model.stride.max())  # model stride
  imgsz = check_img_size(imgsz, s=stride)  # check img_size

  # Second-stage classifier (disabled)
  classify = False
  if classify:
      modelc = load_classifier(name='resnet101', n=2)  # initialize
      modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model']).to(device).eval()

  # Set Dataloader
  vid_path, vid_writer = None, None
  if webcam:
      view_img = check_imshow()
      cudnn.benchmark = True  # set True to speed up constant image size inference
      dataset = LoadStreams(source, img_size=imgsz, stride=stride)
  else:
      dataset = LoadImages(source, img_size=imgsz, stride=stride)

  # Get names and colors
  names = model.module.names if hasattr(model, 'module') else model.names
  colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]

  # Run inference once so later timings exclude start-up cost; no gradients are needed.
  if device.type != 'cpu':
      with torch.no_grad():
          model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once
  old_img_w = old_img_h = imgsz
  old_img_b = 1

  t0 = time.time()
  person_tracked = False  # becomes True once any frame contained a person
  bbox_info = {'frames':[], 'bbox': []}
  for path, img, im0s, vid_cap in tqdm(dataset):
      img = torch.from_numpy(img).to(device)
      img = img.half() if half else img.float()  # uint8 to fp16/32
      img /= 255.0  # 0 - 255 to 0.0 - 1.0
      if img.ndimension() == 3:
          img = img.unsqueeze(0)

      # Warmup whenever the input shape changes
      if device.type != 'cpu' and (old_img_b != img.shape[0] or old_img_h != img.shape[2] or old_img_w != img.shape[3]):
          old_img_b = img.shape[0]
          old_img_h = img.shape[2]
          old_img_w = img.shape[3]
          with torch.no_grad():
              for _ in range(3):
                  model(img, augment=augment)[0]

      # Inference
      t1 = time_synchronized()
      with torch.no_grad():   # Calculating gradients would cause a GPU memory leak
          pred = model(img, augment=augment)[0]
      t2 = time_synchronized()

      # Apply NMS
      pred = non_max_suppression(pred, conf_thres, iou_thres, classes=classes, agnostic=agnostic_nms)
      t3 = time_synchronized()

      # Apply Classifier
      if classify:
          pred = apply_classifier(pred, modelc, img, im0s)

      # Process detections
      for i, det in enumerate(pred):  # detections per image
          if webcam:  # batch_size >= 1
              p = Path(path[i])
              s, im0, frame = '%g: ' % i, im0s[i].copy(), dataset.count
          else:
              p = Path(path)
              s, im0, frame = '', im0s, getattr(dataset, 'frame', int(p.stem)-1)

          save_path = str(save_dir / p.name)  # img.jpg
          txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # img.txt
          gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh

          if len(det):
              # Rescale boxes from img_size to im0 size
              det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

              # Print results
              for c in det[:, -1].unique():
                  n = (det[:, -1] == c).sum()  # detections per class
                  s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

              # Results: pick the person box with the largest area in THIS frame
              person_in_frame = False
              biggest_bbox = 0

              for *xyxy, conf, cls in reversed(det):  # check all boxes & get main person out
                if cls == 0:  # get persons (idx for person = 0)
                  # MPT style
                  xyxy = [d.cpu() for d in xyxy]
                  w, h = xyxy[2] - xyxy[0], xyxy[3] - xyxy[1]
                  c_x, c_y = xyxy[0] + w/2, xyxy[1] + h/2

                  if float(w*h) > biggest_bbox:
                    person_in_frame = True
                    biggest_bbox = float(w*h)
                    w = h = np.where(w / h > 1, w, h)  # square box with the longer side
                    bbox_MPT_main = np.array([c_x, c_y, w, h])
                    xyxy_main = xyxy
                    conf_main, cls_main = conf, cls

              if person_in_frame:   # only write if a person was found in this frame
                person_tracked = True
                bbox_info['frames'] += [frame]
                bbox_info['bbox'] += [bbox_MPT_main]

                if save_txt:  # Write to file
                    line = (cls_main, *bbox_MPT_main, conf_main) if save_conf else (cls_main, *bbox_MPT_main)  # label format
                    with open(txt_path + '.txt', 'a') as f:
                        f.write(('%g ' * len(line)).rstrip() % line + '\n')

                if save_img or view_img:  # Add bbox to image
                    label = f'{names[int(cls_main)]} {conf_main:.2f}'
                    plot_one_box(xyxy_main, im0, label=label, color=colors[int(cls_main)], line_thickness=1)

          # Print time (inference + NMS)
          # print(f'{s}Done. ({(1E3 * (t2 - t1)):.1f}ms) Inference, ({(1E3 * (t3 - t2)):.1f}ms) NMS')

          # Stream results
          if view_img and person_tracked:
              cv2.imshow(str(p), im0)
              cv2.waitKey(1)  # 1 millisecond

          # Save results (image with detections)
          if save_img and person_tracked:
              if dataset.mode == 'image':
                  cv2.imwrite(save_path, im0)
                  print(f" The image with the result is saved in: {save_path}")
              else:  # 'video' or 'stream'
                  if vid_path != save_path:  # new video
                      vid_path = save_path
                      if isinstance(vid_writer, cv2.VideoWriter):
                          vid_writer.release()  # release previous video writer
                      if vid_cap:  # video
                          fps = vid_cap.get(cv2.CAP_PROP_FPS)
                          w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                          h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                      else:  # stream
                          fps, w, h = 30, im0.shape[1], im0.shape[0]
                          save_path += '.mp4'
                      vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                  vid_writer.write(im0)

  # Close the last video writer so the output file is finalised.
  if isinstance(vid_writer, cv2.VideoWriter):
      vid_writer.release()

  # Only report saved results when something was actually tracked.
  if (save_txt or save_img) and person_tracked:
      s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
      print(f"Results saved to {save_dir}{s}")

  if not person_tracked:
    shutil.rmtree(save_dir)
    print("No person was tracked so no output :(")

  print(f'Done. ({time.time() - t0:.3f}s)')
  return bbox_info

yolo imports...
Fusing layers... 


  if param.grad is not None:



RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
 Convert model to Traced-model... 
 traced_script_module saved! 
 model is traced! 



In [None]:
#@title DataProcessor

class DataProcessor:

  def __init__(self, data_dir, device=None, video_dir=None):
    """Set up the pipeline state.

    Args:
      data_dir: Dataset folder. It must contain ``video_files.csv``; all
        outputs are written below it.
      device: Torch device for VIBE inference. Defaults to the notebook-level
        ``DEVICE``.
      video_dir: Folder where downloaded videos are stored. Defaults to the
        notebook-level ``video_path`` setting, which is what was always used
        before this parameter existed.
    """
    self.data_dir = data_dir
    self.video_path = video_path if video_dir is None else video_dir
    self.videos_df = pd.read_csv(osp.join(self.data_dir, 'video_files.csv'))

    self.device = DEVICE if device is None else device

    # Models are loaded once at notebook level and shared by reference.
    self.whisper_model = WHISPER_MODEL
    self.glove_model = GLOVE_MODEL
    self.emotion_tokenizer = EMOTION_TOKENIZER
    self.emotion_model = EMOTION_MODEL
    self.vibe_model = VIBE_MODEL
    self.SMPL_layer = SMPL_LAYER
    self.nlp = NLP

    self.min_frames = 25          # shorter pose sequences are not saved
    self.vibe_batch_size = 128
    self.tracker_batch_size = 12
    self.yolo_img_size = 416
    self.display = False
    self.render = False
    self.bbox_scale = 1.1         # passed to VIBE's Inference dataset as `scale`
    self.smooth_pose = False

  def download_yt_video(self, url, video_idx):

    if osp.isfile(f'{self.video_path}/{video_idx}.mp4'):
      print(f'yt video exists: {self.video_path}/{video_idx}.mp4')
      return f'{self.video_path}/{video_idx}.mp4', self.videos_df.loc[video_idx, ('fps')]

    yt = YouTube(url, use_oauth=True)
    # stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    stream = yt.streams.filter(file_extension='mp4').order_by('resolution').desc().first()
    print(f'video resolution= {stream.resolution}')
    video_fps = stream.fps
    video_file = stream.download(output_path=f'{self.video_path}', filename=f'{video_idx}.mp4')

    audio_stream = yt.streams.filter(type='audio').desc().first()
    audio_file = audio_stream.download(output_path='/tmp', filename=f'{video_idx}.wav')

    if video_file is None:
      exit('Youtube url is not valid!')
    print(f'YouTube Video has been downloaded to {video_file}...')

    if audio_file is None:
      exit('Youtube url is not valid!')
    print(f'Audio has been downloaded to {audio_file}...')

    return video_file, video_fps

  def get_whisper_captions(self, video_file):
    """Transcribe a video's audio with word timestamps, caching the result.

    Captions are cached in ``{data_dir}/caption_output/{video_idx}.pkl``. The
    cache is consulted first, so no audio is extracted when captions already
    exist. Otherwise ``/tmp/{video_idx}.wav`` is used if present, or extracted
    from the video file.

    Args:
      video_file: Path of the ``.mp4`` file; its stem is the video index.

    Returns:
      The whisper result converted with ``to_dict()``.

    Raises:
      ValueError: If audio has to be extracted but the video has no audio track.
    """
    video_idx = osp.basename(video_file).replace('.mp4', '')
    audio_file = osp.join('/tmp', f'{video_idx}.wav')

    caps_dir = osp.join(self.data_dir, 'caption_output')
    Path(caps_dir).mkdir(parents=True, exist_ok=True)
    caps_file = osp.join(caps_dir, f"{video_idx}.pkl")

    if osp.isfile(caps_file):
      print(f'caps file exists: {caps_file}')
      return joblib.load(caps_file)

    if osp.isfile(audio_file):
      print(f'audio video exists: {audio_file}')
    else:
      clip = mopy.editor.VideoFileClip(video_file)
      try:
        if clip.audio is None:
          raise ValueError(f'{video_file} has no audio track to transcribe')
        clip.audio.write_audiofile(audio_file, codec='pcm_s16le')
      finally:
        clip.close()  # release the file readers held by the clip

    captions = self.whisper_model.transcribe(audio_file, word_timestamps=True).to_dict()
    joblib.dump(captions, caps_file)
    return captions

  def process_captions(self, captions):
    # function inspired by HumanML3D github

    processed_captions = []

    for caption in captions['segments']:
      text = caption['text']
      sentences = self.augment_captions(text).split('\n')
      tokens = []

      for text in sentences:
        doc = self.nlp(text)

        if doc == "" or doc == " ":
          # save empty as X -> https://spacy.io/api/token#attributes
          tokens += [' /X']

        else:
          word_list, pos_list = [], []
          for token in doc:
            word = token.text
            if not word.isalpha():
              continue
            if (token.pos_ == 'NOUN' or token.pos_ == 'VERB') and (word != 'left'):
              word_list.append(token.lemma_)
            else:
              word_list.append(word)
            pos_list.append(token.pos_)
          tokens += [' '.join([f'{word_list[i]}/{pos_list[i]}' for i in range(len(word_list))])]

      # timestamp (s) to frame_count
      start = caption['start']
      end = caption['end']

      caption_info = {"text": sentences, "tokens": tokens, "start": start, "end": end}
      processed_captions.append(caption_info)

    return processed_captions

  def augment_captions(self, caption, emotions=True, similar_words=True):
    """Expand a caption with a detected emotion and emotion-related words.

    Args:
      caption: Caption text.
      emotions: Classify the caption with the emotion model when True;
        otherwise the emotion is the empty string.
      similar_words: Look up GloVe neighbours of the emotion word and keep up
        to five whose VADER compound polarity lies within 0.2 of the emotion's
        polarity. Skipped when False or when there is no emotion word.

    Returns:
      Three newline-separated lines: the caption, the caption followed by the
      emotion and related words, and the emotion with related words alone.
    """
    sia = SentimentIntensityAnalyzer()

    emotion = ""
    if emotions:
      input_ids = self.emotion_tokenizer.encode(caption, return_tensors='pt')
      emotion_output = self.emotion_model.generate(input_ids=input_ids, max_length=20)
      emotion = self.emotion_tokenizer.decode(emotion_output[0], skip_special_tokens=True)

    same_polarity_words = ""
    if similar_words and emotion:   # TODO: only simwords for the emotion word??
      try:
        # could premake this bc it has only 5 emotions
        neighbours = [w[0] for w in self.glove_model.most_similar(emotion)]
        emotion_polarity = sia.polarity_scores(emotion)['compound']
        matching = [
            word for word in neighbours
            if emotion_polarity - 0.2 <= sia.polarity_scores(word)['compound'] <= emotion_polarity + 0.2
        ]
        same_polarity_words = " ".join(matching[:5])

      except Exception as e:  # best effort: e.g. the emotion is not in the GloVe vocabulary
        print(f"Didn't work for emotion {emotion}, {e}")
        same_polarity_words = ""

    return f'{caption}\n{caption} {emotion} {same_polarity_words}\n{emotion} {same_polarity_words}'

  def get_vibe_poses(self, video_file):
    """Estimate 3D poses for a video with YOLOv7 tracking and VIBE.

    Inspired by the VIBE github code but modified; VIBE must be available in a
    subfolder. Results are cached in ``{data_dir}/vibe_output/{video_idx}.pkl``
    and returned from there when present.

    Steps: extract frames (or reuse ``/tmp/{video_idx}_mp4`` if it already
    holds images), detect the main person per frame, run VIBE on the cropped
    frames, convert to SMPL-24 joints and post-process them.

    Args:
      video_file: Path of the ``.mp4`` file; its stem is the video index.

    Returns:
      dict with ``pred_cam``, ``verts``, ``pose``, ``betas``, ``joints3d``,
      ``bboxes``, ``frame_ids``, ``smpl24_joints`` and ``smpl24_joints_smooth``.
      ``bboxes`` and ``frame_ids`` only cover frames whose batch was processed
      successfully, so they stay aligned with the pose arrays.

    Raises:
      RuntimeError: If no pose could be estimated at all.
    """
    video_idx = osp.basename(video_file).replace('.mp4', '')
    vibe_dir = osp.join(self.data_dir, 'vibe_output')
    Path(vibe_dir).mkdir(parents=True, exist_ok=True)
    output_file = osp.join(vibe_dir, f"{video_idx}.pkl")

    if osp.isfile(output_file):
      print(f'VIBE file exists: {output_file}')
      return joblib.load(output_file)

    # Reuse previously extracted frames; an empty folder counts as "not extracted".
    image_folder = f'/tmp/{video_idx}_mp4'
    img_files = os.listdir(image_folder) if osp.isdir(image_folder) else []
    if img_files:
      num_frames = len(img_files)
    else:
      image_folder, num_frames, _ = video_to_images(video_file, return_info=True)

    print(f'Video {video_idx}, number of frames: {num_frames}')
    total_time = time.time()

    print('Tracking pose...')
    tracking_results = SinglePersonDetector(
        YOLO_MODEL, source=image_folder,
        conf_thres=0.85, save_txt=False,
        save_conf=False, nosave=True,
        project='data/SWDance/videos',
    )

    bboxes = np.array(tracking_results['bbox'])
    frames = np.array(tracking_results['frames'])

    print(f'Running VIBE...')
    dataset = Inference(
        image_folder=image_folder,
        frames=frames,
        bboxes=bboxes,
        joints2d=None,
        scale=self.bbox_scale,
    )

    dataloader = DataLoader(dataset, batch_size=self.vibe_batch_size, num_workers=4)

    pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d = [], [], [], [], []
    kept_indices = []   # dataset indices of the batches that were processed
    next_index = 0

    with torch.no_grad():
      for batch in tqdm(dataloader):
        # The loader is not shuffled, so a batch covers consecutive dataset indices.
        batch_indices = np.arange(next_index, next_index + len(batch))
        next_index += len(batch)

        try:
          batch = batch.unsqueeze(0)
          batch = batch.to(self.device)
          batch_size, seqlen = batch.shape[:2]

          output = self.vibe_model(batch)[-1]

          cam = output['theta'][:, :, :3].reshape(batch_size * seqlen, -1)
          verts = output['verts'].reshape(batch_size * seqlen, -1, 3)
          pose = output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1)
          betas = output['theta'][:, :,75:].reshape(batch_size * seqlen, -1)
          joints3d = output['kp_3d'].reshape(batch_size * seqlen, -1, 3)

        except Exception as e:
          # Best effort: skip the batch, but remember that its frames are missing.
          print("Error:", e)
          continue

        # Append only after the whole batch succeeded so all lists stay in step.
        pred_cam.append(cam)
        pred_verts.append(verts)
        pred_pose.append(pose)
        pred_betas.append(betas)
        pred_joints3d.append(joints3d)
        kept_indices.append(batch_indices)

    if not pred_pose:
      raise RuntimeError(
          f'VIBE produced no poses for video {video_idx} '
          '(no person was tracked or every batch failed).')

    kept_indices = np.concatenate(kept_indices)
    # assumes the Inference dataset exposes the arrays it was given as .bboxes / .frames
    kept_bboxes = np.asarray(dataset.bboxes)[kept_indices]
    kept_frames = np.asarray(dataset.frames)[kept_indices]

    pred_pose = torch.cat(pred_pose, dim=0).cpu()
    pred_betas = torch.cat(pred_betas, dim=0).cpu()

    print('Transform pose to SMPL24 ...')
    smpl24_joints = self.transform_pose_smpl24(pred_pose, pred_betas)
    print('Post process pose and smooth...')
    smpl24_joints_smooth = self.post_process_pose(kept_bboxes, smpl24_joints)

    output_dict = {
            'pred_cam': torch.cat(pred_cam, dim=0).cpu().numpy(),
            'verts': torch.cat(pred_verts, dim=0).cpu().numpy(),
            'pose': pred_pose.numpy(),
            'betas': pred_betas.numpy(),
            'joints3d': torch.cat(pred_joints3d, dim=0).cpu().numpy(),
            'bboxes': kept_bboxes,
            'frame_ids': kept_frames,
            'smpl24_joints': smpl24_joints,
            'smpl24_joints_smooth': smpl24_joints_smooth
            }

    total_time = time.time() - total_time
    print(f'Total time: {total_time:.2f}s for {num_frames} frames.')

    print(f'Saving output results to \"{output_file}\".')
    joblib.dump(output_dict, output_file)
    # shutil.rmtree(image_folder)   # deletes the images
    return output_dict

  def transform_pose_smpl24(self, pose, betas):
    """Run the SMPL layer on ``pose``/``betas`` and return its joints, flipped upright."""
    # The layer returns (vertices, joints); only the joints are needed here.
    _, joints_24 = self.SMPL_layer(pose, th_betas=betas)

    # VIBE output is upside down, so rotate by -pi about the x axis.
    flip_about_x = transforms3d.euler.axangle2mat([1, 0, 0], -np.pi)
    return np.dot(joints_24, flip_about_x)

  def align_captions_poses(self, captions, pose_dict, index_df, video_idx):

    print(f'Aligning poses and caps for video {video_idx}...')
    file_idx = 0 if index_df.empty else index_df['file_idx'].iloc[-1]+1
    curr_timestamp = 0

    for caption in captions:
      if caption['start'] > curr_timestamp:
        # frames with NO captions
        file_idx, index_df = self.save_pose(
            file_idx, video_idx, pose_dict, index_df,
            start=curr_timestamp,
            end=caption['start'],
            caption=False)

      # frames WITH captions
      file_idx, index_df = self.save_pose(
          file_idx, video_idx, pose_dict, index_df,
          start=caption['start'],
          end=caption['end'],
          caption=caption)
      curr_timestamp = caption['end']

    return index_df

  def fill_missing_frames(self, joints):

    consec_missing = []   # TODO: fill up with in-betweening??

    for i in range(len(joints)):
      if np.isnan(joints[i]).all():
        consec_missing += [i]
        if len(consec_missing) > 10:
          print('more than 10 are missing consecutively ', i)
        joints[i] = joints[i-1]  # fill with prev joints
      else:
        consec_missing = []

    return joints

  def save_pose(self, file_idx, video_idx, pose_dict, index_df, start, end, caption=True):
    """Save the poses between two timestamps as one dataset sample (plus its mirror).

    Frames in ``[start, end]`` are looked up in ``pose_dict['frame_ids']``
    (first appearance); frames without a pose are NaN placeholders. Missing
    frames at both ends are trimmed and ``start``/``end`` are shifted by the
    exact duration of the trimmed frames. Remaining gaps are filled by
    ``fill_missing_frames``. Nothing is written if no frame has a pose or if
    fewer than ``self.min_frames`` frames remain.

    For the sample and its mirrored copy (prefix ``M``) this writes
    ``joints/<name>.npy`` plus ``texts``, ``tokens`` and ``just_texts`` files,
    and appends a row to ``index_df``.

    Args:
      file_idx: Running sample number, used for the file name.
      video_idx: Row label of the video in ``self.videos_df`` (its ``fps`` is read).
      pose_dict: Output of ``get_vibe_poses``.
      index_df: Index DataFrame to extend.
      start, end: Time span in seconds.
      caption: Processed caption dict (``text`` and ``tokens`` lists), or a
        falsy value for a sample without caption.

    Returns:
      ``(file_idx, index_df)``; ``file_idx`` is incremented only if a sample was written.
    """
    filename = get_filename(file_idx)
    video_fps = self.videos_df.loc[video_idx, ('fps')]

    # Map each frame id to the position of its first appearance (one pass
    # instead of one array scan per requested frame).
    first_position = {}
    for position, frame_id in enumerate(pose_dict['frame_ids']):
      first_position.setdefault(int(frame_id), position)

    smooth_joints = pose_dict['smpl24_joints_smooth']
    joints = [
        smooth_joints[first_position[frame_id]] if frame_id in first_position
        else np.full([24, 3], np.nan)
        for frame_id in range(round(start*video_fps), round(end*video_fps)+1)
    ]

    has_pose = [not np.isnan(j).all() for j in joints]
    if not any(has_pose):
      print(f'no joints for video {video_idx} at {start} to {end}')
      return file_idx, index_df

    # Remove the missing frames at the beginning and end of the list. The time
    # shift is computed in one step; adding a rounded 1/fps per frame drifts.
    leading = has_pose.index(True)
    trailing = has_pose[::-1].index(True)
    joints = joints[leading:len(joints) - trailing]
    start += leading / video_fps
    end -= trailing / video_fps

    if len(joints) < self.min_frames:
      print(f'not enough joints for video {video_idx} at {start} to {end}')
      return file_idx, index_df

    joints = self.fill_missing_frames(joints)
    joints = np.stack(joints, axis=0)

    joints_dir = osp.join(self.data_dir, 'joints')
    text_dir = osp.join(self.data_dir, 'texts')
    tokens_dir = osp.join(self.data_dir, 'tokens')
    just_texts_dir = osp.join(self.data_dir, 'just_texts')
    for directory in (joints_dir, text_dir, tokens_dir, just_texts_dir):
      os.makedirs(directory, exist_ok=True)

    for side in ['', 'M']: # mirror pose
      if side == 'M':
        joints = swap_left_right(joints)

      sample_name = f'{side}{filename}'
      np.save(osp.join(joints_dir, f'{sample_name}.npy'), joints)

      df_row = {'file_idx': file_idx,
                'video_idx': video_idx,               # which video it belongs to
                'start_frame': round(start*video_fps),
                'end_frame': round(end*video_fps),
                'start_time': start,
                'end_time': end,
                'new_name': f'{sample_name}.npy',     # named for humanml3d
                'fps': video_fps,
                'caption': "" if not caption else caption["text"][0],
                'no_frames': len(joints)}
      index_df = pd.concat([index_df, pd.DataFrame([df_row])], ignore_index=True)

      if caption:
        # HumanML3D text format: one "<text>#<tokens>#0.0#0.0" line per sentence
        text_content = ''.join(
            f'{t}#{tok}#0.0#0.0\n' for t, tok in zip(caption["text"], caption["tokens"]))
        tokens_content = '\n'.join(caption["tokens"])
        just_text_content = '\n'.join(caption["text"])
      else:
        # placeholders for a sample without caption
        text_content = ' # /X#0.0#0.0\n # /X#0.0#0.0\n # /X#0.0#0.0\n'
        tokens_content = ' /X\n /X\n /X\n'
        just_text_content = ' \n \n \n'

      with open(osp.join(text_dir, f'{sample_name}.txt'), 'w') as f:
        f.write(text_content)
      with open(osp.join(tokens_dir, f'{sample_name}.txt'), 'w') as f:
        f.write(tokens_content)
      with open(osp.join(just_texts_dir, f'{sample_name}.txt'), 'w') as f:
        f.write(just_text_content)

    file_idx += 1

    return file_idx, index_df

  def post_process_pose(self, bboxes, joints):
    # first, get x,y movement from bboxes
    # normalise bboxes between 0 & 1
    bbox_diff = np.max(bboxes, axis=0) - np.min(bboxes, axis=0)
    bbox_norm = (bboxes-np.min(bboxes, axis=0))/bbox_diff

    origin = bbox_norm[0]
    delta_bbox = bbox_norm - origin # change from every bbox from origin

    for i, joint in enumerate(joints):
      joints[i, :, 0] = joints[i, :, 0] - delta_bbox[i, 0]/2  # estimated x movement
      joints[i, :, 1] = joints[i, :, 1] - delta_bbox[i, 1]/2  # estimated y movement

    # then, translate pose to be standing on the floor
    # lowest y coordinate of feet joints (= 10 & 11)
    lowest = min(np.min(joints[:, 10, 1]), np.min(joints[:, 11, 1]))
    diff = 0 - lowest
    joints[:, :, 1] = joints[:, :, 1] + diff

    # finally, smooth pose with smoothing filter
    joints = savgol_filter(joints, window_length=12, polyorder=3, axis=0)

    return joints

  def run(self):
    try:
      index_df = pd.read_csv(osp.join(self.data_dir, 'index.csv'))
    except:
      index_df = pd.DataFrame()

    for i, video in self.videos_df.iterrows():
      print(f'Processing video {i}...')

      video_file = video['video_file']
      if video['processed'] == True:
        video_fps = video['fps']
        print(f'video {i} is already processed')
        continue
      elif any(yt in video_file for yt in ["youtube", "youtu.be"]):
        try:
          video_file, video_fps = self.download_yt_video(video_file, i)
        except Exception as e:
          print(f'Could not download video {i}, {e}')
          continue

      vibe_output = self.get_vibe_poses(video_file)

      caps = self.get_whisper_captions(video_file)
      processed_caps = self.process_captions(caps)

      index_df = self.align_captions_poses(processed_caps, vibe_output, index_df, i)
      self.videos_df.loc[i, ('processed')] = True
      self.videos_df.loc[i, ('fps')] = video_fps

      # update dfs after each video
      index_df.to_csv(osp.join(self.data_dir, 'index.csv'), index=False)
      self.videos_df.to_csv(osp.join(self.data_dir,'video_files.csv'), index=False)

      vibe_output = caps = processed_caps = None # does this clear RAM?

    return index_df

# Run

In [None]:
%cd /content
# Source of the videos; the next cell turns it into video_files.csv.
playlist = 'https://www.youtube.com/playlist?list=PLtWL_OMHER4XstPsqS5y-nHUURORdtUkI' # @param {type: "string"}
# also possible: link to a folder with .mp4 videos or a link to a single yt video

/content


In [None]:
# create .csv of youtube links (or of local video files).
if any(yt in playlist for yt in ["youtube", "youtu.be"]):
  video_list = Playlist(playlist) if 'playlist' in playlist else [playlist]
else:
  # Store full paths (os.listdir only returns bare names, which cannot be
  # opened from another working directory) and keep only real .mp4 files.
  video_list = sorted(osp.join(playlist, f) for f in os.listdir(playlist) if f.endswith('.mp4'))
print(f'Number Of Videos: {len(video_list)}')

# 'fps' starts as a placeholder of 20; DataProcessor.run overwrites it per video.
videos_df = pd.DataFrame({'video_file':video_list, 'processed':False, 'fps':20})
videos_df.to_csv(osp.join(data_dir, 'video_files.csv'), index=False)
videos_df

Number Of Videos: 2


Unnamed: 0,video_file,processed,fps
0,https://www.youtube.com/watch?v=fSHjGPWmkb8,False,20
1,https://www.youtube.com/watch?v=XfCK-Kj_ST0,False,20


In [None]:
# Reads {data_dir}/video_files.csv, so the cell above has to run first.
dp = DataProcessor(data_dir)

In [None]:
# Process all videos not yet marked as processed; returns the sample index.
index_df = dp.run()

### Check dataset

In [None]:
# Reload the index from disk to inspect what the pipeline has written.
index_df = pd.read_csv(f'{data_dir}/index.csv')
index_df.head()

Unnamed: 0,file_idx,video_idx,start_frame,end_frame,start_time,end_time,new_name,fps,caption,no_frames
0,0,0,149,177,7.45,8.84,000000.npy,20,,29
1,0,0,149,177,7.45,8.84,M000000.npy,20,,29
2,1,0,177,203,8.84,10.14,000001.npy,20,"breathing of statues,",27
3,1,0,177,203,8.84,10.14,M000001.npy,20,"breathing of statues,",27
4,2,0,203,238,10.14,11.9,000002.npy,20,,36


In [None]:
total_frames = sum(index_df['no_frames'])
fps = 20  # fallback for rows that have no fps recorded
# Use each sample's own fps so videos with different frame rates are counted correctly.
total_seconds = (index_df['no_frames'] / index_df['fps'].fillna(fps)).sum()
print(f'{len(index_df)} files (incl. mirrored), {round(total_seconds/60, 2)} minutes == {round(total_seconds/3600, 2)} hours of movement')

56 files (incl. mirrored), 1.97 minutes == 0.03 hours of movement


In [None]:
# Load the first file listed in the joints folder as a spot check.
data = np.load(f'{data_dir}/joints/' + os.listdir(f'{data_dir}/joints')[0])
data.shape  # should be (n, 24, 3), where n is the number of frames for the sequence.

(93, 24, 3)

In [None]:
# Distinct fps values recorded in the index; more than one is expected when the
# source videos differ in frame rate.
index_df['fps'].unique() # I think this part is not working, all have the same fps...

array([20])

### Run this if you want to reset things

In [None]:
!rm {data_dir}/index.csv

videos_df = pd.read_csv(f'{data_dir}/video_files.csv')
videos_df.loc[1, ('processed')] = False  # TODO: do this for every video
videos_df.to_csv(osp.join(data_dir,'video_files.csv'), index=False)
print(videos_df.head())

!rm {data_dir}/joints/*

# Visualise motions
To check if everything went well, you can visualise one of the motions.

In [None]:
import os
from os.path import join as pjoin
from tqdm import tqdm
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation, PillowWriter
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
import mpl_toolkits.mplot3d.axes3d as p3
import joblib

# Joint-index chains for drawing the skeleton: each inner list is plotted as one
# connected line by plot_3d_motion. Indices refer to the joint axis of the loaded
# joints array (presumably the SMPL joint ordering — TODO confirm).
kinematic_chain = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10], [0, 3, 6, 9, 12, 15], [9, 14, 17, 19, 21], [9, 13, 16, 18, 20]]

def plot_3d_motion(save_path, kinematic_tree, joints, title, figsize=(10, 10), fps=120, radius=4):
    """Render a joint sequence as a 3D stick-figure animation and save it to `save_path`.

    Args:
        save_path: Output file path handed to `FuncAnimation.save`.
        kinematic_tree: List of joint-index chains; each chain is drawn as one line.
        joints: Array of joint positions; reshaped to (frames, -1, 3) internally.
            The input is copied, so the caller's array is not modified.
        title: Figure title. Titles longer than 10 space-separated words are
            wrapped onto two lines after the 10th word.
        figsize: Matplotlib figure size.
        fps: Playback rate; used both for the frame interval and for the saved file.
        radius: Extent of the plotted volume (x spans [-radius/2, radius/2],
            y and z span [0, radius]).

    NOTE(review): `ax.grid(b=False)` and assigning to `ax.lines` / `ax.collections`
    rely on the matplotlib version pinned in the setup cell (3.3.4); newer releases
    may reject them — confirm before upgrading.
    """

    title_sp = title.split(' ')
    if len(title_sp) > 10:
        title = '\n'.join([' '.join(title_sp[:10]), ' '.join(title_sp[10:])])
    def init():
        # Fixed axis limits, figure title, no grid. Uses `ax`/`fig` created below.
        ax.set_xlim3d([-radius / 2, radius / 2])
        ax.set_ylim3d([0, radius])
        ax.set_zlim3d([0, radius])
        fig.suptitle(title, fontsize=20)
        ax.grid(b=False)

    def plot_xzPlane(minx, maxx, miny, minz, maxz):
        ## Plot a plane XZ
        # Semi-transparent grey rectangle at height `miny`, used as the floor.
        verts = [
            [minx, miny, minz],
            [minx, miny, maxz],
            [maxx, miny, maxz],
            [maxx, miny, minz]
        ]
        xz_plane = Poly3DCollection([verts])
        xz_plane.set_facecolor((0.5, 0.5, 0.5, 0.5))
        ax.add_collection3d(xz_plane)

    data = joints.copy().reshape(len(joints), -1, 3)
    fig = plt.figure(figsize=figsize)
    ax = p3.Axes3D(fig)
    init()
    # Per-coordinate extremes over all frames and joints (before any shifting below).
    MINS = data.min(axis=0).min(axis=0)
    MAXS = data.max(axis=0).max(axis=0)
    # One colour per chain; zip() in update() stops at the shorter of chains/colours.
    colors = ['red', 'blue', 'black', 'red', 'blue',
              'darkblue', 'darkblue', 'darkblue', 'darkblue', 'darkblue',
             'darkred', 'darkred','darkred','darkred','darkred']
    frame_number = data.shape[0]

    # Shift everything so the lowest point of the whole sequence sits at y == 0.
    height_offset = MINS[1]
    data[:, :, 1] -= height_offset
    # Root (joint 0) x/z path, captured before the per-frame centring below.
    trajec = data[:, 0, [0, 2]]

    # Centre every frame on its root joint in x and z; the floor and the
    # trajectory are moved in the opposite direction in update() instead.
    data[..., 0] -= data[:, 0:1, 0]
    data[..., 2] -= data[:, 0:1, 2]

    def update(index):
        # Clear what the previous frame drew, then redraw floor, path and skeleton.
        ax.lines = []
        ax.collections = []
        ax.view_init(elev=120, azim=-90)
        ax.dist = 7.5
        plot_xzPlane(MINS[0]-trajec[index, 0], MAXS[0]-trajec[index, 0], 0, MINS[2]-trajec[index, 1], MAXS[2]-trajec[index, 1])

        # Root path so far, drawn on the floor (y == 0) relative to the current root position.
        if index > 1:
            ax.plot3D(trajec[:index, 0]-trajec[index, 0], np.zeros_like(trajec[:index, 0]), trajec[:index, 1]-trajec[index, 1], linewidth=1.0,
                      color='blue')


        # The first five chains are drawn thicker than any further ones.
        for i, (chain, color) in enumerate(zip(kinematic_tree, colors)):
            if i < 5:
                linewidth = 4.0
            else:
                linewidth = 2.0
            ax.plot3D(data[index, chain, 0], data[index, chain, 1], data[index, chain, 2], linewidth=linewidth, color=color)

        plt.axis('off')
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_zticklabels([])

    # One animation frame per motion frame; interval is in milliseconds.
    ani = FuncAnimation(fig, update, frames=frame_number, interval=1000/fps, repeat=False)

    ani.save(save_path, fps=fps)
    print('animation is saved')
    plt.close()


In [None]:
save_path = '/content/animation.mp4' #@param {type: 'string'}
# Animate one extracted motion: whichever joint file os.listdir happens to return first.
first_joint_file = os.listdir(f'{data_dir}/joints')[0]
joints = np.load(f'{data_dir}/joints/' + first_joint_file)
plot_3d_motion(save_path, kinematic_chain, joints, title=" ", fps=25, radius=4)

animation is saved


# Post process with HumanML3D

Code largely taken from HumanML3D's `motion_representation.ipynb`, lightly edited and cleaned up.

In [None]:
%cd /content
!git clone https://github.com/EricGuo5513/HumanML3D.git

[Errno 2] No such file or directory: 'content'
/content
Cloning into 'HumanML3D'...
remote: Enumerating objects: 192, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 192 (delta 46), reused 35 (delta 27), pack-reused 126[K
Receiving objects: 100% (192/192), 132.74 MiB | 15.94 MiB/s, done.
Resolving deltas: 100% (68/68), done.


In [None]:
#@title Imports
%cd HumanML3D
from os.path import join as pjoin

from common.skeleton import Skeleton
import numpy as np
import os
from common.quaternion import *
from paramUtil import *

import torch
from tqdm import tqdm
import os

/content/HumanML3D


In [None]:
# @title Util functions
def uniform_skeleton(positions, tgt_offset):
  """Re-pose a joint sequence on the skeleton described by `tgt_offset`.

  The root trajectory is scaled by the ratio of target to source leg length,
  joint rotations are obtained with `Skeleton.inverse_kinematics_np`, and the
  joints are rebuilt with `Skeleton.forward_kinematics_np` after switching the
  skeleton to the target offsets.

  Reads the notebook globals `n_raw_offsets`, `kinematic_chain` and
  `face_joint_indx`.

  Args:
    positions: numpy array of joint positions indexed as [frame, joint, xyz].
    tgt_offset: joint offsets of the target skeleton.

  Returns:
    The joints produced by forward kinematics on the target skeleton.
  """
  skeleton = Skeleton(n_raw_offsets, kinematic_chain, 'cpu')
  first_frame = torch.from_numpy(positions[0])
  src_offset = skeleton.get_offsets_joints(first_frame)

  def leg_length(offsets):
    # Offsets 5 and 8 are the lower legs; each one contributes its largest absolute component.
    return np.abs(offsets[5]).max() + np.abs(offsets[8]).max()

  scale_rt = leg_length(tgt_offset) / leg_length(src_offset)
  scaled_root = torch.from_numpy(positions[:, 0]) * scale_rt

  # Rotations come from the source motion; only afterwards is the skeleton
  # switched to the target offsets for forward kinematics.
  rotations = skeleton.inverse_kinematics_np(positions, face_joint_indx)
  skeleton.set_offset(tgt_offset)
  return skeleton.forward_kinematics_np(rotations, scaled_root)


def recover_root_rot_pos(data):
  """Integrate the root channels of a feature tensor back into rotation and position.

  Uses the first four feature channels: 0 = root rotation velocity,
  1:3 = root x/z linear velocity, 3 = root height.

  Args:
    data: torch tensor indexed as [..., frame, feature].

  Returns:
    (r_rot_quat, r_pos): a [..., frame, 4] tensor whose components 0 and 2 hold
    cos/sin of the accumulated rotation angle (components 1 and 3 stay zero),
    and a [..., frame, 3] tensor of root positions.
  """
  rot_vel = data[..., 0]
  # The angle at frame t is the sum of the velocities of all earlier frames,
  # so shift the velocities by one frame (zero first) before accumulating.
  leading_zero = torch.zeros_like(rot_vel[..., :1])
  shifted_vel = torch.cat([leading_zero, rot_vel[..., :-1]], dim=-1)
  heading = torch.cumsum(shifted_vel, dim=-1)

  frame_shape = data.shape[:-1]
  root_quat = torch.zeros(frame_shape + (4,)).to(data.device)
  root_quat[..., 0] = torch.cos(heading)
  root_quat[..., 2] = torch.sin(heading)

  # Same one-frame shift for the x/z velocities; y is left at zero for now.
  root_pos = torch.zeros(frame_shape + (3,)).to(data.device)
  root_pos[..., 1:, [0, 2]] = data[..., :-1, 1:3]

  # Rotate the velocities with the inverse root rotation, then accumulate them.
  root_pos = qrot(qinv(root_quat), root_pos)
  root_pos = torch.cumsum(root_pos, dim=-2)
  # Height is stored directly in channel 3, not as a velocity.
  root_pos[..., 1] = data[..., 3]

  return root_quat, root_pos


def recover_from_rot(data, joints_num, skeleton):
  """Rebuild joint positions from the rotation part of a feature tensor.

  Args:
    data: torch tensor indexed as [..., frame, feature], laid out as
      4 root channels, (joints_num - 1) * 3 position channels, then
      (joints_num - 1) * 6 continuous-6D rotation channels, followed by the
      remaining features.
    joints_num: number of joints in the skeleton, including the root.
    skeleton: object providing `forward_kinematics_cont6d(cont6d_params, root_pos)`.

  Returns:
    Whatever `skeleton.forward_kinematics_cont6d` returns for the recovered
    rotations and root positions.
  """
  r_rot_quat, r_pos = recover_root_rot_pos(data)
  r_rot_cont6d = quaternion_to_cont6d(r_rot_quat)

  # Skip the 4 root channels and the (joints_num - 1) * 3 position channels.
  start_indx = 1 + 2 + 1 + (joints_num - 1) * 3
  end_indx = start_indx + (joints_num - 1) * 6
  # Only the rotation channels of the non-root joints. (A stray
  # `cont6d_params = data[..., 1:]` used to overwrite this slice, feeding the
  # wrong channels into the reshape below.)
  cont6d_params = data[..., start_indx:end_indx]
  # Prepend the root rotation so there is one 6D rotation per joint.
  cont6d_params = torch.cat([r_rot_cont6d, cont6d_params], dim=-1)
  # Leading dimensions are flattened into a single one here.
  cont6d_params = cont6d_params.view(-1, joints_num, 6)

  positions = skeleton.forward_kinematics_cont6d(cont6d_params, r_pos)

  return positions


def recover_from_ric(data, joints_num):
  """Rebuild joint positions from the root-relative position part of a feature tensor.

  Args:
    data: torch tensor indexed as [..., frame, feature]; channels
      4 : 4 + (joints_num - 1) * 3 hold the root-relative positions of the
      non-root joints.
    joints_num: number of joints in the skeleton, including the root.

  Returns:
    Tensor indexed as [..., frame, joint, xyz] with the root as joint 0.
  """
  root_quat, root_pos = recover_root_rot_pos(data)

  ric_end = (joints_num - 1) * 3 + 4
  local_joints = data[..., 4:ric_end]
  local_joints = local_joints.view(local_joints.shape[:-1] + (-1, 3))

  # Apply the inverse root rotation to every joint of every frame.
  inv_root_quat = qinv(root_quat[..., None, :]).expand(local_joints.shape[:-1] + (4,))
  joints = qrot(inv_root_quat, local_joints)

  # Add the root's x/z position back; the y channel is left as stored.
  joints[..., 0] += root_pos[..., 0:1]
  joints[..., 2] += root_pos[..., 2:3]

  # Put the root itself in front as joint 0.
  return torch.cat([root_pos.unsqueeze(-2), joints], dim=-2)

def process_file(positions, feet_thre):
    """Convert one joint-position clip into per-frame feature vectors.

    Relies on notebook globals set in the "Run motion_representation" cell:
    `tgt_offsets`, `fid_l`, `fid_r`, `face_joint_indx`, `n_raw_offsets` and
    `kinematic_chain`.

    Args:
        positions: numpy array of joint positions indexed as [frame, joint, xyz].
        feet_thre: threshold on the squared frame-to-frame displacement of a
            foot joint; below it the joint is flagged as being in contact.

    Returns:
        Tuple `(data, global_positions, positions, l_velocity)`:
        data: feature array with one row per frame transition (one fewer than
            the input frames), concatenated in this order: root rotation
            velocity (1), root x/z velocity (2), root height (1), root-relative
            joint positions, continuous-6D joint rotations, per-joint
            velocities, left-foot contacts, right-foot contacts.
        global_positions: copy of the positions after retargeting, floor
            alignment and x/z centring.
        positions: the root-relative positions returned by `get_rifke`.
        l_velocity: root x/z velocity per frame transition.
    """
    # floor
    # Retarget onto the reference skeleton, then put the lowest point of the clip at y == 0.
    positions = uniform_skeleton(positions, tgt_offsets)
    floor_height = positions.min(axis=0).min(axis=0)[1]
    positions[:, :, 1] -= floor_height

    # XZ at origin
    # Subtract the first frame's root x/z from every joint; y is untouched.
    root_pos_init = positions[0]
    positions = positions - root_pos_init[0] * np.array([1, 0, 1])

    # NOTE: the block below (rotating the clip so the first frame faces Z+) is
    # disabled, so clips keep their original facing direction.
    # Z+
    # Unpack face joint indices
    # r_hip, l_hip, sdr_r, sdr_l = face_joint_indx

    # Calculate the direction across the hips and shoulders
    # across1 = root_pos_init[r_hip] - root_pos_init[l_hip]
    # across2 = root_pos_init[sdr_r] - root_pos_init[sdr_l]
    # across = (across1 + across2) / np.sqrt((across1 + across2) ** 2).sum(axis=-1)[..., np.newaxis]

    # # Calculate forward direction
    # forward_init = np.cross(np.array([[0, 1, 0]]), across, axis=-1)
    # forward_init /= np.sqrt((forward_init ** 2).sum(axis=-1))[..., np.newaxis]

    # Set the target direction
    # target = np.array([[0, 0, 1]])

    # # Calculate the initial root quaternion
    # root_quat_init = qbetween_np(forward_init, target)
    # root_quat_init = np.ones(positions.shape[:-1] + (4,)) * root_quat_init

    # Rotate positions using the root quaternion
    # positions = qrot_np(root_quat_init, positions)
    # Copy now: get_rifke() further down modifies `positions` in place.
    global_positions = positions.copy()


    def foot_detect(positions, thres):
      """Flag foot joints whose squared frame-to-frame displacement is below `thres`."""
      # Calculate feet detection thresholds
      # NOTE: heightfactor is unpacked here but never used below.
      velfactor, heightfactor = np.array([thres, thres]), np.array([3.0, 2.0])

      # Calculate left feet detection
      feet_l_x = (positions[1:, fid_l, 0] - positions[:-1, fid_l, 0]) ** 2
      feet_l_y = (positions[1:, fid_l, 1] - positions[:-1, fid_l, 1]) ** 2
      feet_l_z = (positions[1:, fid_l, 2] - positions[:-1, fid_l, 2]) ** 2
      feet_l = ((feet_l_x + feet_l_y + feet_l_z) < velfactor).astype(np.float32)

      # Calculate right feet detection
      feet_r_x = (positions[1:, fid_r, 0] - positions[:-1, fid_r, 0]) ** 2
      feet_r_y = (positions[1:, fid_r, 1] - positions[:-1, fid_r, 1]) ** 2
      feet_r_z = (positions[1:, fid_r, 2] - positions[:-1, fid_r, 2]) ** 2
      feet_r = ((feet_r_x + feet_r_y + feet_r_z) < velfactor).astype(np.float32)

      return feet_l, feet_r

    feet_l, feet_r = foot_detect(positions, feet_thre)
    # Placeholder: get_rifke() reads r_rot from this scope; the real value is
    # assigned by the get_cont6d_params() call below, before get_rifke() runs.
    r_rot = None

    def get_rifke(positions):
      """Make positions root-relative in x/z (in place) and apply the root rotation `r_rot`."""
      positions[..., 0] -= positions[:, 0:1, 0]
      positions[..., 2] -= positions[:, 0:1, 2]
      positions = qrot_np(np.repeat(r_rot[:, None], positions.shape[1], axis=1), positions)
      return positions

    def get_cont6d_params(positions):
      """Return 6D joint rotations, root angular/linear velocity and the root rotation."""
      skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu")
      quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=False)

      # Quaternion to continuous 6D
      cont_6d_params = quaternion_to_cont6d_np(quat_params)
      r_rot = quat_params[:, 0].copy()

      # Root Linear Velocity
      velocity = (positions[1:, 0] - positions[:-1, 0]).copy()
      velocity = qrot_np(r_rot[1:], velocity)

      # Root Angular Velocity
      r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1]))

      return cont_6d_params, r_velocity, velocity, r_rot

    cont_6d_params, r_velocity, velocity, r_rot = get_cont6d_params(positions)
    positions = get_rifke(positions)

    # Root height
    root_y = positions[:, 0, 1:2]

    # Root rotation and linear velocity
    # Component 2 of the relative root quaternion is turned into an angle; only x/z of the velocity are kept.
    r_velocity = np.arcsin(r_velocity[:, 2:3])
    l_velocity = velocity[:, [0, 2]]
    root_data = np.concatenate([r_velocity, l_velocity, root_y[:-1]], axis=-1)

    # Get Joint Rotation Representation
    # Root (joint 0) is excluded; its rotation is already covered by root_data.
    rot_data = cont_6d_params[:, 1:].reshape(len(cont_6d_params), -1)

    # Get Joint Rotation Invariant Position Representation
    ric_data = positions[:, 1:].reshape(len(positions), -1)

    # Get Joint Velocity Representation
    local_vel = qrot_np(np.repeat(r_rot[:-1, None], global_positions.shape[1], axis=1),
                        global_positions[1:] - global_positions[:-1])
    local_vel = local_vel.reshape(len(local_vel), -1)

    # Combine all representations into a single data array
    # Per-frame quantities drop their last frame so they line up with the frame-difference features.
    data = np.concatenate([root_data, ric_data[:-1], rot_data[:-1], local_vel, feet_l, feet_r], axis=-1)

    return data, global_positions, positions, l_velocity

In [None]:
#@title Run motion_representation

# Clip whose first frame defines the target skeleton every other clip is retargeted to.
example_id = "000000"
# Foot joint indices used by foot_detect() inside process_file (right, left).
fid_r, fid_l = [8, 11], [7, 10]
# Joint indices handed to Skeleton.inverse_kinematics_np; the commented-out code in
# process_file unpacks them as r_hip, l_hip, sdr_r, sdr_l.
face_joint_indx = [2, 1, 17, 16]
r_hip, l_hip = 2, 1
# Only the first 22 joints of each source clip are used.
joints_num = 22

joints_dir = f'{data_dir}/joints/'
new_joints_dir = f'{data_dir}/new_joints/'
new_joint_vecs_dir = f'{data_dir}/new_joint_vecs/'

os.makedirs(new_joints_dir, exist_ok=True)
os.makedirs(new_joint_vecs_dir, exist_ok=True)

# NOTE: this rebinds the notebook-level `kinematic_chain` used by the visualisation cell.
n_raw_offsets = torch.from_numpy(t2m_raw_offsets)
kinematic_chain = t2m_kinematic_chain

# Get offsets of target skeleton
example_data = np.load(os.path.join(joints_dir, example_id + '.npy'))
example_data = example_data.reshape(len(example_data), -1, 3)
example_data = torch.from_numpy(example_data)
tgt_skel = Skeleton(n_raw_offsets, kinematic_chain, 'cpu')
tgt_offsets = tgt_skel.get_offsets_joints(example_data[0])

source_list = sorted(os.listdir(joints_dir))
frame_num = 0

# For every clip: build the feature vectors, rebuild joint positions from them,
# and save both under the clip's original file name.
for source_file in tqdm(source_list):
    source_data = np.load(os.path.join(joints_dir, source_file))[:, :joints_num]
    try:
      data, ground_positions, positions, l_velocity = process_file(source_data, 0.002)
      rec_ric_data = recover_from_ric(torch.from_numpy(data).unsqueeze(0).float(), joints_num)
      np.save(pjoin(new_joints_dir, source_file), rec_ric_data.squeeze().numpy())
      np.save(pjoin(new_joint_vecs_dir, source_file), data)
      frame_num += data.shape[0]
    except Exception as e:
        # Best effort: report the failing clip and carry on with the next one.
        print(source_file)
        print(e)

# "Total clips" counts every file in joints_dir, including any that failed above;
# the duration assumes 20 fps.
print('Total clips: %d, Frames: %d, Duration: %fm' % (len(source_list), frame_num, frame_num / 20 / 60))

100%|██████████| 56/56 [00:03<00:00, 15.51it/s]

Total clips: 56, Frames: 2304, Duration: 1.920000m





In [None]:
# @title mean_variance code
import numpy as np
import sys
import os
from os.path import join as pjoin
from tqdm import tqdm

# Info:
# root_rot_velocity (B, seq_len, 1)
# root_linear_velocity (B, seq_len, 2)
# root_y (B, seq_len, 1)
# ric_data (B, seq_len, (joint_num - 1)*3)
# rot_data (B, seq_len, (joint_num - 1)*6)
# local_velocity (B, seq_len, joint_num*3)
# foot contact (B, seq_len, 4)

def mean_variance(data_dir, save_dir, joints_num):
    """Compute per-feature mean and group-averaged std over all clips in `data_dir`.

    Every file in `data_dir` is loaded with `np.load`; files containing NaN are
    printed and left out. The std is replaced, per feature group (root rotation
    velocity, root linear velocity, root height, joint positions, joint
    rotations, joint velocities, foot contacts), by the mean std of that group.
    Results are written to `Mean.npy` and `Std.npy` in `save_dir`.

    Args:
        data_dir: directory holding the per-clip feature arrays.
        save_dir: directory the two statistics files are written to.
        joints_num: number of joints, used to locate the feature-group boundaries.

    Returns:
        (Mean, Std) as numpy arrays with one entry per feature.

    Raises:
        AssertionError: if the feature width does not match `joints_num`.
    """
    clips = []
    for file in tqdm(os.listdir(data_dir), desc='reading files'):
        clip = np.load(pjoin(data_dir, file))
        if not np.isnan(clip).any():
            clips.append(clip)
        else:
            print(file)

    data = np.concatenate(clips, axis=0)
    print(data.shape)
    Mean = data.mean(axis=0)
    Std = data.std(axis=0)

    # End offsets of the position, rotation and velocity groups; the foot
    # contacts run from vel_end to the end of the vector.
    ric_end = 4 + (joints_num - 1) * 3
    rot_end = 4 + (joints_num - 1) * 9
    vel_end = rot_end + joints_num * 3
    group_bounds = [0, 1, 3, 4, ric_end, rot_end, vel_end, None]
    for start, stop in zip(group_bounds[:-1], group_bounds[1:]):
        Std[start:stop] = Std[start:stop].mean()

    # The layout must end with exactly four foot-contact channels.
    assert vel_end + 4 == Std.shape[-1]

    np.save(pjoin(save_dir, 'Mean.npy'), Mean)
    np.save(pjoin(save_dir, 'Std.npy'), Std)

    return Mean, Std

In [None]:
# Compute the dataset statistics; Mean.npy and Std.npy are written into data_dir.
new_joint_vecs_dir = f'{data_dir}/new_joint_vecs/'
mean, std = mean_variance(new_joint_vecs_dir, data_dir, 22)

# See if you're on the right track:
# These paths are relative to the current working directory (the Imports cell runs
# `%cd HumanML3D`), so they presumably resolve inside the cloned repository — TODO confirm.
HML3D_mean = np.load('HumanML3D/Mean.npy')
HML3D_std = np.load('HumanML3D/Std.npy')
# Rough sanity check only: compares the average of the mean vectors and the
# spread (np.std) of the std vectors, not the vectors element by element.
print(f'Mean mine: {np.mean(mean)}, HML3D: {np.mean(HML3D_mean)}')
print(f'Std mine: {np.std(std)}, HML3D: {np.std(HML3D_std)}')

reading files: 100%|██████████| 56/56 [00:00<00:00, 597.17it/s]

(2304, 263)
Mean mine: 0.17349901795387268, HML3D: 0.1882529854774475
Std mine: 0.15011994540691376, HML3D: 0.12204898148775101





# TODO: train/test split