<center>

*******************************************************************************************
    
### Motion Tracking in Video Files with AlphaPose


##### 29 January 2025

##### Juan Ignacio Mendoza Garay  

*******************************************************************************************

</center>

##### INFORMATION:

* Extract one point of the body (in this demonstration the nose), for every tracked person in the picture. Then, get rid of extraneous data, interpolate missing data, and rearrange the tabular data. The resulting table has columns {p1_x,p1_y,p2_x,p2_y,...} where p is a tracked person from left to right, and {x,y} are horizontal and vertical coordinates of the point.

* Tested using:

    * AlphaPose
        * Fork: https://github.com/juigmend/Alpha_Pose
    * Python 3.11
    * Windows 11 operating system
    * Intel 64-bit CPU
>
* Dependencies:

    * Youtube video downloader: https://pypi.org/project/yt-dlp/
    * cython_bbox:
        1) install Desktop Development with C++ from the Visual Studio Installer
        2) type in the command prompt: \
           set DISTUTILS_USE_SDK=1 
        3) install cython_bbox (e.g., using pip, conda, or other method)
    * Other packages might be prompted for installation.
>
* Instructions:

    Edit the values indicated with an arrow like this: <---  
    Comment/uncomment or change values as suggested by the comments.  
    Run the program, close your eyes and hope for the best.  

*******************************************************************************************

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
from os.path import exists, basename, splitext, isfile, isdir, join
import time
from datetime import timedelta
import io
import base64
import subprocess

***
#### Set paths and parameters:

In [69]:
# User settings for the whole notebook. Lines marked with "<---" are meant to be edited.
# Paths are written as raw strings with Windows separators; the sub-folders of
# logs_path are built by string concatenation.
AP_code_path = r"C:\Users\Gabriel_Rorke\Ghosts\Python\AlphaPose" # <--- folder of AlphaPose code

video_in_folder =  r"C:\Users\Gabriel_Rorke\Ghosts\Borodin_Quartet\input_video" 
video_in_path = video_in_folder # <--- path for input video file or folder with input video files

logs_path = r"C:\Users\Gabriel_Rorke\Ghosts\Borodin_Quartet\pose_tracking_results" # <--- folder for log files
video_out_path = logs_path+r'\video' # <--- folder for resulting AlphaPose video files (None to not save video)

json_path = logs_path+r'\tracking'        # <--- folder for resulting AlphaPose tracking files
figures_path = logs_path+r'\figures'      # <--- folder for data inspection files
preproc_path = logs_path+r'\preprocessed' # <--- folder for preprocessed data files

fps = 29.97   # <--- fps (frames per second of input video)
n_persons = 4 # <--- expected number of individuals to be tracked
series_selection = [0,1] # <--- x and y of one point ([x1,y1] for "Nose", assuming COCO format)

overwrite_results = False # <--- recompute tracking (False will skip already processed videos)

# The figure options accept three values: True saves the figure and leaves it open,
# 'concealed' saves the figure and closes it, False does not make the figure.
save_parquet = True # <--- save pre-processed data to parquet file
save_raw_fig = 'concealed' # <--- save raw data figure: True, False or 'concealed'
save_pp_fig = 'concealed'  # <--- save preprocessed data figure: True, False or 'concealed'
save_AP_log = True  # <--- save pose tracking log
save_pp_log = True  # <--- save preprocessing log
verbose = False     # <--- display information and warnings

t_range = 'all'  # <--- list of time range to plot in frames (fps*seconds = frames) or 'all'
markersize = 0.8 # <--- marker size for plots
linewidth = 2    # <--- line width for plots

***
#### Pre-trained models:

In [65]:
# Most probably there is no need to alter the following:
yolo_pretrained_model_path = AP_code_path + r'\detector\yolo\data\yolov3-spp.weights'
pretrained_model_path = AP_code_path + r'\pretrained_models\fast_421_res152_256x192.pth'
pretrained_model_config_path = AP_code_path + r'\configs\coco\resnet\256x192_res152_lr1e-3_1x-duc.yaml'
tracker_weights_path = AP_code_path + r'\trackers\weights\osnet_ain_x1_0_msmt17_256x128_amsgrad_ep50_lr0.0015_coslr_b64_fb10_softmax_labsmth_flip_jitter.pth'

# OBJECT DETECTION:
if not exists(yolo_pretrained_model_path):
    ! mkdir {AP_code_path}\detector\yolo\data
    ! gdown -O {yolo_pretrained_model_path} https://drive.google.com/uc?id=1D47msNOOiJKvPOXlnpyzdKA3k6E97NTC

# POSE DETECTION:
if not exists(pretrained_model_path):
    ! gdown -O {pretrained_model_path} https://drive.google.com/uc?id=1kfyedqyn8exjbbNmYq8XGd2EooQjPtF9 # Fast Pose (DUC)
# Documentation: https://github.com/MVIG-SJTU/AlphaPose/blob/master/docs/MODEL_ZOO.md

# POSE TRACKING:
if not exists(tracker_weights_path):
    ! mkdir {AP_code_path}\trackers\weights
    ! gdown -O {tracker_weights_path} https://drive.google.com/uc?id=1myNKfr2cXqiHZVXaaG8ZAq_U2UpeOLfG # Human-ReID
# Documentation: https://github.com/MVIG-SJTU/AlphaPose/tree/master/trackers

***
#### Get filenames:

In [66]:
# Build two parallel lists for the input videos:
#   ffn_lst : full paths of the video files
#   fn_lst  : file names (with extension) of the same files
if isdir(video_in_path):
    ffn_lst = []
    fn_lst = []
    # Sorted so that the processing order does not depend on the file system:
    for fn in sorted(listdir(video_in_path)):
        ffn = join(video_in_path, fn)
        if isfile(ffn): # sub-folders are ignored
            ffn_lst.append( ffn )
            fn_lst.append( fn )

elif isfile(video_in_path):
    ffn_lst = [video_in_path]
    # The extension is kept, as in the folder case; it is removed later where needed.
    fn_lst = [basename(video_in_path)]

else:
    # Fail here with a clear message instead of a NameError further down:
    raise FileNotFoundError(f'input video file or folder not found: {video_in_path}')

n_files = len(ffn_lst)

# Names of tracking files that already exist, used to skip processed videos.
# On a first run the tracking folder might not exist yet, hence the empty list.
if not overwrite_results:
    json_saved_fn = listdir(json_path) if isdir(json_path) else []

***
#### Run AlphaPose:

In [67]:
save_video_str = ''
if video_out_path:
    save_video_str = f'--visoutdir {video_out_path} --save_video '

for ffn, fn in zip(ffn_lst,fn_lst):

    fn_ne = fn.split('.')[0]
    
    new_file = True
    if not overwrite_results:
        json_fn = f'alphapose-results_{fn_ne}.json'
        new_file = json_fn not in json_saved_fn

    if overwrite_results or new_file:

        if verbose: print(f'AlphaPose - processing {fn}')
        
        if save_AP_log: 
            AP_log_txt = [fn + '\n']
            tic = time.time()

        # AlphaPose:
        ! cd {AP_code_path} && python scripts\demo_inference.py --sp --video {ffn} \
        --jsonoutdir {json_path} {save_video_str} --checkpoint {pretrained_model_path} \
        --cfg {pretrained_model_config_path} --pose_track --suffix {fn_ne}
    
        # Save pose tracking log:
        if save_AP_log: 
            AP_log_txt.append(f"toc = {timedelta(seconds = time.time() - tic)} (H:M:S)\n")
            txtlog_ffn = logs_path + '\\' + 'AP_log.txt'
            AP_log_txt.append('\n')
            with open(txtlog_ffn, 'a') as output:
                for t in AP_log_txt:
                    output.write(t)

***
#### Pre-processing:

In [68]:
# Pre-process the tracking file (JSON) produced by AlphaPose for every input video:
#   1) keep only the selected point (x,y) of every tracked person,
#   2) inspect the raw data (optional figure and log),
#   3) rearrange so that each row is a video frame, and interpolate missing data,
#   4) order the persons from left to right and label columns {0_x,0_y,1_x,1_y,...},
#   5) save log, figure and parquet file (all optional).
#
# A video is skipped when overwrite_results is False and its pre-processed parquet
# file already exists. The check is made on the pre-processed file, not on the
# tracking file, because the tracking file always exists once AlphaPose has run.

series_labels = ['x','y']

# Everything below assumes exactly one point with two dimensions:
if len(series_selection) != len(series_labels):
    raise Exception(''.join(['only one point with two dimensions (x,y) allowed, ',
                             f'but instead got this: {series_selection}']))

n_series = len(series_labels)
persons_range = range(1,n_persons+1) # AlphaPose idx of the expected persons (starts at 1)

for fn in fn_lst:

    fn_ne = fn.split('.')[0] # file name with no extension
    json_ffn = json_path + '\\' + f'alphapose-results_{fn_ne}.json'
    AP_pp_ffn = preproc_path + '\\' + fn_ne + '.parquet'

    if not overwrite_results and exists(AP_pp_ffn):
        continue

    # Load data from JSON file produced by AlphaPose:
    data_raw_df = pd.read_json(json_ffn)

    # Reduce by removing unnecessary data:
    data_red_df = data_raw_df.drop(['category_id','keypoints','score','box'],axis=1)
    for lbl,i in zip(series_labels,series_selection):
        data_red_df[lbl] = data_raw_df.keypoints.str[i]
    # image_id is like "123.jpg", only the frame number is kept:
    data_red_df['image_id'] = data_red_df.image_id.str.split('.').str[0].astype(int)
    n_frames_video = int(data_red_df.image_id.max()) + 1

    # Time range for this video. A local variable is used so that the range of
    # one video is not carried over to the next one. 'all' includes the last frame.
    if t_range == 'all':
        t_lim = [0,n_frames_video]
    else:
        t_lim = list(t_range)

    pp_log_txt = [fn_ne + '\n'] # written to file only if save_pp_log

    # Inspect and make plot of raw data:
    if save_raw_fig: plt.figure() # new figure, so that plots never overlap
    for i_s,lbl in enumerate(series_labels):
        if save_raw_fig: plt.subplot(n_series,1,i_s+1)
        n_frames = [] # amount of tracked frames for each person
        for i_p in persons_range:
            data_red_slice_df =\
                data_red_df[lbl][  (data_red_df.idx == i_p)
                                 & (data_red_df.image_id >= t_lim[0])
                                 & (data_red_df.image_id < t_lim[1]) ]
            if save_raw_fig: data_red_slice_df.plot(linewidth=linewidth)
            n_frames.append(len(data_red_slice_df))
        if save_raw_fig:
            data_red_df[lbl].plot( marker='.', linestyle='none',
                                   markersize=markersize, color='k')
            plt.ylabel(lbl)
            if i_s == 0:
                plt.legend( list(persons_range)+['all'],loc='upper right',
                            bbox_to_anchor=(1.2, 1.02) )
        if len(set(n_frames)) > 1: # not all persons have the same amount of frames
            warning_frames = f'inconsistent frame count in {lbl} {tuple(n_frames)}'
            pp_log_txt.append( warning_frames+'\n' )
            if verbose: print('Warning:',warning_frames)
    if save_raw_fig:
        plt.gcf().suptitle(fn_ne+'\nRaw Data')
        plt.gcf().supxlabel('stacked frames (as in json file)')
        plt.tight_layout()
        fig_ffn = figures_path + '\\' + fn_ne + '_RAW.png'
        plt.savefig(fig_ffn)
        if save_raw_fig == 'concealed': plt.close(plt.gcf())

    # Compare the largest idx given by the tracker with the expected number of persons:
    idx_max = data_red_df.idx.max()
    warning_idx = None
    if idx_max > n_persons:
        warning_idx = 'more idx than number of people in raw data'
    elif idx_max < n_persons:
        warning_idx = 'fewer idx than number of people in raw data'
    if warning_idx:
        if verbose: print('Warning:',warning_idx)
        pp_log_txt.append(warning_idx+'\n')

    # Rearrange such that each row is a frame (image_id), and fill missing data:
    # Columns are explicitly named {x_1,y_1,x_2,y_2,...} before merging.
    data_rar_df = pd.DataFrame( list(range(n_frames_video)) , columns=['image_id'])
    for i in persons_range:
        person_df = data_red_df[['image_id']+series_labels][(data_red_df.idx == i)]
        person_df = person_df.rename(columns={lbl: f'{lbl}_{i}' for lbl in series_labels})
        data_rar_df = data_rar_df.merge( person_df, on='image_id', how='left' )
    data_rar_df = data_rar_df.drop(['image_id'],axis=1)
    # Interpolation is done regardless of the logging and verbosity settings:
    if data_rar_df.isnull().values.any():
        data_rar_df = data_rar_df.interpolate(limit_direction='both')
        warning_interp = 'missing raw data have been interpolated'
        if verbose: print('Warning:',warning_interp)
        pp_log_txt.append(warning_interp+'\n')

    # Re-order and re-label columns in order from left to right as they appear in the image:
    # It is assumed that the persons don't relocate (e.g. they are sitting or standing in one place).
    # Indices are set to start at 0 to be consistent with Python indexing.
    x_lbl, y_lbl = series_labels
    x_medians = data_rar_df[[f'{x_lbl}_{i}' for i in persons_range]].median()
    new_order_xy = []
    new_order_lbl = []
    for i_c,x_col in enumerate(x_medians.sort_values().index):
        new_order_xy += [ x_col, x_col.replace(x_lbl,y_lbl,1) ]
        new_order_lbl += [ f'{i_c}_x', f'{i_c}_y' ]
    data_rar_df = data_rar_df.reindex(new_order_xy, axis=1)
    data_rar_df.columns = new_order_lbl

    # Save preprocessing log:
    if save_pp_log:
        txtlog_ffn = logs_path + '\\' + 'pp_log.txt'
        pp_log_txt.append('\n')
        with open(txtlog_ffn, 'a') as output:
            output.writelines(pp_log_txt)

    # Make plot of pre-processed data:
    if save_pp_fig:
        plt.figure() # new figure, separate from the raw data figure
        for i_s,lbl in enumerate(series_labels):
            plt.subplot(n_series,1,i_s+1)
            legend = []
            for n in range(n_persons):
                data_rar_slice_df = data_rar_df[f'{n}_{lbl}'].iloc[ t_lim[0] : t_lim[1] ]
                data_rar_slice_df.plot(linewidth=linewidth)
                legend.append(str(n))
            plt.ylabel(lbl)
            if i_s == 0:
                plt.legend(legend,loc='upper right', bbox_to_anchor=(1.2, 1.02))
        plt.suptitle(fn_ne+'\nPre-processed Data')
        plt.xlabel('time (video frames)')
        plt.tight_layout()
        fig_ffn = figures_path + '\\' + fn_ne + '_PP.png'
        plt.savefig(fig_ffn)
        if save_pp_fig == 'concealed': plt.close(plt.gcf())

    # Write pre-processed data to a file:
    if save_parquet:
        data_rar_df.to_parquet(AP_pp_ffn)