Import libraries

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
from pathlib import Path
from scipy.spatial.distance import pdist, squareform
import string

Create an empty dataframe for each feature type (distances, areas, angles)

In [None]:
# One independent, empty accumulator frame per feature family
# (pairwise distances, triangle areas, triangle angles).
df_distance, df_areas, df_angle = (pd.DataFrame() for _ in range(3))

Read in the large file (after imputing missing values)

In [None]:
# Load the imputed tracking table; the first CSV row supplies the column names.
# NOTE(review): the path points at the filesystem root — presumably a
# placeholder that has to be replaced with the real file location.
df_complete = pd.read_csv(filepath_or_buffer="/imputed_bigdf.csv", header=0)
print(df_complete.shape)

Get the coordinate columns and the body-part names

In [None]:
# Candidate feature columns: everything except the first and the last column
# of the table (the comment in the original cell says the last one is the
# video name).
headertotal = df_complete.columns.tolist()[1:-1]

# Columns whose name contains the letter 'x' / 'y'.
# NOTE(review): this is a substring test, so a column such as "eye.x" also
# lands in the y-list — confirm the naming scheme makes this unambiguous.
coordinates_columnx = [col for col in headertotal if 'x' in col]
coordinates_columny = [col for col in headertotal if 'y' in col]

# Interleave the two lists position by position: x0, y0, x1, y1, ...
coordinates_columns = []
for x_col, y_col in zip(coordinates_columnx, coordinates_columny):
    coordinates_columns.extend((x_col, y_col))

# Body-part name of each (x, y) pair: keep the characters of the y-column name
# that also occur in the x-column name, then cut everything from the first '.'.
bodyparts_names = []
for x_col, y_col in zip(coordinates_columns[0::2], coordinates_columns[1::2]):
    shared_chars = "".join(ch for ch in y_col if ch in x_col)
    bodyparts_names.append(shared_chars.split('.')[0])

Build the column header for the pairwise-distance features

In [None]:
# One column name per unordered pair of body parts.
# combinations() yields (0,1), (0,2), ..., (1,2), ... which is the same order
# in which scipy's pdist returns its condensed distance vector, so the header
# lines up with the data produced by pairwise_distances().
# The former `i != 5` filter was removed: it silently dropped every pair that
# starts at body part 5, which makes the header shorter than the pdist output
# as soon as there are more than six body parts.
distance_columns = [
    'Dist_' + first_part + '_' + second_part
    for first_part, second_part in combinations(bodyparts_names, 2)
]
print(distance_columns)

Pairwise distance

In [None]:
def pairwise_distances(df, coord_cols=None, dist_cols=None):
    """Compute the Euclidean distance between every pair of points, per row.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per timepoint. Must contain every column in ``coord_cols``;
        other columns are ignored.
    coord_cols : sequence of str, optional
        Coordinate columns ordered x0, y0, x1, y1, ... Defaults to the
        notebook-level ``coordinates_columns``.
    dist_cols : sequence of str, optional
        Names of the output columns, one per point pair in ``pdist`` order
        ((0,1), (0,2), ..., (1,2), ...). Defaults to the notebook-level
        ``distance_columns``.

    Returns
    -------
    pandas.DataFrame
        One column per point pair, indexed like ``df``.

    Raises
    ------
    ValueError
        If a coordinate cannot be parsed as a number (``errors='raise'``), or
        if pandas rejects ``dist_cols`` because its length does not match the
        number of point pairs.
    """
    coord_cols = coordinates_columns if coord_cols is None else list(coord_cols)
    dist_cols = distance_columns if dist_cols is None else list(dist_cols)

    # Convert the whole coordinate block to floats once, instead of running
    # pd.to_numeric element by element inside a per-row loop.
    coord_values = (
        df.loc[:, coord_cols]
        .apply(pd.to_numeric, errors='raise')
        .to_numpy(dtype=float)
    )

    # Each row is x0, y0, x1, y1, ...; reshape to (n_points, 2) and let pdist
    # return the condensed vector of all pairwise distances for that timepoint.
    distances = [
        pdist(row_values.reshape(-1, 2), metric='euclidean')
        for row_values in coord_values
    ]

    distances_df = pd.DataFrame(distances, columns=dist_cols)
    distances_df.index = df.index
    return distances_df

In [None]:
# Compute the pairwise distances video by video and stack the results.
# The per-video frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration. Assigning the result
# (instead of appending onto a frame created in another cell) also makes this
# cell safe to re-run.
distance_frames = []
for vid in df_complete['videoname'].unique():
    df_dist = pairwise_distances(df_complete.loc[df_complete['videoname'] == vid])
    df_dist['videoname'] = vid
    distance_frames.append(df_dist)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_distance = pd.concat(distance_frames) if distance_frames else pd.DataFrame()

In [None]:
# Per-video mean of every distance feature.
dist_grouped = df_distance.groupby(by='videoname')
dist_average = dist_grouped.mean()

In [None]:
def triangle_areas(df, coord_cols=None, part_names=None):
    """Compute the area of every triangle spanned by three points, per row.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per timepoint. Must contain every column in ``coord_cols``;
        other columns are ignored.
    coord_cols : sequence of str, optional
        Coordinate columns ordered x0, y0, x1, y1, ... Defaults to the
        notebook-level ``coordinates_columns``.
    part_names : sequence of str, optional
        Name of each point, in the same order as the (x, y) pairs. Defaults to
        the notebook-level ``bodyparts_names``.

    Returns
    -------
    pandas.DataFrame
        One ``Area_<p1>_<p2>_<p3>`` column per 3-combination of points, indexed
        like ``df``. The labels are passed as ``columns=[header]``, i.e. as a
        single-level MultiIndex whose entries are 1-tuples; the notebook's
        renaming cell takes element ``[0]`` of each label, so that shape is
        kept on purpose.

    Raises
    ------
    ValueError
        If a coordinate cannot be parsed as a number (``errors='raise'``) or
        the coordinate columns do not form complete (x, y) pairs.
    """
    coord_cols = coordinates_columns if coord_cols is None else list(coord_cols)
    part_names = bodyparts_names if part_names is None else list(part_names)

    # Derive the point count from the coordinates that are actually used
    # rather than from the total width of df, which depends on how many
    # unrelated columns the table happens to carry.
    num_points = len(coord_cols) // 2
    triplets = list(combinations(range(num_points), 3))

    header = [
        'Area_' + part_names[first] + '_' + part_names[second] + '_' + part_names[third]
        for first, second, third in triplets
    ]

    # (n_rows, n_points, 2) array of floats, converted once for the whole frame.
    coords = (
        df.loc[:, coord_cols]
        .apply(pd.to_numeric, errors='raise')
        .to_numpy(dtype=float)
        .reshape(len(df), num_points, 2)
    )

    # Gather the three corner points of every triangle for all rows at once;
    # each of the arrays below has shape (n_rows, n_triangles).
    triplet_idx = np.array(triplets, dtype=int).reshape(-1, 3)
    x1, y1 = coords[:, triplet_idx[:, 0], 0], coords[:, triplet_idx[:, 0], 1]
    x2, y2 = coords[:, triplet_idx[:, 1], 0], coords[:, triplet_idx[:, 1], 1]
    x3, y3 = coords[:, triplet_idx[:, 2], 0], coords[:, triplet_idx[:, 2], 1]

    # Shoelace formula for the area of a triangle.
    areas = 0.5 * np.abs(x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2))

    areas_df = pd.DataFrame(areas, columns=[header])
    areas_df.index = df.index
    return areas_df

In [None]:
# Compute the triangle areas video by video and stack the results.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies the accumulated
# frame on every iteration.
area_frames = []
for vid in df_complete['videoname'].unique():
    df_areas_inter = triangle_areas(df_complete.loc[df_complete['videoname'] == vid])
    df_areas_inter['videoname'] = vid
    area_frames.append(df_areas_inter)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_areas = pd.concat(area_frames) if area_frames else pd.DataFrame()

In [None]:
# Flatten the column labels of the area frame: every label is replaced by its
# first element, and the result is stored on a copy so df_areas stays untouched.
df_renamed = df_areas.copy()
areas_header_new = [label[0] for label in df_areas.columns.values]
df_renamed.columns = areas_header_new

In [None]:
# Per-video mean of every triangle-area feature.
areas_grouped = df_renamed.groupby(by='videoname')
areas_average = areas_grouped.mean()

In [None]:
def triangle_angles(df, coord_cols=None, part_names=None):
    """Compute the three interior angles of every point triangle, per row.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per timepoint. Must contain every column in ``coord_cols``;
        other columns are ignored.
    coord_cols : sequence of str, optional
        Coordinate columns ordered x0, y0, x1, y1, ... Defaults to the
        notebook-level ``coordinates_columns``.
    part_names : sequence of str, optional
        Name of each point, in the same order as the (x, y) pairs. Defaults to
        the notebook-level ``bodyparts_names``.

    Returns
    -------
    pandas.DataFrame
        Angles in radians, indexed like ``df``. For every 3-combination
        (A, B, C) there are three consecutive columns ``Angle_B_A_C``,
        ``Angle_A_B_C`` and ``Angle_A_C_B``; the middle name is the vertex.
        The labels are passed as ``columns=[header]``, i.e. as a single-level
        MultiIndex whose entries are 1-tuples; the notebook's renaming cell
        takes element ``[0]`` of each label, so that shape is kept on purpose.
        An angle is NaN when two of the three points coincide (0/0).

    Raises
    ------
    ValueError
        If a coordinate cannot be parsed as a number (``errors='raise'``) or
        the coordinate columns do not form complete (x, y) pairs.
    """
    coord_cols = coordinates_columns if coord_cols is None else list(coord_cols)
    part_names = bodyparts_names if part_names is None else list(part_names)

    # Derive the point count from the coordinates that are actually used
    # rather than from the total width of df.
    num_points = len(coord_cols) // 2
    triplets = list(combinations(range(num_points), 3))

    header = []
    for first, second, third in triplets:
        name_a, name_b, name_c = part_names[first], part_names[second], part_names[third]
        header.append('Angle_' + name_b + '_' + name_a + '_' + name_c)  # angle at A (BAC)
        header.append('Angle_' + name_a + '_' + name_b + '_' + name_c)  # angle at B (ABC)
        header.append('Angle_' + name_a + '_' + name_c + '_' + name_b)  # angle at C (ACB)

    # (n_rows, n_points, 2) array of floats, converted once for the whole frame.
    coords = (
        df.loc[:, coord_cols]
        .apply(pd.to_numeric, errors='raise')
        .to_numpy(dtype=float)
        .reshape(len(df), num_points, 2)
    )

    # Corner points of every triangle for all rows: (n_rows, n_triangles, 2).
    triplet_idx = np.array(triplets, dtype=int).reshape(-1, 3)
    point_a = coords[:, triplet_idx[:, 0], :]
    point_b = coords[:, triplet_idx[:, 1], :]
    point_c = coords[:, triplet_idx[:, 2], :]

    # Side lengths, each named after the vertex it lies opposite to.
    a = np.linalg.norm(point_b - point_c, axis=-1)
    b = np.linalg.norm(point_a - point_c, axis=-1)
    c = np.linalg.norm(point_a - point_b, axis=-1)

    def _angle_between(side_1, side_2, opposite):
        """Law of cosines: angle enclosed by side_1 and side_2."""
        cosine = (side_1**2 + side_2**2 - opposite**2) / (2 * side_1 * side_2)
        # Rounding can push the cosine of a (near-)collinear triple slightly
        # outside [-1, 1], where arccos would return NaN; clip it back.
        return np.arccos(np.clip(cosine, -1.0, 1.0))

    angle_a = _angle_between(b, c, a)  # BAC
    angle_b = _angle_between(a, c, b)  # ABC
    angle_c = _angle_between(a, b, c)  # ACB

    # Interleave so that the three angles of one triangle sit next to each
    # other, matching the header order built above.
    angles = np.stack([angle_a, angle_b, angle_c], axis=-1).reshape(
        len(df), 3 * len(triplets)
    )

    angles_df = pd.DataFrame(angles, columns=[header])
    angles_df.index = df.index

    return angles_df

In [None]:
# Compute the triangle angles video by video and stack the results.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies the accumulated
# frame on every iteration.
angle_frames = []
for vid in df_complete['videoname'].unique():
    df_triangle_angles = triangle_angles(df_complete.loc[df_complete['videoname'] == vid])
    df_triangle_angles['videoname'] = vid
    angle_frames.append(df_triangle_angles)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_angle = pd.concat(angle_frames) if angle_frames else pd.DataFrame()

In [None]:
# Flatten the column labels of the angle frame: every label is replaced by its
# first element, and the result is stored on a copy so df_angle stays untouched.
df_renamed_angles = df_angle.copy()
angles_header_new = [label[0] for label in df_angle.columns.values]
df_renamed_angles.columns = angles_header_new

In [None]:
# Per-video mean of every triangle-angle feature.
angle_grouped = df_renamed_angles.groupby(by='videoname')
angle_average = angle_grouped.mean()

In [None]:
# Summarise every per-frame feature per video (mean, median, variance, standard
# deviation) and collect all statistics in one wide table.

# Start from one row per video so each statistic can be attached with a left merge.
df_features = pd.DataFrame()
df_features['videoname'] = df_complete['videoname'].unique()

# Column-name prefix -> name of the groupby reduction that produces the statistic.
stat_methods = {'mean_': 'mean', 'median_': 'median', 'var_': 'var', 'std_': 'std'}

# Loop through the three feature frames and the four statistics per feature.
for feature_df in [df_distance, df_renamed, df_renamed_angles]:
    grouped = feature_df.groupby('videoname')
    for prefix, method in stat_methods.items():
        # Aggregate first and prefix the resulting feature columns afterwards.
        # The previous approach rebuilt the full header by hand and re-attached
        # 'videoname' at the end, which mislabels columns whenever 'videoname'
        # is not physically the last column of the frame.
        # Note: var/std use pandas' default ddof=1, so a video with a single
        # row yields NaN for those statistics.
        stat_df = getattr(grouped, method)().add_prefix(prefix)
        df_features = df_features.merge(stat_df, on='videoname', how='left')

Exporting files

In [None]:
# Number of columns in the combined feature table (including 'videoname').
print(df_features.shape[1])

In [None]:
# Prefix that is put in front of every output file name
# (empty string = current working directory).
filepath = ""

# Output file name -> frame to write. The first entry is the big per-video
# statistics table; the other three are the raw per-frame features, no stats.
exports = {
    "features_big_stats_df.csv": df_features,
    "distance_feature_df.csv": df_distance,
    "areas_feature_df.csv": df_renamed,
    "angle_feature_df.csv": df_renamed_angles,
}

# Write each frame without its row index.
for filename, frame in exports.items():
    frame.to_csv(filepath + filename, index=False)

In [None]:
# Sanity check: does the combined feature table contain any missing value?
# The original author's note says this displayed False (no null values).
df_features.isna().to_numpy().any()

In [None]:
# Preview the first ten rows of the combined feature table.
df_features.head(n=10)