Import libraries

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from pathlib import Path
from scipy.spatial.distance import pdist, squareform
import string

Create an empty DataFrame for each feature type (distances, areas, angles)

In [2]:
# Three independent, empty frames: one per feature family (distance, area, angle).
df_distance, df_areas, df_angle = (pd.DataFrame() for _ in range(3))

Read in the large combined file (created after imputing missing values)

In [3]:
# Load the imputed table; the first line of the CSV supplies the column names.
df_complete = pd.read_csv(
    "/imputed_bigdf.csv",  # get file path
    header=0,
)
print(df_complete.shape)

(1723689, 20)


Select the coordinate columns and derive the bodypart names

In [4]:
# Candidate columns: everything except the first and the last column of the table.
# NOTE(review): presumably the first column is a row counter and the last one is
# 'videoname' - confirm against the CSV.
headertotal = list(df_complete.columns)[1:-1]

# Columns whose name contains the letter 'x' / 'y' (plain substring match)
coordinates_columnx = [column for column in headertotal if 'x' in column]
coordinates_columny = [column for column in headertotal if 'y' in column]

# Interleave them so the list reads x0, y0, x1, y1, ...
coordinates_columns = []
for x_column, y_column in zip(coordinates_columnx, coordinates_columny):
    coordinates_columns.extend((x_column, y_column))

# Bodypart name: keep the characters of the y column name that also occur in the
# matching x column name, then cut the result at the first '.'
bodyparts_names = [
    "".join(char for char in y_column if char in x_column).split('.')[0]
    for x_column, y_column in zip(coordinates_columns[0::2], coordinates_columns[1::2])
]

Build the column headers for the pairwise-distance features

In [5]:
# One 'Dist_<a>_<b>' label per unordered pair of bodyparts, in the order
# (0,1), (0,2), ..., (1,2), ... - the same order in which scipy's pdist returns its
# condensed distances, which is what pairwise_distances() labels with this list.
# The previous version skipped every pair starting at index 5 via a hard-coded
# `i != 5`; that was harmless with six bodyparts but would silently drop labels
# (and break the column count) as soon as more bodyparts are tracked.
distance_columns = [
    'Dist_' + first_part + '_' + second_part
    for first_part, second_part in combinations(bodyparts_names, 2)
]
print(distance_columns)

['Dist_nose_earL', 'Dist_nose_earR', 'Dist_nose_schoulderL', 'Dist_nose_schoulderR', 'Dist_nose_tailbase', 'Dist_earL_earR', 'Dist_earL_schoulderL', 'Dist_earL_schoulderR', 'Dist_earL_tailbase', 'Dist_earR_schoulderL', 'Dist_earR_schoulderR', 'Dist_earR_tailbase', 'Dist_schoulderL_schoulderR', 'Dist_schoulderL_tailbase', 'Dist_schoulderR_tailbase']


Pairwise distance

In [6]:
def pairwise_distances(df):
    """Compute the Euclidean distance between every pair of bodyparts, row by row.

    Relies on two module-level lists:
    ``coordinates_columns`` (interleaved x/y column names, one pair per bodypart)
    and ``distance_columns`` (one label per bodypart pair).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``coordinates_columns``; any other
        column is ignored.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; one column per bodypart pair, labelled with
        ``distance_columns``. Pairs are ordered (0,1), (0,2), ..., (1,2), ...,
        i.e. the order of scipy's condensed ``pdist`` output.

    Raises
    ------
    ValueError
        If a coordinate cannot be converted to a number, or if the number of
        pairs differs from ``len(distance_columns)``.
    """
    # Two coordinate columns (x, y) per bodypart
    num_points = len(coordinates_columns) // 2

    # Convert whole columns at once instead of once per row; with ~10^6 rows the
    # row-wise iterrows()/apply() version dominated the runtime.
    coordinates = (
        df[coordinates_columns]
        .apply(pd.to_numeric, errors='raise')
        .to_numpy(dtype=float)
        .reshape(len(df), num_points, 2)
    )

    # Index pairs (i, j) with i < j, in row-major order
    first, second = np.triu_indices(num_points, k=1)
    deltas = coordinates[:, first, :] - coordinates[:, second, :]
    distances = np.sqrt((deltas ** 2).sum(axis=2))

    # Label the columns and keep the caller's row index
    distances_df = pd.DataFrame(distances, columns=distance_columns)
    distances_df.index = df.index
    return distances_df

In [3]:
# Compute the pairwise distances video by video and stack the results.
# The per-video frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previously collected rows on every iteration.
distance_frames = []
for vid in df_complete['videoname'].unique():
    df_dist = pairwise_distances(df_complete.loc[df_complete['videoname'] == vid])
    df_dist['videoname'] = vid
    distance_frames.append(df_dist)
df_distance = pd.concat(distance_frames) if distance_frames else pd.DataFrame()

NameError: name 'df_complete' is not defined

In [8]:
# get mean distance parameters over videos
dist_grouped = df_distance.groupby('videoname')
dist_average = dist_grouped.mean()

                      Dist_nose_earL  Dist_nose_earR  Dist_nose_schoulderL  \
videoname                                                                    
CDKL5_OLR_Probe_CD21       28.291263       28.694249             43.053106   
CDKL5_OLR_Probe_CD23       28.443268       28.057779             42.693344   
CDKL5_OLR_Probe_CD24       29.023982       29.081298             45.714691   
CDKL5_OLR_Probe_CD25       30.450875       29.054574             46.011034   
CDKL5_OLR_Probe_CD26       25.009166       25.974209             39.095224   
...                              ...             ...                   ...   
SHYB_OLR_T_SGH59           31.907897       31.400853             49.540734   
SHYB_OLR_T_SGH60           30.778023       30.890876             47.456119   
SHYB_OLR_T_SGH61           33.153114       33.936369             50.084410   
SHYB_OLR_T_SGH62           30.879906       30.622127             46.189168   
SHYB_OLR_T_SGH63           31.098791       31.693692            

In [9]:
def triangle_areas(df):
    """Compute the area of every triangle spanned by three bodyparts, row by row.

    Relies on two module-level lists: ``coordinates_columns`` (interleaved x/y
    column names, one pair per bodypart) and ``bodyparts_names`` (one name per
    bodypart, in the same order).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``coordinates_columns``; any other
        column is ignored.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; one column per 3-combination of bodyparts, labelled
        ``'Area_<p>_<q>_<r>'``. The labels are passed as ``columns=[header]``,
        so the result carries a one-level MultiIndex (each label is a 1-tuple),
        exactly as before.

    Raises
    ------
    ValueError
        If a coordinate cannot be converted to a number.
    """
    # Number of bodyparts comes from the coordinate list itself; the previous
    # (df.shape[1] - 2) // 3 guess broke as soon as df had any extra column.
    num_points = len(coordinates_columns) // 2

    # Convert whole columns at once instead of once per row
    coordinates = (
        df[coordinates_columns]
        .apply(pd.to_numeric, errors='raise')
        .to_numpy(dtype=float)
        .reshape(len(df), num_points, 2)
    )

    # All triples of bodypart indices; computed once, not once per row
    point_combinations = list(combinations(range(num_points), 3))
    header = [
        'Area_' + bodyparts_names[p] + '_' + bodyparts_names[q] + '_' + bodyparts_names[r]
        for p, q, r in point_combinations
    ]

    # Column k of combo_index holds the k-th corner of every triangle
    combo_index = np.array(point_combinations, dtype=int).reshape(-1, 3)
    x1, y1 = coordinates[:, combo_index[:, 0], 0], coordinates[:, combo_index[:, 0], 1]
    x2, y2 = coordinates[:, combo_index[:, 1], 0], coordinates[:, combo_index[:, 1], 1]
    x3, y3 = coordinates[:, combo_index[:, 2], 0], coordinates[:, combo_index[:, 2], 1]

    # Shoelace formula, evaluated for all rows and all triangles at once
    areas = 0.5 * np.abs(x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2))

    # Label the columns and keep the caller's row index
    areas_df = pd.DataFrame(areas, columns=[header])
    areas_df.index = df.index
    return areas_df

In [2]:
# Compute the triangle areas video by video and stack the results.
# The per-video frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previously collected rows on every iteration.
area_frames = []
for vid in df_complete['videoname'].unique():
    df_areas_inter = triangle_areas(df_complete.loc[df_complete['videoname'] == vid])
    df_areas_inter['videoname'] = vid
    area_frames.append(df_areas_inter)
df_areas = pd.concat(area_frames) if area_frames else pd.DataFrame()

NameError: name 'pd' is not defined

In [11]:
# Flatten the column labels of df_areas into plain strings.
# Labels that are 1-tuples (one-level MultiIndex) are unwrapped; labels that are
# already plain strings are kept as they are. Indexing unconditionally with [0]
# would reduce a plain string label to its first character.
areas_header_new = [
    label[0] if isinstance(label, tuple) else label
    for label in df_areas.columns.values
]

# Work on a copy so df_areas keeps its original labels
df_renamed = df_areas.copy()
df_renamed.columns = areas_header_new

[('Area_nose_earL_earR',), ('Area_nose_earL_schoulderL',), ('Area_nose_earL_schoulderR',), ('Area_nose_earL_tailbase',), ('Area_nose_earR_schoulderL',), ('Area_nose_earR_schoulderR',), ('Area_nose_earR_tailbase',), ('Area_nose_schoulderL_schoulderR',), ('Area_nose_schoulderL_tailbase',), ('Area_nose_schoulderR_tailbase',), ('Area_earL_earR_schoulderL',), ('Area_earL_earR_schoulderR',), ('Area_earL_earR_tailbase',), ('Area_earL_schoulderL_schoulderR',), ('Area_earL_schoulderL_tailbase',), ('Area_earL_schoulderR_tailbase',), ('Area_earR_schoulderL_schoulderR',), ('Area_earR_schoulderL_tailbase',), ('Area_earR_schoulderR_tailbase',), ('Area_schoulderL_schoulderR_tailbase',), ('videoname',)]
['Area_nose_earL_earR', 'Area_nose_earL_schoulderL', 'Area_nose_earL_schoulderR', 'Area_nose_earL_tailbase', 'Area_nose_earR_schoulderL', 'Area_nose_earR_schoulderR', 'Area_nose_earR_tailbase', 'Area_nose_schoulderL_schoulderR', 'Area_nose_schoulderL_tailbase', 'Area_nose_schoulderR_tailbase', 'Area_ea

In [13]:
# get mean area parameters over videos
areas_grouped = df_renamed.groupby('videoname')
areas_average = areas_grouped.mean()

                      Area_nose_earL_earR  Area_nose_earL_schoulderL  \
videoname                                                              
CDKL5_OLR_Probe_CD21           246.726797                  53.323278   
CDKL5_OLR_Probe_CD23           230.927493                  57.413580   
CDKL5_OLR_Probe_CD24           242.598776                  63.721704   
CDKL5_OLR_Probe_CD25           234.938916                  64.122775   
CDKL5_OLR_Probe_CD26           202.002285                  47.948896   
...                                   ...                        ...   
SHYB_OLR_T_SGH59               279.853962                  93.171801   
SHYB_OLR_T_SGH60               267.816727                  68.838017   
SHYB_OLR_T_SGH61               311.653723                  49.619385   
SHYB_OLR_T_SGH62               267.283738                  54.787662   
SHYB_OLR_T_SGH63               262.793494                  69.874096   

                      Area_nose_earL_schoulderR  Area_nose_earL

In [14]:
def triangle_angles(df):
    """Compute the three interior angles of every bodypart triangle, row by row.

    Relies on two module-level lists: ``coordinates_columns`` (interleaved x/y
    column names, one pair per bodypart) and ``bodyparts_names`` (one name per
    bodypart, in the same order).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``coordinates_columns``; any other
        column is ignored.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; three columns per 3-combination (A, B, C) of
        bodyparts, in this order: ``'Angle_B_A_C'`` (angle at A), ``'Angle_A_B_C'``
        (angle at B), ``'Angle_A_C_B'`` (angle at C). Angles are in radians (the
        output of ``np.arccos``). A triangle with a zero-length side yields NaN
        for the angles that are undefined. The labels are passed as
        ``columns=[header]``, so the result carries a one-level MultiIndex (each
        label is a 1-tuple), exactly as before.

    Raises
    ------
    ValueError
        If a coordinate cannot be converted to a number.
    """

    def _angle_opposite(opposite, side_1, side_2):
        # Law of cosines for the angle opposite to `opposite`.
        with np.errstate(divide='ignore', invalid='ignore'):
            cosine = (side_1 ** 2 + side_2 ** 2 - opposite ** 2) / (2 * side_1 * side_2)
        # Rounding can push the cosine of a (nearly) collinear triple slightly
        # outside [-1, 1], which made arccos return NaN; clip finite values only so
        # that genuinely undefined cases (zero-length side) still come out as NaN.
        cosine = np.where(np.isfinite(cosine), np.clip(cosine, -1.0, 1.0), np.nan)
        return np.arccos(cosine)

    # Number of bodyparts comes from the coordinate list itself; the previous
    # (df.shape[1] - 2) // 3 guess broke as soon as df had any extra column.
    num_points = len(coordinates_columns) // 2

    # Convert whole columns at once instead of once per row
    coordinates = (
        df[coordinates_columns]
        .apply(pd.to_numeric, errors='raise')
        .to_numpy(dtype=float)
        .reshape(len(df), num_points, 2)
    )

    # All triples of bodypart indices; computed once, not once per row
    point_combinations = list(combinations(range(num_points), 3))

    # Three labels per triangle; the middle name is the vertex the angle sits at
    header = []
    for p, q, r in point_combinations:
        name_a, name_b, name_c = bodyparts_names[p], bodyparts_names[q], bodyparts_names[r]
        header.append('Angle_' + name_b + '_' + name_a + '_' + name_c)  # angle a
        header.append('Angle_' + name_a + '_' + name_b + '_' + name_c)  # angle b
        header.append('Angle_' + name_a + '_' + name_c + '_' + name_b)  # angle c

    # Corner coordinates of every triangle: arrays of shape (rows, triangles, 2)
    combo_index = np.array(point_combinations, dtype=int).reshape(-1, 3)
    corner_a = coordinates[:, combo_index[:, 0], :]
    corner_b = coordinates[:, combo_index[:, 1], :]
    corner_c = coordinates[:, combo_index[:, 2], :]

    # Side lengths; each side is named after the corner it lies opposite to
    a = np.linalg.norm(corner_b - corner_c, axis=2)
    b = np.linalg.norm(corner_a - corner_c, axis=2)
    c = np.linalg.norm(corner_a - corner_b, axis=2)

    angle_a = _angle_opposite(a, b, c)  # BAC
    angle_b = _angle_opposite(b, a, c)  # ABC
    angle_c = _angle_opposite(c, a, b)  # ACB

    # Interleave as a0, b0, c0, a1, b1, c1, ... to match the header order
    angles = np.stack([angle_a, angle_b, angle_c], axis=2).reshape(len(df), -1)

    # Label the columns and keep the caller's row index
    angles_df = pd.DataFrame(angles, columns=[header])
    angles_df.index = df.index

    return angles_df

In [1]:
# Compute the triangle angles video by video and stack the results.
# The per-video frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previously collected rows on every iteration.
angle_frames = []
for vid in df_complete['videoname'].unique():
    df_triangle_angles = triangle_angles(df_complete.loc[df_complete['videoname'] == vid])
    df_triangle_angles['videoname'] = vid
    angle_frames.append(df_triangle_angles)
df_angle = pd.concat(angle_frames) if angle_frames else pd.DataFrame()

NameError: name 'pd' is not defined

In [17]:
# Flatten the column labels of df_angle into plain strings.
# Labels that are 1-tuples (one-level MultiIndex) are unwrapped; labels that are
# already plain strings are kept as they are. Indexing unconditionally with [0]
# would reduce a plain string label to its first character.
angles_header_new = [
    label[0] if isinstance(label, tuple) else label
    for label in df_angle.columns.values
]

# Work on a copy so df_angle keeps its original labels
df_renamed_angles = df_angle.copy()
df_renamed_angles.columns = angles_header_new

['Angle_earL_nose_earR', 'Angle_nose_earL_earR', 'Angle_nose_earR_earL', 'Angle_earL_nose_schoulderL', 'Angle_nose_earL_schoulderL', 'Angle_nose_schoulderL_earL', 'Angle_earL_nose_schoulderR', 'Angle_nose_earL_schoulderR', 'Angle_nose_schoulderR_earL', 'Angle_earL_nose_tailbase', 'Angle_nose_earL_tailbase', 'Angle_nose_tailbase_earL', 'Angle_earR_nose_schoulderL', 'Angle_nose_earR_schoulderL', 'Angle_nose_schoulderL_earR', 'Angle_earR_nose_schoulderR', 'Angle_nose_earR_schoulderR', 'Angle_nose_schoulderR_earR', 'Angle_earR_nose_tailbase', 'Angle_nose_earR_tailbase', 'Angle_nose_tailbase_earR', 'Angle_schoulderL_nose_schoulderR', 'Angle_nose_schoulderL_schoulderR', 'Angle_nose_schoulderR_schoulderL', 'Angle_schoulderL_nose_tailbase', 'Angle_nose_schoulderL_tailbase', 'Angle_nose_tailbase_schoulderL', 'Angle_schoulderR_nose_tailbase', 'Angle_nose_schoulderR_tailbase', 'Angle_nose_tailbase_schoulderR', 'Angle_earR_earL_schoulderL', 'Angle_earL_earR_schoulderL', 'Angle_earL_schoulderL_earR

In [18]:
# get mean angle parameters over videos
angle_grouped = df_renamed_angles.groupby('videoname')
angle_average = angle_grouped.mean()

                      Angle_earL_nose_earR  Angle_nose_earL_earR  \
videoname                                                          
CDKL5_OLR_Probe_CD21              0.711118              1.231928   
CDKL5_OLR_Probe_CD23              0.640347              1.233656   
CDKL5_OLR_Probe_CD24              0.680082              1.231514   
CDKL5_OLR_Probe_CD25              0.583575              1.196011   
CDKL5_OLR_Probe_CD26              0.751172              1.251978   
...                                    ...                   ...   
SHYB_OLR_T_SGH59                  0.668977              1.207242   
SHYB_OLR_T_SGH60                  0.635082              1.250527   
SHYB_OLR_T_SGH61                  0.648181              1.278670   
SHYB_OLR_T_SGH62                  0.654858              1.230696   
SHYB_OLR_T_SGH63                  0.602621              1.295983   

                      Angle_nose_earR_earL  Angle_earL_nose_schoulderL  \
videoname                                

In [19]:
# Build one feature table with a single row per video: for each of the three
# per-frame feature tables, compute four per-video statistics, prefix the column
# names with the statistic, and left-merge the result onto the list of videos.

# Start from the list of videos so the table is ready for the left merges
df_features = pd.DataFrame()
df_features['videoname'] = df_complete['videoname'].unique()

# Column prefix -> name of the groupby method that computes the statistic
stat_methods = {'mean_': 'mean', 'median_': 'median', 'var_': 'var', 'std_': 'std'}

# Loop through the three feature tables
for df in [df_distance, df_renamed, df_renamed_angles]:
    # Group once per table. The previous version copied the whole table for every
    # statistic and renamed its columns by position, which silently mislabels the
    # features if 'videoname' is not the last column.
    grouped = df.groupby('videoname')

    # Loop through the four statistics extracted per feature
    for prefix, method_name in stat_methods.items():
        # Result is indexed by 'videoname'; add_prefix only touches the feature columns
        per_video_stat = getattr(grouped, method_name)().add_prefix(prefix)
        df_features = df_features.merge(per_video_stat, on='videoname', how='left')

['Dist_nose_earL', 'Dist_nose_earR', 'Dist_nose_schoulderL', 'Dist_nose_schoulderR', 'Dist_nose_tailbase', 'Dist_earL_earR', 'Dist_earL_schoulderL', 'Dist_earL_schoulderR', 'Dist_earL_tailbase', 'Dist_earR_schoulderL', 'Dist_earR_schoulderR', 'Dist_earR_tailbase', 'Dist_schoulderL_schoulderR', 'Dist_schoulderL_tailbase', 'Dist_schoulderR_tailbase', 'videoname']
mean_
['Dist_nose_earL', 'Dist_nose_earR', 'Dist_nose_schoulderL', 'Dist_nose_schoulderR', 'Dist_nose_tailbase', 'Dist_earL_earR', 'Dist_earL_schoulderL', 'Dist_earL_schoulderR', 'Dist_earL_tailbase', 'Dist_earR_schoulderL', 'Dist_earR_schoulderR', 'Dist_earR_tailbase', 'Dist_schoulderL_schoulderR', 'Dist_schoulderL_tailbase', 'Dist_schoulderR_tailbase']
['mean_Dist_nose_earL', 'mean_Dist_nose_earR', 'mean_Dist_nose_schoulderL', 'mean_Dist_nose_schoulderR', 'mean_Dist_nose_tailbase', 'mean_Dist_earL_earR', 'mean_Dist_earL_schoulderL', 'mean_Dist_earL_schoulderR', 'mean_Dist_earL_tailbase', 'mean_Dist_earR_schoulderL', 'mean_Dist

['Dist_nose_earL', 'Dist_nose_earR', 'Dist_nose_schoulderL', 'Dist_nose_schoulderR', 'Dist_nose_tailbase', 'Dist_earL_earR', 'Dist_earL_schoulderL', 'Dist_earL_schoulderR', 'Dist_earL_tailbase', 'Dist_earR_schoulderL', 'Dist_earR_schoulderR', 'Dist_earR_tailbase', 'Dist_schoulderL_schoulderR', 'Dist_schoulderL_tailbase', 'Dist_schoulderR_tailbase', 'videoname']
std_
['Area_nose_earL_earR', 'Area_nose_earL_schoulderL', 'Area_nose_earL_schoulderR', 'Area_nose_earL_tailbase', 'Area_nose_earR_schoulderL', 'Area_nose_earR_schoulderR', 'Area_nose_earR_tailbase', 'Area_nose_schoulderL_schoulderR', 'Area_nose_schoulderL_tailbase', 'Area_nose_schoulderR_tailbase', 'Area_earL_earR_schoulderL', 'Area_earL_earR_schoulderR', 'Area_earL_earR_tailbase', 'Area_earL_schoulderL_schoulderR', 'Area_earL_schoulderL_tailbase', 'Area_earL_schoulderR_tailbase', 'Area_earR_schoulderL_schoulderR', 'Area_earR_schoulderL_tailbase', 'Area_earR_schoulderR_tailbase', 'Area_schoulderL_schoulderR_tailbase']
['std_Area

['Dist_nose_earL', 'Dist_nose_earR', 'Dist_nose_schoulderL', 'Dist_nose_schoulderR', 'Dist_nose_tailbase', 'Dist_earL_earR', 'Dist_earL_schoulderL', 'Dist_earL_schoulderR', 'Dist_earL_tailbase', 'Dist_earR_schoulderL', 'Dist_earR_schoulderR', 'Dist_earR_tailbase', 'Dist_schoulderL_schoulderR', 'Dist_schoulderL_tailbase', 'Dist_schoulderR_tailbase', 'videoname']
var_
['Angle_earL_nose_earR', 'Angle_nose_earL_earR', 'Angle_nose_earR_earL', 'Angle_earL_nose_schoulderL', 'Angle_nose_earL_schoulderL', 'Angle_nose_schoulderL_earL', 'Angle_earL_nose_schoulderR', 'Angle_nose_earL_schoulderR', 'Angle_nose_schoulderR_earL', 'Angle_earL_nose_tailbase', 'Angle_nose_earL_tailbase', 'Angle_nose_tailbase_earL', 'Angle_earR_nose_schoulderL', 'Angle_nose_earR_schoulderL', 'Angle_nose_schoulderL_earR', 'Angle_earR_nose_schoulderR', 'Angle_nose_earR_schoulderR', 'Angle_nose_schoulderR_earR', 'Angle_earR_nose_tailbase', 'Angle_nose_earR_tailbase', 'Angle_nose_tailbase_earR', 'Angle_schoulderL_nose_schould

Exporting files

In [27]:
# Number of columns in the feature table
print(df_features.shape[1])

381


In [29]:
# Prefix prepended to every output file name (empty string: no prefix)
filepath = ""

# Frames to export, in order: the big per-video statistics table first, then the
# three per-frame feature tables (no stats). The row index is not written.
exports = [
    (df_features, "features_big_stats_df.csv"),
    (df_distance, "distance_feature_df.csv"),
    (df_renamed, "areas_feature_df.csv"),
    (df_renamed_angles, "angle_feature_df.csv"),
]
for frame, filename in exports:
    frame.to_csv(filepath + filename, index=False)

In [30]:
# Sanity check: True if any cell of df_features is missing.
# The recorded result is False, i.e. there are no null values.
df_features.isna().to_numpy().any()

False

In [31]:
# Show the first ten rows of the feature table
df_features.iloc[:10]

Unnamed: 0,videoname,mean_Dist_nose_earL,mean_Dist_nose_earR,mean_Dist_nose_schoulderL,mean_Dist_nose_schoulderR,mean_Dist_nose_tailbase,mean_Dist_earL_earR,mean_Dist_earL_schoulderL,mean_Dist_earL_schoulderR,mean_Dist_earL_tailbase,...,std_Angle_earR_schoulderR_schoulderL,std_Angle_schoulderL_earR_tailbase,std_Angle_earR_schoulderL_tailbase,std_Angle_earR_tailbase_schoulderL,std_Angle_schoulderR_earR_tailbase,std_Angle_earR_schoulderR_tailbase,std_Angle_earR_tailbase_schoulderR,std_Angle_schoulderR_schoulderL_tailbase,std_Angle_schoulderL_schoulderR_tailbase,std_Angle_schoulderL_tailbase_schoulderR
0,CDKL5_OLR_Probe_CD21,28.291263,28.694249,43.053106,43.467834,118.770897,19.17154,14.944144,29.114398,95.605327,...,0.279065,0.441359,0.501988,0.093596,0.37431,0.412481,0.050822,0.463247,0.447949,0.097291
1,CDKL5_OLR_Probe_CD23,28.443268,28.057779,42.693344,41.988285,106.891879,17.129348,14.749161,24.855729,84.725984,...,0.297405,0.4942,0.578919,0.14223,0.42038,0.48303,0.082272,0.585691,0.562106,0.13647
2,CDKL5_OLR_Probe_CD24,29.023982,29.081298,45.714691,44.368433,118.671322,18.353541,17.021896,29.178609,93.984174,...,0.234239,0.421175,0.491129,0.089179,0.32008,0.365676,0.058755,0.410991,0.398889,0.090983
3,CDKL5_OLR_Probe_CD25,30.450875,29.054574,46.011034,42.713236,112.381847,16.980724,16.702965,27.145066,88.76286,...,0.253745,0.403466,0.464702,0.114862,0.358236,0.413492,0.065499,0.418636,0.434343,0.125798
4,CDKL5_OLR_Probe_CD26,25.009166,25.974209,39.095224,40.669948,111.465373,17.914892,14.405179,27.364562,89.863298,...,0.23807,0.461714,0.531041,0.093019,0.365157,0.409743,0.056518,0.463174,0.448544,0.093444
5,CDKL5_OLR_Probe_CD27,26.76378,28.00143,41.022557,41.388079,112.670523,17.144133,14.822287,26.216332,91.912756,...,0.264646,0.416218,0.461455,0.092156,0.448507,0.487667,0.05472,0.468115,0.47342,0.111571
6,CDKL5_OLR_Probe_CD5,28.203543,28.245817,43.360453,42.76536,122.224335,17.352192,15.252775,26.215104,96.558021,...,0.318495,0.401417,0.480741,0.100007,0.397669,0.426259,0.043038,0.401621,0.359318,0.089913
7,CDKL5_OLR_SS1_CKL210,30.144001,29.743761,46.701,44.512424,122.021394,19.95672,16.782001,30.213786,96.193572,...,0.267055,0.39346,0.454512,0.102947,0.350975,0.396233,0.055935,0.403869,0.402851,0.111057
8,CDKL5_OLR_SS1_CKL212,29.225575,28.893929,46.021841,45.102658,122.849949,19.645125,16.611852,29.062064,96.888596,...,0.262196,0.450902,0.53447,0.1219,0.330743,0.379717,0.064566,0.501717,0.490027,0.118622
9,CDKL5_OLR_SS1_CKL215,26.294279,25.926933,40.72902,39.927463,110.906538,17.028317,14.557065,26.500742,89.869585,...,0.243608,0.378171,0.403528,0.068216,0.292452,0.332059,0.04386,0.386693,0.416743,0.075863
