In [51]:
import pandas as pd
import glob
import os

from tqdm import tqdm

from utility_functions import get_year_from_filename

In [52]:
def find_keyframe_files(root, extensions=("csv", "xlsx")):
    """Recursively collect files below ``root`` that have one of ``extensions``.

    Parameters
    ----------
    root : str
        Directory to search; sub-directories at any depth are included.
    extensions : iterable of str
        File extensions without the leading dot. Results are grouped in the
        order the extensions are given.

    Returns
    -------
    list of str
        Matching paths, sorted alphabetically within each extension group.
        Empty when ``root`` does not exist or holds no matching file.
    """
    found = []
    for extension in extensions:
        # Build the pattern with os.path.join so the separator after "**" is
        # the platform's own one; a hard-coded backslash only recurses on Windows.
        pattern = os.path.join(root, "**", "*." + extension)
        # glob gives no ordering guarantee; sort so repeated runs agree.
        found.extend(sorted(glob.glob(pattern, recursive=True)))
    return found


file_names = find_keyframe_files(os.path.join("data", "keyframes"))

print("Identified {} .csv and .xlsx files...".format(len(file_names)))

Identified 94 .csv and .xlsx files...


In [53]:
# Display the collected paths so the discovered files can be checked by eye.
file_names

['data\\keyframes\\2017\\Keyframes\\Body10_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body11_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body12_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body13_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body14_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body18_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body22_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body26_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body27_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body28_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body57_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body59_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body60_keyframes.csv',
 'data\\keyframes\\2017\\Keyframes\\Body61_keyframes.csv',
 'data\\keyframes\\2019\\Keyframes\\Body11_keyframes.csv',
 'data\\keyframes\\2019\\Keyframes\\Body12_keyframes.csv',
 'data\\keyframes\\2019\\Keyframes\\Body13_keyframes.csv

In [54]:
# Columns every cleaned keyframe table is allowed to contain, in output order.
EXPECTED_COLUMNS = ['bodyNumber', 'trial', 'year', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'BR', 'BV', 'ORWF']

# BEFORE RUNNING: Go to file data\keyframes\2021_A\Keyframes\Body69_keyframes.csv and delete the final comma in line 15

# Start with an empty template frame: it fixes the column order of the result
# and keeps df well-formed (13 columns, zero rows) when no file is loaded.
frames = [pd.DataFrame(columns=EXPECTED_COLUMNS)]
for file_name in tqdm(file_names):
    year = get_year_from_filename(file_name)
    # Compare case-insensitively and include the dot so an upper-case ".CSV"
    # is not sent to the Excel reader.
    if file_name.lower().endswith('.csv'):
        # The files use either ';' or ',' as delimiter; a regex separator
        # requires the python parsing engine.
        temp = pd.read_csv(file_name, sep=';|,', engine='python')
    else:
        temp = pd.read_excel(file_name)
    # add year
    temp['year'] = year
    # remove empty, unnamed columns and the filename column present only in 2021_B folder
    keep = ~temp.columns.str.contains('^Unnamed') & ~temp.columns.str.contains('filename')
    temp = temp.loc[:, keep]
    # remove white spaces from column names
    temp.columns = temp.columns.str.replace(' ', '')
    # Validate BEFORE accepting the file, so a malformed file never ends up in df.
    unexpected = [column for column in temp.columns if column not in EXPECTED_COLUMNS]
    if unexpected:
        print("There is a problem with the formatting of file {}. Please correct it before proceeding.".format(file_name))
        print("Unexpected columns: {}".format(unexpected))
        print(temp.head())
        break
    frames.append(temp)

# Concatenate once instead of growing df inside the loop, which would re-copy
# all previously loaded rows on every iteration.
df = pd.concat(frames, axis=0)

100%|██████████| 94/94 [00:00<00:00, 186.20it/s]


In [55]:
# Banner first, then the first rows of the combined frame as plain text.
content_banner = "*********** Dataframe Content: ***********\n"
first_rows = df.head()
print(content_banner)
print(first_rows)

*********** Dataframe Content: ***********

  bodyNumber trial  year   T0   T1   T2   T3   T4   T5   T6   BR   BV ORWF
0         10     1  2017  NaN  NaN  350  532  601  655  790  NaN  NaN  NaN
1         10     2  2017  NaN  NaN  310  574  595  649  840  NaN  NaN  NaN
2         10     3  2017  NaN  NaN  269  533  600  654  792  NaN  NaN  NaN
3         10     4  2017  NaN  NaN  271  535  557  611  790  NaN  NaN  NaN
4         10     5  2017  NaN  NaN  214  478  533  587  815  NaN  NaN  NaN


In [56]:
# Banner first, then the per-column summary produced by describe().
description_banner = "*********** Dataframe Description: ***********\n"
column_summary = df.describe()
print(description_banner)
print(column_summary)

*********** Dataframe Description: ***********

        bodyNumber  trial  year   T0   T1      T2      T3      T4      T5  \
count         1647   1647  1647  335  335  1494.0  1494.0  1645.0  1645.0   
unique          75     28     4   26  265   786.0   914.0   991.0   980.0   
top             13      6  2019   50  750     0.0    -1.0    -1.0    -1.0   
freq            55     94   881   87    4   128.0    32.0    32.0    32.0   

            T6   BR    BV  ORWF  
count   1542.0   67    40    27  
unique   939.0   66    39    25  
top       -1.0  741  1829   734  
freq      32.0    2     2     2  


In [57]:
# Banner first; the cell's last expression then displays the number of
# missing values in each column.
nulls_banner = "*********** Dataframe Null Values per Column: ***********\n"
print(nulls_banner)
df.isnull().sum()

*********** Dataframe Null Values per Column: ***********



bodyNumber       0
trial            0
year             0
T0            1312
T1            1312
T2             153
T3             153
T4               2
T5               2
T6             105
BR            1580
BV            1607
ORWF          1620
dtype: int64

In [59]:
# Write the combined frame (index included) next to the keyframes folder.
output_path = os.path.join("data", "keyframes.csv")
df.to_csv(output_path)