# Part 1: Read in CSV files and Make Dataframes
1. Collect the paths of all CSV files in the data folder with glob.glob
2. Read each CSV file into a dataframe
3. Collect the minimum, maximum, and mean confidence of each file

In [None]:
import pandas as pd
import os
import glob
import math
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Collect the per-video CSV files.  The summary table written by the last cell
# of this notebook is saved into this same folder, so it is filtered out here;
# otherwise a re-run would pick it up as if it were a video file and fail on
# its missing 'confidence' column.
summary_csv_name = "VideoDataUpdated_Version2.csv"
csvs = [
    path
    for path in glob.glob("C:/Users/fanj4/OneDrive - Children's Hospital of Philadelphia/Documents/Video_Data/*.csv")
    if os.path.basename(path) != summary_csv_name
]
print(csvs)

# Working directory, printed for reference only.
cwd = os.getcwd()
print(cwd)

In [None]:
# Number of rows removed from each end of a file before the "*_fx" statistics
# are taken.
EDGE_FRAMES = 90

# Per-file confidence statistics over every row of the file.
agg_min = []
agg_max = []
agg_mean = []
# The same statistics after trimming EDGE_FRAMES rows from both ends.
min_fx = []
max_fx = []
mean_fx = []

for csv_path in csvs:
    df = pd.read_csv(csv_path, index_col=None, header=0)

    confidence = df['confidence']
    print(confidence.min())
    agg_min.append(confidence.min())
    agg_max.append(confidence.max())
    agg_mean.append(confidence.mean())

    # Trim by position.  Dropping df.index[[0, 90]] (and the matching pair at
    # the tail) removed only four individual rows and raised IndexError on
    # files shorter than 91 rows; a slice removes the whole leading and
    # trailing ranges and simply yields an empty frame (NaN statistics) when
    # the file is too short.
    trimmed = df.iloc[EDGE_FRAMES:-EDGE_FRAMES]['confidence']
    min_fx.append(trimmed.min())
    max_fx.append(trimmed.max())
    mean_fx.append(trimmed.mean())

In [None]:
# Histogram of each file's minimum confidence (all rows).
plt.hist(x=agg_min, bins=20, edgecolor="black")

In [None]:
# Histogram of each file's maximum confidence (all rows).
plt.hist(x=agg_max, bins=20, edgecolor="black")

In [None]:
# Histogram of each file's mean confidence (all rows).
plt.hist(x=agg_mean, bins=20, edgecolor="black")

In [None]:
# Histogram of each file's minimum confidence, taken after rows were dropped
# from the start and end of the file.
plt.hist(x=min_fx, bins=20)

In [None]:
# Histogram of each file's maximum confidence, taken after rows were dropped
# from the start and end of the file.
plt.hist(x=max_fx, bins=20)

In [None]:
# Histogram of each file's mean confidence, taken after rows were dropped
# from the start and end of the file.
plt.hist(x=mean_fx, bins=20)

# Part 2: Data Cleaning - Identify and Remove Bad Frames

- The majority of low-confidence frames occurred when the subject was looking off to the side and quickly moving their head back to center, or shaking their head side to side, such that only the side profile was visible.
- We defined "good data" as frames with confidence greater than or equal to 0.5.
- Because of adjustments and movements at the beginning of videos, many frames at the beginning of the recording are low confidence. Therefore, we chose to cut out the first 10 seconds (300 frames) of each video.

In [None]:
# Rows discarded from the start of every recording (10 s at 30 fps).
LEADING_FRAMES_SKIPPED = 300
# Frames with confidence below this value are reported as bad.
BAD_FRAME_THRESHOLD = 0.5
# Upper bound on the number of files inspected.  The previous counter-based
# guard (`if t > 246: break`, checked before incrementing) let 247 files
# through, so the same limit is kept here.
MAX_VIDEOS = 247

# Never populated by this cell; kept defined in case other cells refer to it.
bad_data_frames = []

for csv_path in csvs[:MAX_VIDEOS]:
    df = pd.read_csv(csv_path, index_col=None, header=0).iloc[LEADING_FRAMES_SKIPPED:]

    # Use the file name without directory or extension as the video ID.
    # os.path handles either path separator, whereas splitting on '\\' and
    # taking element 1 only worked for paths containing exactly one backslash.
    video_id = os.path.splitext(os.path.basename(csv_path))[0]
    print(video_id)

    # Confidence over time, followed by the timestamps of the bad frames.
    df.plot(x='timestamp', y='confidence')
    bad_data = df[df.confidence < BAD_FRAME_THRESHOLD]
    print(list(bad_data["timestamp"]))
    plt.show()

# Part 3: Generate 13 Variables
1. For each file in 'csvs'
    a. Run pd.read_csv to get each column in csv
    b. Take out bad frames with confidence below 0.6 
    c. Generate 13 variables based on 3 columns: 'pose_Rx', 'pose_Ry', 'pose_Rz'
2. Notes on what each variable is:
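- Note: v1–v4 are each computed separately for the pitch, yaw, and roll angles ('pose_Rx', 'pose_Ry', 'pose_Rz'), giving the x, y, and z versions of each variable.
- v1x, v1y, v1z = Root mean square of the angle over all frames (square root of the average squared value)
- v2x, v2y, v2z = Square root of the average squared difference between consecutive frames
- v3x, v3y, v3z = Range of the angle: maximum value - minimum value across all frames
- v4x, v4y, v4z = Sum of the absolute differences between consecutive frames, divided by the number of frames and multiplied by 1800 (30 frames per second x 60 seconds, i.e. a per-minute rate)
- v13 = Average over all frames of the combined rotation magnitude, sqrt(Rx^2 + Ry^2 + Rz^2)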

In [None]:
# Frames at or above this confidence are kept for feature extraction.
FEATURE_MIN_CONFIDENCE = 0.6
# Two consecutive kept rows whose frame numbers differ by more than this are
# treated as a gap; that pair contributes nothing to v2 or v4.
MAX_FRAME_GAP = 10
# Multiplier applied to the mean absolute frame-to-frame change to obtain v4.
V4_SCALE = 1800


def _axis_features(angles, frames):
    """Return (v1, v2, v3, v4) for a single pose-angle column.

    angles -- pandas Series of one angle; missing values are treated as 0.
    frames -- pandas Series of frame numbers aligned row-for-row with angles.

    v1 -- root mean square of the angle.
    v2 -- square root of (sum of squared consecutive differences / row count).
    v3 -- maximum minus minimum of the angle.
    v4 -- V4_SCALE * (sum of absolute consecutive differences / row count).

    Only genuinely consecutive rows are differenced.  The earlier index loop
    skipped i == 1 instead of i == 0, so it compared the first row with the
    last one (via iloc[-1]) and ignored the real first pair.
    """
    values = angles.fillna(0).to_numpy(dtype=float)
    count = values.size

    steps = np.diff(values)
    frame_gaps = np.diff(frames.to_numpy(dtype=float))
    # `~(gap > limit)` rather than `gap <= limit` so that a missing frame
    # number does not exclude the pair, matching the original comparison.
    steps = steps[~(frame_gaps > MAX_FRAME_GAP)]

    v1 = math.sqrt(float(np.square(values).sum()) / count)
    v2 = math.sqrt(float(np.square(steps).sum()) / count)
    v3 = float(values.max() - values.min())
    v4 = V4_SCALE * float(np.abs(steps).sum()) / count
    return v1, v2, v3, v4


# data  -- the confidence-filtered frames of every processed file.
# vdata -- one dict of the 13 variables (plus 'ID') per processed file.
data = []
vdata = []
print(len(csvs))

for proj in csvs:
    # File name without directory or extension; os.path copes with either
    # path separator, unlike splitting on '\\' and taking element 1.
    video_id = os.path.splitext(os.path.basename(proj))[0]

    df = pd.read_csv(proj, index_col=None, header=0)
    good_data = df[df['confidence'] >= FEATURE_MIN_CONFIDENCE]
    print(good_data)

    # Nothing to measure when no frame passes the confidence filter.
    if len(good_data) == 0:
        continue

    frames = good_data['frame']
    variables = {'ID': video_id}
    for axis in ('x', 'y', 'z'):
        v1, v2, v3, v4 = _axis_features(good_data['pose_R' + axis], frames)
        variables['v1_' + axis] = v1
        variables['v2_' + axis] = v2
        variables['v3_' + axis] = v3
        variables['v4_' + axis] = v4

    # v13: mean over frames of sqrt(Rx^2 + Ry^2 + Rz^2), missing values as 0.
    # The filled values live in a separate array, so good_data itself is
    # stored unmodified below.
    rotation = good_data[['pose_Rx', 'pose_Ry', 'pose_Rz']].fillna(0).to_numpy(dtype=float)
    variables['v13'] = float(np.sqrt(np.square(rotation).sum(axis=1)).mean())

    vdata.append(variables)
    data.append(good_data)

print(data)  # Check Output 1
# TODO: confirm whether face IDs are always 0 before combining `data` into a
# single frame with pd.concat(data, ignore_index=True).

In [None]:
# Display the list of per-video variable dicts.
vdata

In [None]:
# Turn the list of per-video dicts into a table and display it.
vdata_df = pd.DataFrame(data=vdata)
vdata_df

In [None]:
# Write the variable table to disk as CSV.
vdata_df.to_csv(
    path_or_buf="C:/Users/fanj4/OneDrive - Children's Hospital of Philadelphia/Documents/Video_Data/VideoDataUpdated_Version2.csv"
)