In [6]:
import pandas as pd
import numpy as np
import os
import re

In [80]:
folder_path = 'csv'

In [38]:
def standardize(x):
    '''
    takes a column of text, and standardizes it to all lowercase, no spaces, no punctuation
    
    '''
    
    # Remove all non-alphanumeric characters (punctuation, spaces, etc.)
    label_cleaned = re.sub(r'[^a-zA-Z]', '', x)
    # Convert to lowercase
    label_standardized = label_cleaned.lower()
    
    if 'self' in label_standardized:
        label_standardized = 'selflicking'
    elif 'groom' in label_standardized:
        label_standardized = 'allogrooming'
    else:
        label_standardized = 'allolicking'
    
    return label_standardized


In [53]:

# Loop through all files in the folder

# dictionary to hold all the data that will be turned into final df
all_video_data = {}
large_df = pd.DataFrame()

all_initials = set()


# loop through each file 
for filename in os.listdir(folder_path):
    
    if filename.endswith('.csv'):  # Check if the file is a CSV
        file_path = os.path.join(folder_path, filename)
        
        # temp dictionary that'll be added to all_video_data at the end of the loop
        curr_vid_data = {}
        
        # extract initials and file name for labeling purposes
        initials = filename.strip('.csv')[-2:]
        video_name = filename.split('_')[0]
        
        all_initials.add(initials)
        
        
        
        
        
   
        # turn current csv file into a data frame
        df = pd.read_csv(file_path, header = None)
        
        # standardize the labeling to all lowercase and no spaces/punctuation
        df[0] = df[0].apply(standardize)
        
        # get the total duration of each behavior
        # 0 = behavior name
        # 1 = NaN column
        # 2 = start time
        # 3 = end time
        # 4 = duration
        # 5 = NaN column
        
        d = dict(df.groupby(0).sum()[4])
        
        # rename the keys using initial and behavior name ex) allogrooming_KT
        for key, value in d.items():
            curr_vid_data[f'{key}_{initials}'] = value
        
        
        # first time this video appears,  not yet in large dict (creates new row for the video)
        if video_name not in all_video_data:
            all_video_data[video_name] = curr_vid_data  
            
        # video already in the dictonary, just append the data to existing row
        else:
            all_video_data[video_name].update(curr_vid_data)      
        
        

  d = dict(df.groupby(0).sum()[4])


In [54]:
all_initials

{'AB', 'AN', 'AZ', 'BF', 'DS', 'KC', 'KT', 'MA', 'MR', 'MW', 'NK', 'RB'}

In [78]:
t = pd.DataFrame(all_video_data).T.sort_index().fillna(0)

In [79]:
t

Unnamed: 0,allogrooming_AN,allolicking_AN,selflicking_AN,allogrooming_AZ,allolicking_AZ,selflicking_AZ,allogrooming_BF,selflicking_BF,allogrooming_DS,allolicking_DS,...,allogrooming_MW,allolicking_MW,selflicking_MW,allolicking_MR,allolicking_NK,allolicking_AB,allolicking_RB,allolicking_KC,allolicking_KT,allolicking_BF
reliability1,26.352,144.949,7.218,44.895,148.628,21.328,13.182,5.525,15.65,134.985,...,24.936,151.504,10.39,147.46,146.29,145.937,139.178,140.061,156.374,132.921
reliability2,14.408,8.11,124.413,18.534,10.271,141.148,11.816,111.834,13.72,5.088,...,26.214,7.922,137.526,9.753,7.939,7.26,3.85,3.588,14.757,3.149
reliability3,14.825,19.047,52.693,10.921,30.582,71.271,11.392,44.725,13.049,24.124,...,5.922,24.951,67.35,24.737,26.364,24.01,15.51,21.679,36.987,13.97
reliability4,75.9,53.971,94.959,74.835,52.448,97.64,52.074,68.551,43.114,55.492,...,67.57,53.158,94.32,55.73,52.219,45.6,55.381,99.013,59.157,40.529
reliability5,79.579,2.478,108.852,63.954,6.256,120.525,80.731,81.622,54.274,2.583,...,79.955,2.65,118.855,0.0,0.0,0.0,0.0,0.0,0.0,0.0
reliability6,142.836,3.08,95.119,76.143,10.493,99.976,125.51,99.043,95.116,0.0,...,165.322,5.66,96.162,0.0,7.198,0.0,5.67,0.0,3.129,2.386


In [57]:
# checking unique values

t.columns.str[:-3].unique()

Index(['allogrooming', 'allolicking', 'selflicking'], dtype='object')

### Manually checking indiv csv to sanity check

In [65]:
path = 'csv/reliability1_KT.csv'
AZ_2 = pd.read_csv(path, header = None)
AZ_2.head()

Unnamed: 0,0,1,2,3,4
0,allolicking,,118.105,131.941,13.836
1,allolicking,,150.036,154.123,4.087
2,allolicking,,172.446,173.002,0.556
3,allolicking,,222.633,232.075,9.442
4,allolicking,,237.999,239.493,1.494


In [66]:
AZ_2[0].unique()

array(['allolicking', 'allogrooming', 'selflicking'], dtype=object)

In [67]:
AZ_2[0] = AZ_2[0].apply(standardize)

In [68]:
AZ_2[0].unique()

array(['allolicking', 'allogrooming', 'selflicking'], dtype=object)

In [69]:
AZ_2.groupby(0).sum()[4]

0
allogrooming     25.750
allolicking     156.374
selflicking       4.954
Name: 4, dtype: float64

In [70]:
d = dict(df.groupby(0).sum()[4])
curr_vid_data = {}
all_vid_data = {}

In [71]:
for key, value in d.items():
#     print (value, key)
    curr_vid_data[f'{key}_{initials}'] = value

    

In [72]:
curr_vid_data

{'allogrooming_MW': 79.955, 'allolicking_MW': 2.65, 'selflicking_MW': 118.855}

In [124]:
test = {'Allogrooming_MA': 79.955, 'Allolicking_MA': 2.65, 'Self-licking_MA': 118.855}

In [125]:
all_vid_data[video_name] = curr_vid_data

In [126]:
all_vid_data

{'reliability5': {'Allogrooming_MW': 79.955,
  'Allolicking_MW': 2.65,
  'Self-licking_MW': 118.855}}

In [129]:
all_vid_data[video_name].update(test)

In [130]:
all_vid_data

{'reliability5': {'Allogrooming_MW': 79.955,
  'Allolicking_MW': 2.65,
  'Self-licking_MW': 118.855,
  'Allogrooming_MA': 79.955,
  'Allolicking_MA': 2.65,
  'Self-licking_MA': 118.855}}

In [73]:
test2 = {'reliability4': {'Allogrooming_MW': 79.955,
  'Allolicking_MW': 2.65,
  'Self-licking_MW': 118.855,
  'Allogrooming_MA': 79.955,
  'Allolicking_MA': 2.65,
  'Self-licking_MA': 118.855}}
test3 = {'reliability1': {'Allogrooming_MW': 79.955,
  'Allolicking_MW': 2.65,
  'Self-licking_MW': 118.855,
  'Allogrooming_MA': 79.955,
  'Allolicking_MA': 2.65,
  'Self-licking_MA': 118.855}}

In [76]:
all_vid_data.update(test2)

In [77]:
all_vid_data

{'reliability1': {'Allogrooming_MW': 79.955,
  'Allolicking_MW': 2.65,
  'Self-licking_MW': 118.855,
  'Allogrooming_MA': 79.955,
  'Allolicking_MA': 2.65,
  'Self-licking_MA': 118.855},
 'reliability4': {'Allogrooming_MW': 79.955,
  'Allolicking_MW': 2.65,
  'Self-licking_MW': 118.855,
  'Allogrooming_MA': 79.955,
  'Allolicking_MA': 2.65,
  'Self-licking_MA': 118.855}}

In [137]:
pd.DataFrame(all_vid_data).T

Unnamed: 0,Allogrooming_MW,Allolicking_MW,Self-licking_MW,Allogrooming_MA,Allolicking_MA,Self-licking_MA
reliability5,79.955,2.65,118.855,79.955,2.65,118.855
reliability4,79.955,2.65,118.855,79.955,2.65,118.855


In [67]:
values

0
Allogrooming     79.955
Allolicking       2.650
Self-licking    118.855
Name: 4, dtype: float64

In [63]:
#Create one .csv that contains total duration of each behavior for each video in the current directory
import os
import pandas as pd
import numpy as np

summary_df = None

def process_csv_files(directory):
    behavior_durations_per_video = {}

    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            df = pd.read_csv(filepath)

            # Dictionary to hold total duration for each behavior in the current file
            behavior_total_duration = {}

            # Assuming the columns are: [Behavior, Unknown, Start Time, End Time, Duration]
            for _, row in df.iterrows():
                behavior = row[0]
                duration = row[4]

                # Update the total duration for each behavior in the current video
                if behavior not in behavior_total_duration:
                    behavior_total_duration[behavior] = 0
                behavior_total_duration[behavior] += duration

            # Store the total durations for this video in the main dictionary
            # Use the filename (without extension) as the key
            behavior_durations_per_video[filename] = behavior_total_duration
#             print(behavior_durations_per_video)

    # Create a DataFrame to hold the summary of each video and behavior durations
    all_behaviors = set()
    for behavior_durations in behavior_durations_per_video.values():
        all_behaviors.update(behavior_durations.keys())
    
    all_behaviors = sorted(all_behaviors)  # Sort the behaviors for consistency

    # Create a list of dictionaries where each dictionary contains the video and its behavior durations
    summary_data = []
    for video, behavior_durations in behavior_durations_per_video.items():
        row_data = {'Video': video}
        for behavior in all_behaviors:
            row_data[behavior] = behavior_durations.get(behavior, 0)  # Fill with 0 if behavior not present
        summary_data.append(row_data)

    # Create the DataFrame from the summary data
    summary_df = pd.DataFrame(summary_data)
    return summary_df

    # Save the summary DataFrame to a CSV file
#     summary_csv = os.path.join(directory, 'video_behavior_durations.csv')
#     summary_df.to_csv(summary_csv, index=False)

    print(f"Video behavior durations saved to {summary_csv}")

# Get the current working directory
directory_path = os.getcwd()

process_csv_files('csv')


Unnamed: 0,Video,Allogroming,Allogrooming,Allolicking,Self licking,Self-Grooming,Self-licking,allo groom,allo lick,allo licking,allogroom,allogrooming,allolick,allolicking,self lick,self licking,self-licking,selflick,selflicking
0,reliability5_AN.csv,0.0,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.0,0.0,76.321,0.0,2.478,0.000,108.852,0.000,0.0,0.000
1,reliability2_MR.csv,0.0,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.0,0.0,5.620,0.0,6.430,0.000,134.912,0.000,0.0,0.000
2,reliability1_PRB.csv,0.0,24.368,139.178,0.0,0.0,7.140,0.000,0.000,0.0,0.0,0.000,0.0,0.000,0.000,0.000,0.000,0.0,0.000
3,reliability2_NK.csv,0.0,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.0,0.0,17.110,0.0,7.939,0.000,128.879,0.000,0.0,0.000
4,reliability5_AZ.csv,0.0,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.0,0.0,63.954,0.0,6.256,0.000,0.000,0.000,0.0,117.967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,reliability4_DS.csv,0.0,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.0,0.0,43.114,0.0,55.492,0.000,92.059,0.000,0.0,0.000
68,reliability5_MA.csv,0.0,0.000,0.000,0.0,0.0,0.000,87.067,0.177,0.0,0.0,0.000,0.0,0.000,120.994,0.000,0.000,0.0,0.000
69,reliability5_AB.csv,0.0,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.0,0.0,65.812,0.0,0.000,0.000,108.320,0.000,0.0,0.000
70,reliability2_BF.csv,0.0,0.000,0.000,0.0,0.0,0.000,0.000,0.000,0.0,0.0,11.816,0.0,3.149,0.000,0.000,110.834,0.0,0.000


NameError: name 'summary_df' is not defined