In [13]:
import pandas as pd
import numpy as np
import re
import os

from pathlib import Path
from sklearn.model_selection import train_test_split

from ba_dev.utils import load_config_yaml

# Load the project's path configuration; paths['dataset'] is used below as the dataset root.
paths = load_config_yaml('../path_config.yml')

In [2]:
# function to inquire the session number from a path
def extract_session(path: Path | str):
    path = str(path)
    match = re.search(r'_(\d{2})_', path)
    if match:
        return int(match.group(1))
    else:
        raise ValueError(f'Could not extract session number from {path}')

In [8]:
# handling paths
# every location below is expressed relative to the root of the dataset path
dataset_path = paths['dataset']

# sub-folders (relative to the dataset root) holding the edited and the archived label files
relative_label_edited, relative_label_archive = Path('info/labels'), Path('info/labels_archive')

# the same two folders joined onto the dataset root
label_edited, label_archive = (
    dataset_path / relative_label_edited,
    dataset_path / relative_label_archive,
)

In [9]:
# Making sure every path listed in the 'Directory' column of the label files exists

# sorted() makes the load order independent of the file system's listing order
file_paths = sorted(label_edited.glob('*.csv'))
if not file_paths:
    raise FileNotFoundError(f'No label files (*.csv) found in {label_edited}')

# read every label file first and concatenate once instead of growing the frame inside the loop
frames = []
for path in file_paths:
    print(path)
    frames.append(pd.read_csv(path))
ds = pd.concat(frames, ignore_index=True)

# Combine dataset_path with the 'Directory' column to form absolute paths;
# missing entries become None instead of raising a TypeError on `dataset_path / nan`
absolute_paths = ds['Directory'].apply(lambda x: dataset_path / x if pd.notna(x) else None)

# A row only counts as existing if it has a directory entry and that directory is present on disk
paths_exist = absolute_paths.apply(lambda p: isinstance(p, Path) and p.exists()).astype(bool)
non_existent_paths = ds[~paths_exist]

if not non_existent_paths.empty:
    print(f"Found {len(non_existent_paths)} rows with non-existent paths in 'Directory':")
    print(non_existent_paths)
else:
    print("All paths in the 'Directory' column exist relative to the dataset_path.")
    

C:\Users\kraft\BA\dataset\info\labels\session_01_labels.csv
C:\Users\kraft\BA\dataset\info\labels\session_02_labels.csv
C:\Users\kraft\BA\dataset\info\labels\session_03_labels.csv
C:\Users\kraft\BA\dataset\info\labels\session_04_labels.csv
C:\Users\kraft\BA\dataset\info\labels\session_05_labels.csv
C:\Users\kraft\BA\dataset\info\labels\session_06_labels.csv
C:\Users\kraft\BA\dataset\info\labels\session_07_labels.csv
All paths in the 'Directory' column exist relative to the dataset_path.


In [12]:
# transform all archived label files to the standardized format and save them in the edited-labels folder
# (existing session_XX_labels.csv files in label_edited are overwritten)
file_paths = sorted(label_archive.glob('*.csv'))

# column order of the standardized label files
data_keys = ["session", "SerialNumber", "seq_nr", "seq_id", "Directory", "DateTime_start", "DateTime_end",
             "duration_seconds", "first_file", "last_file", "n_files", "all_files",
             "label", "duplicate_label", "label2"]

for path in file_paths:
    session = extract_session(path)
    # zero-pad to two digits so sessions >= 10 yield 'session_10', not 'session_010'
    session_tag = f'session_{session:02d}'
    data = pd.read_csv(path)

    # files without a duplicate flag get an all-missing column so every output shares one schema
    if 'duplicate_label' not in data.columns:
        data['duplicate_label'] = np.nan

    # session 7 is special-cased: its 'Sequence_number' column is dropped, 'label2' is derived
    # from 'label' (myodes_glareolus -> cricetidae) and the timestamps are set to missing
    # NOTE(review): presumably the session 7 archive uses a different schema - confirm
    if session == 7:
        data = data.drop(columns=['Sequence_number'])
        data['label2'] = np.where(data['label'] == 'myodes_glareolus', 'cricetidae', data['label'])
        data['DateTime_start'] = np.nan
        data['DateTime_end'] = np.nan

    # make 'Directory' relative to the dataset root; missing entries are left untouched
    data['Directory'] = data['Directory'].apply(lambda x: f"sessions/{session_tag}/{x}" if pd.notna(x) else x)
    data['session'] = session

    # sequence id = session number * 1_000_000 + row index of the sequence within its file
    data['seq_id'] = (data['session'] * 1_000_000 + data.index).astype(int)
    data = data[[col for col in data_keys if col in data.columns]]

    data.to_csv(label_edited / f'{session_tag}_labels.csv', index=False, encoding='utf-8')
    

In [16]:
# creating an overview of which columns contain data for each session

file_paths = sorted(label_edited.glob('*.csv'))

# one record per label file: 'yes' if the column holds at least one value, 'no' if it is entirely missing
records = []
for file in file_paths:
    session = extract_session(file)
    data = pd.read_csv(file)

    keys_dict = {key: ('no' if data[key].isnull().all() else 'yes') for key in data.keys()}
    # the session column shows the session number (taken from the file name) instead of yes/no
    if 'session' in keys_dict:
        keys_dict['session'] = session
    records.append(keys_dict)

# build the frame once instead of concatenating row by row;
# standardized columns come first, any unexpected extra columns are kept at the end
available_keys = pd.DataFrame(records)
extra_keys = [key for key in available_keys.columns if key not in data_keys]
available_keys = available_keys.reindex(columns=data_keys + extra_keys)

# Create markdown table
markdown_table = available_keys.sort_values(by="session", ascending=True).to_markdown(index=False)
print(markdown_table)

|   session | SerialNumber   | seq_nr   | seq_id   | Directory   | DateTime_start   | DateTime_end   | duration_seconds   | first_file   | last_file   | n_files   | all_files   | label   | duplicate_label   | label2   |
|----------:|:---------------|:---------|:---------|:------------|:-----------------|:---------------|:-------------------|:-------------|:------------|:----------|:------------|:--------|:------------------|:---------|
|         1 | yes            | yes      | yes      | yes         | yes              | yes            | yes                | yes          | yes         | yes       | yes         | yes     | yes               | yes      |
|         2 | yes            | yes      | yes      | yes         | yes              | yes            | yes                | yes          | yes         | yes       | yes         | yes     | no                | yes      |
|         3 | yes            | yes      | yes      | yes         | yes              | yes            | yes              

In [14]:
# folder inside the edited-labels directory for files that span all sessions
path = label_edited.joinpath('all/')
path.mkdir(exist_ok=True, parents=True)

In [15]:
# creating a combined label file for all sessions, splitting it into train and test set per sequence
# and exploding all_files so that the result has one row per file

# sorted() fixes the row order fed into train_test_split, so the seeded split
# does not depend on the file system's listing order
file_paths = sorted(label_edited.glob('*.csv'))
if not file_paths:
    raise FileNotFoundError(f'No label files (*.csv) found in {label_edited}')

# read all session files and concatenate once
combined_data = pd.concat([pd.read_csv(file) for file in file_paths])

# the split happens on sequence level (one row per sequence) before the files are exploded,
# so all files of a sequence end up in the same set
train, test = train_test_split(combined_data, test_size=0.1, random_state=42)

# assign() returns new frames, so nothing is written into the frames returned by train_test_split
train = train.assign(split='train')
test = test.assign(split='test')

df_split = pd.concat([train, test]).sort_values(by='seq_id', ascending=True)

# 'all_files' is a comma-separated string; turn it into a list and emit one row per file
df_split['all_files'] = df_split['all_files'].str.split(',')
df_explode = df_split.explode("all_files", ignore_index=True)
df_explode['file'] = df_explode['all_files']

# 1.0 / 0.0 flags become True / False; any other value (including missing) becomes NaN
df_explode['duplicate_label'] = df_explode['duplicate_label'].map({1.0: True, 0.0: False})

col_selection = ['session', 'seq_id', 'Directory', 'file', 'n_files', 'label', 'duplicate_label', 'label2',
                 'split', 'SerialNumber', 'DateTime_start', 'DateTime_end', 'duration_seconds']

df_final = df_explode[col_selection]

# create the target folder here as well so the cell does not depend on another cell having made it
output_file = label_edited / 'all/all_sessions_labels.csv'
output_file.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(output_file, index=False)

In [16]:
# Read the combined label file back in. 'duplicate_label' is forced to str
# (NOTE(review): presumably to avoid mixed-type inference on True/False/missing - confirm).
# The column is addressed by name instead of by position 6, so the dtype stays attached
# to it even if the column order of the file changes.
data = pd.read_csv(label_edited / 'all/all_sessions_labels.csv', dtype={'duplicate_label': str})


In [17]:
def check_file_exists(row):
    """Report whether the file referenced by a label row exists on disk.

    The file location is ``dataset_path / row['Directory'] / row['file']``.
    A message is printed for every row whose file cannot be found.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) with the keys
            ``'Directory'`` and ``'file'``.

    Returns:
        ``True`` if the file exists, ``False`` if it does not or if either
        path component is missing.
    """
    directory, file_name = row['Directory'], row['file']

    # a missing component cannot be turned into a path; report it instead of raising in Path(nan)
    if pd.isna(directory) or pd.isna(file_name):
        print(f"Missing path information: Directory={directory!r}, file={file_name!r}")
        return False

    path = dataset_path / Path(directory) / Path(file_name)

    # query the file system once so the printed message and the returned value always agree
    exists = path.exists()
    if not exists:
        print(f"File does not exist: {path}")
    return exists

In [18]:
# Run the existence check once per row (axis=1) and store the result as a new
# 'file_exists' column; this modifies `data` in place.
data['file_exists'] = data.apply(check_file_exists, axis=1)

In [19]:
# rows whose file is missing, reduced to the first such row of every sequence
data.loc[data['file_exists'].eq(False)].drop_duplicates(subset=['seq_id'], keep='first')

Unnamed: 0,session,seq_id,Directory,file,n_files,label,duplicate_label,label2,split,SerialNumber,DateTime_start,DateTime_end,duration_seconds,file_exists


In [20]:
# distinct values that occur in the duplicate flag after reading the file back
pd.unique(data['duplicate_label'])

array(['False', 'True', nan], dtype=object)