In [1]:
import pandas as pd
import numpy as np
import re

from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# path setup
# every location below is expressed relative to the root of the dataset
dataset_path = Path('/cfs/earth/scratch/kraftjul/BA/dataset')

# raw label exports (read by this notebook) and standardized labels (written by it)
relative_label_path = Path('archive/labels_backup/all_sessions')
relative_labels_save_path = Path('info/labels')

# absolute locations derived from the dataset root
label_path, labels_save_path = (
    dataset_path.joinpath(relative)
    for relative in (relative_label_path, relative_labels_save_path)
)

In [3]:
# function to extract the session number from a path
def extract_session(path: Path) -> int:
    """Return the two-digit session number embedded in a path as ``_NN_``.

    The file name is searched first so that an unrelated ``_NN_`` pattern in a
    parent directory cannot shadow the session number of the file itself. If
    the file name contains no match, the full path is searched as a fallback.

    Args:
        path: Path (or string) of a label file, e.g. ``session_03_labels.csv``.

    Returns:
        The session number as an integer.

    Raises:
        ValueError: If neither the file name nor the full path contains ``_NN_``.
    """
    path = str(path)
    # file name first, whole path second (keeps paths working whose number sits in a folder name)
    for candidate in (Path(path).name, path):
        match = re.search(r'_(\d{2})_', candidate)
        if match:
            return int(match.group(1))
    raise ValueError(f'Could not extract session number from {path}')

In [4]:
# transform all labels to a standardized format and save them to labels_save_path
# (sorted: iterdir() yields files in arbitrary order)
file_paths = sorted(f for f in label_path.iterdir() if f.is_file())

# columns of the standardized label files, in output order
data_keys = ["session", "SerialNumber", "seq_nr", "seq_id", "Directory", "DateTime_start", "DateTime_end",
                "duration_seconds", "first_file", "last_file", "n_files", "all_files",
                "label", "duplicate_label", "label2"]

# make sure the output folder exists before anything is written into it
labels_save_path.mkdir(parents=True, exist_ok=True)

for path in file_paths:
    session = extract_session(path)
    # zero-pad to two digits: a hard-coded "0" prefix would turn session 10 into "session_010",
    # which extract_session (two-digit pattern) cannot parse back
    session_tag = f"session_{session:02d}"
    data = pd.read_csv(path)

    # not every session has a duplicate_label column; add it empty so all files share one schema
    if 'duplicate_label' not in data.columns:
        data['duplicate_label'] = np.nan

    # session 7 has a different layout: extra Sequence_number column, no label2, no timestamps
    if session == 7:
        data = data.drop(columns=['Sequence_number'])
        data['label2'] = np.where(data['label'] == 'myodes_glareolus', 'cricetidae', data['label'])
        data['DateTime_start'] = np.nan
        data['DateTime_end'] = np.nan

    # make Directory relative to the dataset root; missing values stay missing
    data['Directory'] = data['Directory'].apply(lambda x: f"sessions/{session_tag}/{x}" if pd.notna(x) else x)
    data['session'] = session

    # sequence id = session number * 1_000_000 + row index of the sequence within its file
    data['seq_id'] = (data['session'] * 1_000_000 + data.index).astype(int)
    # keep only the standardized columns, in the standardized order
    data = data[[col for col in data_keys if col in data.columns]]

    data.to_csv(labels_save_path / f'{session_tag}_labels.csv', index=False)
    

In [5]:
# creating an overview of which columns actually contain data per session

# only the per-session files; the combined "all_sessions_labels.csv" is skipped
file_paths = sorted(
    f for f in labels_save_path.iterdir()
    if f.is_file() and not f.name.startswith("all")
)

# collect one record per session and build the frame once at the end;
# concatenating onto a growing (initially empty) DataFrame inside the loop is quadratic
records = []
for file in file_paths:
    session = extract_session(file)
    data = pd.read_csv(file)

    keys_dict = {}
    for key in data.columns:
        if key == 'session':
            keys_dict[key] = session
        else:
            # a column counts as available as soon as it holds at least one non-null value
            keys_dict[key] = 'no' if data[key].isnull().all() else 'yes'
    records.append(keys_dict)

available_keys = pd.DataFrame(records)
# standardized columns first (in data_keys order), any unexpected extra columns afterwards
ordered_columns = data_keys + [col for col in available_keys.columns if col not in data_keys]
available_keys = available_keys.reindex(columns=ordered_columns)

# Create markdown table
markdown_table = available_keys.sort_values(by="session", ascending=True).to_markdown(index=False)
print(markdown_table)

|   session | SerialNumber   | seq_nr   | seq_id   | Directory   | DateTime_start   | DateTime_end   | duration_seconds   | first_file   | last_file   | n_files   | all_files   | label   | duplicate_label   | label2   |
|----------:|:---------------|:---------|:---------|:------------|:-----------------|:---------------|:-------------------|:-------------|:------------|:----------|:------------|:--------|:------------------|:---------|
|         1 | yes            | yes      | yes      | yes         | yes              | yes            | yes                | yes          | yes         | yes       | yes         | yes     | yes               | yes      |
|         2 | yes            | yes      | yes      | yes         | yes              | yes            | yes                | yes          | yes         | yes       | yes         | yes     | no                | yes      |
|         3 | yes            | yes      | yes      | yes         | yes              | yes            | yes              

In [None]:
# creating a combined label file for all sessions, splitting it into train and test set per sequence
# and exploding all_files so that the final table has one row per file

# skip the combined output ("all_sessions_labels.csv"): without this filter a re-run of this cell
# would read its own previous result back in as if it were a session file.
# sort the paths: iterdir() order is arbitrary and the seeded split below depends on row order.
file_paths = sorted(
    f for f in labels_save_path.iterdir()
    if f.is_file() and not f.name.startswith("all")
)

# read everything first and concatenate once instead of growing a frame inside a loop
combined_data = pd.concat([pd.read_csv(file) for file in file_paths], ignore_index=True)

# each row is one sequence here, so all files of a sequence end up in the same split
train, test = train_test_split(combined_data, test_size=0.1, random_state=42)

# assign() returns new frames instead of writing into the frames returned by the split
train = train.assign(split='train')
test = test.assign(split='test')

df_split = pd.concat([train, test]).sort_values(by='seq_id', ascending=True)

# all_files holds a comma-separated list of file names; turn it into one row per file
df_split['all_files'] = df_split['all_files'].str.split(',')

df_explode = df_split.explode("all_files", ignore_index=True)

df_explode['file'] = df_explode['all_files']

# 1/0 flags become booleans; anything else (e.g. missing) becomes NaN
df_explode['duplicate_label'] = df_explode['duplicate_label'].map({1.0: True, 0.0: False})

col_selection = ['session', 'seq_id', 'Directory', 'file', 'n_files', 'label', 'duplicate_label', 'label2','split', 'SerialNumber', 'DateTime_start', 'DateTime_end', 'duration_seconds']

df_final = df_explode[col_selection]

df_final.to_csv(labels_save_path / 'all_sessions_labels.csv', index=False)

In [9]:
# read duplicate_label as string: the column mixes True/False with empty cells.
# It is addressed by name rather than by position (it is column 6 of the file) so that the
# dtype override keeps hitting the right column if the column order ever changes.
data = pd.read_csv(labels_save_path / 'all_sessions_labels.csv', dtype={'duplicate_label': str})


In [10]:
def check_file_exists(row) -> bool:
    """Tell whether the file referenced by one label row exists on disk.

    Args:
        row: Mapping-like row with a 'Directory' entry (relative to the dataset
            root) and a 'file' entry (file name inside that directory).

    Returns:
        True if ``dataset_path / Directory / file`` exists, otherwise False.
        Rows with a missing 'Directory' or 'file' value are reported as not existing.
    """
    directory, file_name = row['Directory'], row['file']
    # missing values arrive as NaN/None; Path() cannot be built from them, so such rows
    # are reported as not existing instead of aborting the whole apply() with a TypeError
    if pd.isna(directory) or pd.isna(file_name):
        return False
    path = dataset_path / Path(directory) / Path(file_name)
    return path.exists()

In [11]:
# check row by row whether the referenced file is present on disk
file_exists_flags = data.apply(check_file_exists, axis=1)
data['file_exists'] = file_exists_flags

In [12]:
# rows whose file is missing, reduced to the first such row per sequence
missing_mask = ~data['file_exists']
data.loc[missing_mask].drop_duplicates(subset=['seq_id'], keep='first')

Unnamed: 0,session,seq_id,Directory,file,n_files,label,duplicate_label,label2,split,SerialNumber,DateTime_start,DateTime_end,duration_seconds,file_exists


In [None]:
# distinct duplicate_label values present in the combined label file
duplicate_label_values = data['duplicate_label'].unique()
duplicate_label_values