# Utility scripts to organize data files

In [None]:
import os
import glob
import pandas as pd

In [None]:
# Add a filename column to the labels of v3, v4 and v5.
# For each version, labels.txt is read, a "filename" column is derived from the
# row index ("<index>.png"), and the result is written next to the input as
# labelsnew.txt with the columns filename | tooth | u.

versions = range(3,6)

file_count = 0  # NOTE(review): never updated or read in this cell - candidate for removal
for v in versions:
    labels = pd.read_csv(f'/home/riikoro/fossil_data/tooth_samples/v{v}/labels.txt')

    # Build the filename from the row index, then keep exactly the three
    # output columns in their final order (a column missing from the input
    # is created empty by reindex).
    labels = (
        labels
        .assign(filename=labels.index.astype('str') + '.png')
        .reindex(columns=['filename', 'tooth', 'u'])
    )

    # Nullable integer dtype: keeps u integral while still allowing missing values.
    labels = labels.assign(u=labels['u'].astype('Int64'))

    print(labels.head())
    labels.to_csv(f'/home/riikoro/fossil_data/tooth_samples/v{v}/labelsnew.txt', index=False)

In [57]:
# Create labels.txt for v8-v21 with the columns filename | tooth | u, so that
# these versions have labels in the same layout as v3 and v4.
versions = range(8,22)
base_path = '/home/riikoro/fossil_data/tooth_samples/'

for v in versions:
    # Both inputs and the output live in the same per-version directory.
    # Deriving every path from base_path keeps reads and writes in the same
    # tree, and os.path.join avoids the doubled '/' that string concatenation
    # produced with the trailing slash of base_path.
    version_dir = os.path.join(base_path, f'v{v}')

    # Upper/lower annotations: the file's first row is treated as a header and
    # replaced by the column names filename / u.
    df = pd.read_csv(os.path.join(version_dir, 'upper_lower_labels.txt'), names=['filename', 'u'], header=0)

    # Azure labels: a single headerless column; the row position doubles as the
    # image name ("<row>.png").
    # NOTE(review): read_csv skips blank lines by default, which would shift the
    # row -> filename mapping if azure_labels.txt can contain empty lines - confirm.
    azure_labels = pd.read_csv(os.path.join(version_dir, 'azure_labels.txt'), header=None, names=['tooth'])
    azure_labels['filename'] = azure_labels.index.astype('str') + '.png'

    # azure_labels holds a label for every image, including deleted images that
    # are not tooth markings. The left join keeps only the files listed in df;
    # files without an azure label get a missing tooth value.
    df = df.merge(azure_labels, on='filename', how='left')
    df['u'] = df['u'].astype('Int64')  # nullable integer dtype, allows missing values

    # Final column order.
    df = df.reindex(columns=['filename', 'tooth', 'u'])

    df.to_csv(os.path.join(version_dir, 'labels.txt'), index=False)


In [110]:
# Settings for generating labels_full.csv for every dataset version:
# the root directory of the tooth samples and the version numbers to process
# (v3-v5 and v8-v21).
base_path = '/home/riikoro/fossil_data/tooth_samples/'
versions = [*range(3, 6), *range(8, 22)]

# Blank out tooth indices that are not valid for the given tooth type
def fix_tooth_labels(tooth, index):
    """Return ``index`` unchanged if it is valid for ``tooth``, else ``None``.

    An index is rejected when
      * the tooth type is 'C',
      * the index is not one of the strings '1', '2', '3', '4', or
      * the tooth type is 'M' or 'I' and the index is not '1', '2' or '3'.

    Any other tooth value (including ``None``) accepts the indices '1'-'4'.
    """
    # 'C' never carries an index.
    if tooth == 'C':
        return None

    # Anything that is not one of the four digit strings is not an index.
    if index not in ('1', '2', '3', '4'):
        return None

    # 'M' and 'I' are limited to 1-3.
    if tooth in ('M', 'I') and index not in ('1', '2', '3'):
        return None

    return index

def _to_mpi_letter(first_char):
    """Return the upper-cased tooth-type letter (M, P, I or C), else None.

    Non-string input - e.g. the NaN that ``.str[0]`` yields for a missing
    azure output - maps to None instead of raising AttributeError on
    ``.upper()``.
    """
    if not isinstance(first_char, str):
        return None
    letter = first_char.upper()
    return letter if letter in ('M', 'P', 'I', 'C') else None


for v in versions:
    # os.path.join gives clean paths whether or not base_path ends with '/'.
    version_dir = os.path.join(base_path, f'v{v}')

    # Read labels.txt; its header row is replaced by the names used here.
    df = pd.read_csv(os.path.join(version_dir, 'labels.txt'), header=0, names=['filename', 'azure_output', 'u'])
    df['u'] = df['u'].astype('Int64')  # nullable integer dtype, allows missing values

    # MPI column: first character of the azure output, upper-cased, when it is
    # one of M, P, I, C; None otherwise (including rows without azure output).
    df['MPI'] = df['azure_output'].str[0].apply(_to_mpi_letter)

    # Index column: the LAST character is used so that single-character
    # outputs such as "C" do not need special handling; fix_tooth_labels then
    # discards every value that is not a valid index for the tooth type.
    df['tooth_index'] = df['azure_output'].str[-1]
    df['tooth_index'] = df.apply(lambda x: fix_tooth_labels(x.MPI, x.tooth_index), axis=1).astype('Int64')

    # Note: 'C' stays in the MPI column; only its tooth_index is blanked
    # (by fix_tooth_labels).
    df.to_csv(os.path.join(version_dir, 'labels_full.csv'), index=False)