# Tunisian Horses - Only stills
* Do not use the cropped images, because they were most probably extracted from the stills.

## Create metadata

In [None]:
import shutil
import os
import pandas as pd

# 2. Prepare data for model training:
# Base path to the database
base_dir = '../data/THoDBRL2015'

# Consolidate images into a training directory
output_train_dir = os.path.join(base_dir, 'training_data')

# Dataset parts to scan and the (lower-case) extensions treated as images.
DATASET_PARTS = ('Part1', 'Part2', 'Part3', 'Part4', 'Part5')
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def collect_stills_metadata(base_dir, output_train_dir, parts=DATASET_PARTS, *, copy_images=False):
    """Collect one metadata record per still image found under ``base_dir``.

    The scanned layout is ``<base_dir>/<part>/videos/<horse_id>/images*/<file>``.
    Only sub-folders whose name starts with ``images`` are read; any other
    folder inside a horse directory is ignored.

    Args:
        base_dir: Root directory of the dataset.
        output_train_dir: Directory in which one ``horse_<horse_id>`` folder
            is created for every horse that has a stills folder.
        parts: Names of the part folders to scan, in order.
        copy_images: When true, every collected image is also copied into
            its ``horse_<horse_id>`` folder. Files with the same name coming
            from different stills folders overwrite each other. Defaults to
            false, in which case the folders are created but left empty.

    Returns:
        A list of ``{'horse_id': <folder name>, 'image_path': <path>}`` dicts.
        Directory listings are sorted so the order is reproducible.
    """
    records = []

    for part in parts:
        videos_dir = os.path.join(base_dir, part, 'videos')

        # A part that was not downloaded should not abort the whole scan,
        # but it must not go unnoticed either.
        if not os.path.isdir(videos_dir):
            print(f"Skipping missing directory: {videos_dir}")
            continue

        for horse_id_folder in sorted(os.listdir(videos_dir)):
            horse_path = os.path.join(videos_dir, horse_id_folder)
            if not os.path.isdir(horse_path):
                continue

            # Find all stills folders (e.g., images, images1, images2, etc.)
            for stills_folder in sorted(os.listdir(horse_path)):
                stills_path = os.path.join(horse_path, stills_folder)
                if not (stills_folder.startswith('images') and os.path.isdir(stills_path)):
                    continue

                target_dir = os.path.join(output_train_dir, f'horse_{horse_id_folder}')
                os.makedirs(target_dir, exist_ok=True)

                for img_file in sorted(os.listdir(stills_path)):
                    # Compare in lower case so '.JPG' / '.PNG' files are kept too.
                    if not img_file.lower().endswith(IMAGE_EXTENSIONS):
                        continue

                    img_path = os.path.join(stills_path, img_file)
                    records.append({
                        'horse_id': horse_id_folder,
                        'image_path': img_path,
                    })

                    if copy_images:
                        shutil.copy(img_path, target_dir)

    return records


def build_metadata_frame(records):
    """Turn the collected records into a DataFrame with fixed dtypes.

    Args:
        records: List of dicts with the keys ``horse_id`` and ``image_path``.

    Returns:
        A DataFrame with an ``int64`` ``horse_id`` column and a ``string``
        ``image_path`` column. The columns are named explicitly so an empty
        ``records`` list yields an empty, correctly typed frame instead of
        failing in ``astype``.

    Raises:
        ValueError: If a ``horse_id`` value cannot be converted to an integer.
    """
    frame = pd.DataFrame(records, columns=['horse_id', 'image_path'], dtype='object')
    return frame.astype({
        'horse_id': 'int64',          # Folder names are expected to be integers
        'image_path': 'string',
    })


os.makedirs(output_train_dir, exist_ok=True)

metadata = collect_stills_metadata(base_dir, output_train_dir)

# Create DataFrame with specified dtypes
metadata_df = build_metadata_frame(metadata)




## Descriptive Analysis

In [None]:
# Headline counts: number of collected stills and number of distinct horses.
image_count = len(metadata_df)
horse_count = metadata_df['horse_id'].nunique()

print(f"Total images: {image_count}")
print(f"Total horses: {horse_count}")


In [None]:
# Preview the first five metadata rows.
print(metadata_df.head(5))

In [None]:
# Group the stills by horse; the grouping is reused by the following cell.
group_keys = ['horse_id']
grouped_horse = metadata_df.groupby(group_keys, observed=True)

# Per-horse summary of the remaining column(s); displayed as the cell output.
grouped_horse.describe()

In [None]:
# Smallest, largest and average number of images per horse id
images_per_horse = grouped_horse.size()
images_per_horse.agg(['min', 'max', 'mean'])

In [None]:
# Working copy of the image metadata for the following section.
# NOTE(review): the original read `pd.DataFrame(df)`, but `df` is not defined
# in any earlier cell, so running the notebook top to bottom raised NameError.
# Building it from `metadata_df` is presumably the intent - confirm.
df = metadata_df.copy()

## Create CNN