# Rearranging the Stanford dataset to train and test splits

We will use two datasets optimized for the [torchvision.datasets.ImageFolder](https://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html)

First dataset will have splits according to the listings from `ImageSplits` folder.

Second dataset will have classic random splits 80:20.

In [6]:
import os
import shutil
# Define the paths
source_folder = 'Stanford40/JPEGImages'
splits_folder = 'Stanford40/ImageSplits'

## Given splits

In [1]:


## The process to Generate the first dataset according to listings in the `ImageSplits` folder is as follows:

# create the folder `data/Stanford40/given_splits`

# open folder Stanford40/ImageSplits

# enumerate every .txt file in the folder
# skip `actions.txt`

# get the suffix of the file name - `_train` or `_test`
# remainder of the file name is the class name

# create the folder `data/Stanford40/given_splits/<suffix without underscore>/<class_name>`

# read the text file line by line
# each line is the name of an image file
# copy that image file from the Stanford40/JPEGImages to the folder `data/Stanford40/given_splits/<suffix without underscore>/<class_name>/`


destination_folder = 'data/Stanford40/given_splits'

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Enumerate every .txt file in the splits folder
for split_file in os.listdir(splits_folder):
    if split_file.endswith('.txt') and split_file != 'actions.txt' and split_file != 'train.txt' and split_file != 'test.txt':
        # Get the suffix and class name
        suffix = split_file.split('_')[-1].replace('.txt', '')
        class_name = '_'.join(split_file.split('_')[:-1])
        
        # Create the class folder in the destination
        class_folder = os.path.join(destination_folder, suffix, class_name)
        os.makedirs(class_folder, exist_ok=True)
        
        # Read the text file line by line
        with open(os.path.join(splits_folder, split_file), 'r') as file:
            for line in file:
                image_file = line.strip()
                # Copy the image file to the destination folder
                shutil.copy(os.path.join(source_folder, image_file), class_folder)




## (OPTIONAL) Given ungrouped splits

Stanford dataset contains the "train.txt" and "test.txt" files with filelists.
Processing them creates the same split as processing the individual "\*_train.txt" and "\*_test.txt" files for each class so processing them is not necessary (or processing per-class files is unnecessary)

In [7]:
# ensure folder `data/Stanford40/given_ungrouped_splits` is created
# ensure subfolders `train` and `test` are created in `data/Stanford40/given_ungrouped_splits`

# open file `Stanford40/ImageSplits/train.txt`
# read the file line by line
# each line is the name of an image file
# name is the class name followed by the underscore and the image number
# ensure that the class name folder is created in `data/Stanford40/given_ungrouped_splits/train/`
# copy the image file from `Stanford40/JPEGImages` to the class name folder in `data/Stanford40/given_ungrouped_splits/train/`

# repeat the above with the `test.txt` file and the `test` folder
ungrouped_splits_folder = 'data/Stanford40/given_ungrouped_splits'
os.makedirs(ungrouped_splits_folder, exist_ok=True)

train_folder = os.path.join(ungrouped_splits_folder, 'train')
test_folder = os.path.join(ungrouped_splits_folder, 'test')
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Process train.txt
with open(os.path.join(splits_folder, 'train.txt'), 'r') as file:
    for line in file:
        image_file = line.strip()
        class_name = image_file.rsplit('_', 1)[0]
        class_folder = os.path.join(train_folder, class_name)
        os.makedirs(class_folder, exist_ok=True)
        shutil.copy(os.path.join(source_folder, image_file), class_folder)

# Process test.txt
with open(os.path.join(splits_folder, 'test.txt'), 'r') as file:
    for line in file:
        image_file = line.strip()
        class_name = image_file.rsplit('_', 1)[0]
        class_folder = os.path.join(test_folder, class_name)
        os.makedirs(class_folder, exist_ok=True)
        shutil.copy(os.path.join(source_folder, image_file), class_folder)



## Create random splits

Go over the source folder and copy the images from it to the random splits folder 80:20 train:test, with classes as subfolders.

In [2]:
import os
import random
import shutil

# ensure that the directory data/Stanford40/random_splits exists

# ensure two folders inside: `train` and `test`

# enumerate every image in the folder Stanford40/JPEGImages

# for each image, split its filename into two parts: the class name and the numeric index at the end of the filename.
# ensure that the folder with the class name exists in the `train` folder and the `test` folder

# with 80% probability copy the image to the `train` folder under the corresponding class folder
# with 20% probability copy the image to the `test` folder under the corresponding class folder
random_splits_folder = 'data/Stanford40/random_splits'
os.makedirs(random_splits_folder, exist_ok=True)

train_folder = os.path.join(random_splits_folder, 'train')
test_folder = os.path.join(random_splits_folder, 'test')
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

for image_file in os.listdir(source_folder):
    if not image_file.lower().endswith('.jpg'):
        continue
    name_part = os.path.splitext(image_file)[0]
    sep_pos = name_part.rfind('_')
    class_name = name_part[:sep_pos]

    train_class = os.path.join(train_folder, class_name)
    test_class = os.path.join(test_folder, class_name)
    os.makedirs(train_class, exist_ok=True)
    os.makedirs(test_class, exist_ok=True)

    if random.random() < 0.8:
        shutil.copy(os.path.join(source_folder, image_file), train_class)
    else:
        shutil.copy(os.path.join(source_folder, image_file), test_class)



## Count images in folders

Go over the created splits folders and collect the numbers of images per train/test split and class.
Creates the `report.csv` file in this folder.

In [9]:
import os
import pandas as pd

# Function to count images in a folder
def count_images(folder):
    return len([name for name in os.listdir(folder) if os.path.isfile(os.path.join(folder, name))])

# Initialize a list to store the report data
report_data = []

destination_folder = 'data/Stanford40/given_splits'

# Process given_splits
for split in ['train', 'test']:
    split_folder = os.path.join(destination_folder, split)
    for class_name in os.listdir(split_folder):
        class_folder = os.path.join(split_folder, class_name)
        num_images = count_images(class_folder)
        report_data.append({
            'Dataset': 'given_splits',
            'Split': split,
            'Class': class_name,
            'Num Images': num_images
        })

random_splits_folder = 'data/Stanford40/random_splits'

# Process random_splits
for split in ['train', 'test']:
    split_folder = os.path.join(random_splits_folder, split)
    for class_name in os.listdir(split_folder):
        class_folder = os.path.join(split_folder, class_name)
        num_images = count_images(class_folder)
        report_data.append({
            'Dataset': 'random_splits',
            'Split': split,
            'Class': class_name,
            'Num Images': num_images
        })

# Create a DataFrame from the report data
report_df = pd.DataFrame(report_data)

# Pivot the table
pivot_df = report_df.pivot_table(index='Class', columns=['Dataset', 'Split'], values='Num Images', fill_value=0)

# Flatten the column hierarchy
pivot_df.columns = [f'{dataset} - {split} counts' for dataset, split in pivot_df.columns]

# Reset the index to make 'Class' a column
pivot_df.reset_index(inplace=True)

# Save the pivot table to a CSV file
pivot_df.to_csv('report.csv', index=False)

# Display the pivot table
print(pivot_df)

                           Class  given_splits - test counts  \
0                     applauding                       184.0   
1                blowing_bubbles                       159.0   
2                 brushing_teeth                       100.0   
3             cleaning_the_floor                       112.0   
4                       climbing                       195.0   
5                        cooking                       188.0   
6                  cutting_trees                       103.0   
7             cutting_vegetables                        89.0   
8                       drinking                       156.0   
9                feeding_a_horse                       187.0   
10                       fishing                       173.0   
11                 fixing_a_bike                       128.0   
12                  fixing_a_car                       151.0   
13                     gardening                        99.0   
14           holding_an_umbrella        