In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
import json

# Creating jsons for CLIP

train_dir = '/content/drive/MyDrive/Applied CV Project/freiburg_groceries_scenes/images/train'
test_dir = '/content/drive/MyDrive/Applied CV Project/freiburg_groceries_scenes/images/test'
labels_path = '/content/drive/MyDrive/Applied CV Project/freiburg_groceries_scenes/labels.txt'

labels_dict = {}
with open(labels_path, 'r') as file:
  for line in file:
    parts = line.strip().split(';')
    if len(parts) == 2:
      image_id, labels = parts
      labels_dict[image_id] = labels.split(',') # split labels by ',' to create list

labels_dict

{'scene_000_0': [' CEREAL',
  'HONEY',
  'JAM',
  'COFFEE',
  'TEA',
  'OIL',
  'VINEGAR',
  'SUGAR',
  'SPICES',
  'RICE',
  'FISH',
  'CORN',
  'BEANS',
  'PASTA',
  'TOMATO_SAUCE'],
 'scene_000_1': [' CEREAL',
  'HONEY',
  'JAM',
  'COFFEE',
  'TEA',
  'OIL',
  'VINEGAR',
  'SUGAR',
  'SPICES',
  'RICE',
  'FISH',
  'CORN',
  'BEANS',
  'PASTA',
  'TOMATO_SAUCE'],
 'scene_001_0': [' CEREAL',
  'HONEY',
  'JAM',
  'MILK',
  'COFFEE',
  'TEA',
  'OIL',
  'VINEGAR',
  'SUGAR',
  'SPICES',
  'RICE',
  'FISH',
  'CORN',
  'BEANS',
  'PASTA',
  'TOMATO_SAUCE'],
 'scene_001_1': [' CEREAL',
  'HONEY',
  'JAM',
  'MILK',
  'COFFEE',
  'TEA',
  'OIL',
  'VINEGAR',
  'SUGAR',
  'SPICES',
  'RICE',
  'FISH',
  'CORN',
  'BEANS',
  'PASTA',
  'TOMATO_SAUCE'],
 'scene_002_0': [' TEA',
  'CEREAL',
  'HONEY',
  'COFFEE',
  'MILK',
  'PASTA',
  'RICE',
  'SPICES',
  'TOMATO_SAUCE',
  'FOSH',
  'BEANS',
  'CORN',
  'OIL',
  'VINEGAR',
  'SUGAR'],
 'scene_002_1': [' TEA',
  'CEREAL',
  'HONEY',
  'COF

In [None]:
def create_json_for_dataset(dataset_dir, dataset_type):
  dataset = []

  base_dir = "/content/drive/MyDrive/Applied CV Project/freiburg_groceries_scenes"

  for file_name in os.listdir(dataset_dir):

    if not file_name.endswith(".png"): # filter out non-png images
      continue

    image_id = os.path.splitext(file_name)[0]
    if image_id in labels_dict:
      data_entry = {
          "image_id" : image_id,
          "file_path": os.path.join('./',os.path.relpath(dataset_dir, base_dir),file_name),
          "labels" : [label.replace("_", " ").lower() for label in labels_dict[image_id]]
      }
      dataset.append(data_entry)

  # Write out json
  json_file_path = os.path.join(base_dir,  f'{dataset_type}_data.json')
  with open(json_file_path, 'w') as json_file:
    json.dump(dataset, json_file)

  print(f"Created {json_file_path}")


In [None]:
create_json_for_dataset(train_dir,"train")
create_json_for_dataset(test_dir,"test")

Created /content/drive/MyDrive/Applied CV Project/freiburg_groceries_scenes/train_data.json
Created /content/drive/MyDrive/Applied CV Project/freiburg_groceries_scenes/test_data.json


In [None]:
# CLIP_training_instructions
instructions = "All image paths are relative to the root directory, i.e., freiburg_groceries_scenes"
file_path = "/content/drive/MyDrive/Applied CV Project/freiburg_groceries_scenes/instructions.txt"

with open(file_path, 'w') as f:
  f.write(instructions)

JSON creation for freiburg_groceries_dataset

In [None]:
def create_json_for_dataset_freiburg_groceries(dataset_dir, dataset_type):
  dataset = []

  base_dir = "/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset"

  # iterate through all the individual folders and find the dataset_type
  categories = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]
  categories_paths = [os.path.join(base_dir, x) for x in categories]

  for category in categories:
    category_path = os.path.join(base_dir, category, dataset_type)
    print(category_path)

    if os.path.exists(category_path):
      for file_name in os.listdir(category_path):
        if not file_name.endswith(".png"): # filter out non-png images
          continue

        image_id = os.path.splitext(file_name)[0]
        data_entry = {
            "image_id" : image_id,
            "file_path": os.path.join('./',os.path.relpath(category_path, base_dir),file_name),
            "labels" : [category.replace("_", " ").lower()] # just the directory name
        }
        dataset.append(data_entry)
    print(category)

  # Write out json
  json_file_path = os.path.join(base_dir,  f'{dataset_type}_data.json')
  with open(json_file_path, 'w') as json_file:
    json.dump(dataset, json_file)

  print(f"Created {json_file_path}")


In [None]:
create_json_for_dataset_freiburg_groceries("/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset", "test")
create_json_for_dataset_freiburg_groceries("/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset", "train")

/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/FLOUR/test
FLOUR
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/SUGAR/test
SUGAR
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/WATER/test
WATER
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/NUTS/test
NUTS
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/RICE/test
RICE
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/CHIPS/test
CHIPS
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/TOMATO_SAUCE/test
TOMATO_SAUCE
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/CHOCOLATE/test
CHOCOLATE
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/VINEGAR/test
VINEGAR
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/MILK/test
MILK
/content/drive/MyDrive/Applied CV Project/freiburg_groceries_dataset/SODA/test
SODA
/content/drive/MyDrive/Applied CV Pr

In [None]:
# CLIP_training_instructions
instructions = "All image paths are relative to the root directory, i.e., freiburg_groceries_dataset"
file_path = "/content/drive/MyDrive/Applied CV Project/freiburg_groceries_scenes/instructions.txt"

with open(file_path, 'w') as f:
  f.write(instructions)

Fruits-360 Json

In [None]:
import os, json

In [None]:
import os, json, re
def create_json_for_dataset_fruits360(dataset_type):
  num_imgs = 0
  dataset = []

  base_dir = f"/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/{dataset_type}"

  # iterate through all the individual folders and find the dataset_type
  categories = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]
  # categories_paths = [os.path.join(base_dir, x) for x in categories]

  # print(categories_paths)
  # return

  for category in categories:
    category_path = os.path.join(base_dir, category)
    # print(category_path)
    # return

    if os.path.exists(category_path):
      for file_name in os.listdir(category_path):
        num_imgs += 1
        if not file_name.endswith(".jpg"): # filter out non-jpg images
          continue
        # print(file_path)

        cleaned_category = re.sub(r'\d+', '', category).strip().replace("_", " ").lower()
        image_id = os.path.splitext(file_name)[0]
        data_entry = {
            "image_id" : image_id,
            "file_path": os.path.join('./',os.path.relpath(category_path, base_dir),file_name),
            "labels" : [cleaned_category] # just the directory name
        }
        dataset.append(data_entry)
        # print(data_entry)
    # print(category)

  # Write out json
  json_file_path = os.path.join(base_dir,  f'{dataset_type}_data.json')
  with open(json_file_path, 'w') as json_file:
    json.dump(dataset, json_file)

  print(f"Created {json_file_path}")
  print(num_imgs)


In [None]:
create_json_for_dataset_fruits360('Test')

Created /content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Test/Test_data.json
22688


In [None]:
create_json_for_dataset_fruits360('Training')

Created /content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training/Training_data.json
67882


Checking for which files/folders have not bee successfully uploaded.

In [None]:
import os

def list_drive_files_with_paths(drive_base_path):
    drive_files = []
    for root, dirs, files in os.walk(drive_base_path):
        for file in files:
            if file.endswith(".jpg"):
                # Construct the path relative to the base directory in Google Drive
                relative_path = os.path.relpath(root, drive_base_path)
                drive_files.append(os.path.join('Training/',relative_path, file))
    return drive_files

In [None]:
drive_base_path = "/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training"  # Adjust to your Google Drive structure
drive_files = list_drive_files_with_paths(drive_base_path)
print(drive_files)
print(len(drive_files))

Output hidden; open in https://colab.research.google.com to view.

In [None]:
local_file_path = "/content/local_files_list.txt"
with open(local_file_path,'r') as file:
  local_files = ['Training/'+line.strip() for line in file]
print(local_files[0])

Training/Tomato 4/r_236_100.jpg


In [None]:
missing_files = [file for file in local_files if file not in drive_files]
print(f"Number of missing files: {len(missing_files)}")

Number of missing files: 492


In [None]:
# missing_files = [file for file in local_files if file not in drive_files]
print(missing_files)
missing_folders = set()

for file_path in missing_files:
  folder_name = file_path.split('/')[1]
  missing_folders.add(folder_name)

# with open('/content/missing_folder.txt','w') as f:
#   for folder in sorted(missing_folders):
#     f.write("%s\n" % folder)

['Training/Peach Flat/r_236_100.jpg', 'Training/Peach Flat/247_100.jpg', 'Training/Peach Flat/r_226_100.jpg', 'Training/Peach Flat/257_100.jpg', 'Training/Peach Flat/r_150_100.jpg', 'Training/Peach Flat/121_100.jpg', 'Training/Peach Flat/r_140_100.jpg', 'Training/Peach Flat/131_100.jpg', 'Training/Peach Flat/198_100.jpg', 'Training/Peach Flat/188_100.jpg', 'Training/Peach Flat/18_100.jpg', 'Training/Peach Flat/r_132_100.jpg', 'Training/Peach Flat/143_100.jpg', 'Training/Peach Flat/r_122_100.jpg', 'Training/Peach Flat/153_100.jpg', 'Training/Peach Flat/r_254_100.jpg', 'Training/Peach Flat/225_100.jpg', 'Training/Peach Flat/r_244_100.jpg', 'Training/Peach Flat/235_100.jpg', 'Training/Peach Flat/300_100.jpg', 'Training/Peach Flat/r_209_100.jpg', 'Training/Peach Flat/278_100.jpg', 'Training/Peach Flat/r_219_100.jpg', 'Training/Peach Flat/310_100.jpg', 'Training/Peach Flat/268_100.jpg', 'Training/Peach Flat/r_25_100.jpg', 'Training/Peach Flat/r_313_100.jpg', 'Training/Peach Flat/r_303_100.j

In [None]:
print(missing_folders)

{'Peach Flat'}


In [None]:
print(len(missing_folders))

1


In [None]:
# Find extra files in Google Drive
extra_files = [file for file in drive_files if file not in local_files]


In [None]:
# Extract folder names from the paths of extra files
extra_folders = set()  # Use a set to avoid duplicate folder names
for file_path in extra_files:
    # This splits the path and extracts the second component
    # Adjust the index if necessary based on your path structure
    folder_name = file_path.split('/')[1]  # Adjust if your structure is different
    extra_folders.add(folder_name)

print(extra_folders)

{'Nut Pecan', 'Peach Flat (1)', 'Granadilla', 'Kohlrabi', 'Fig', 'Onion White', 'Tamarillo', 'Pepper Red'}


In [None]:
import os

def list_all_directories(base_path):
    all_dirs = {}
    for root, dirs, files in os.walk(base_path):
        # Store the list of directories in this path
        if dirs:  # If there are any directories under 'root'
            all_dirs[root] = dirs
    return all_dirs

# Adjust the base path to where your directories start
base_path = '/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training'
all_dirs = list_all_directories(base_path)


In [None]:
all_dirs

{'/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training': ['Tomato 4',
  'Banana Lady Finger',
  'Apple Red Delicious',
  'Pear Red',
  'Tomato 3',
  'Huckleberry',
  'Blueberry',
  'Melon Piel de Sapo',
  'Strawberry',
  'Cherry 1',
  'Dates',
  'Pomegranate',
  'Avocado',
  'Nut Forest',
  'Tomato 2',
  'Carambula',
  'Granadilla',
  'Potato Red Washed',
  'Tamarillo',
  'Kohlrabi',
  'Cherry Wax Yellow',
  'Pepper Red',
  'Guava',
  'Lemon',
  'Fig',
  'Kiwi',
  'Apple Golden 2',
  'Ginger Root',
  'Pear Stone',
  'Apple Red 1',
  'Pitahaya Red',
  'Raspberry',
  'Strawberry Wedge',
  'Nut Pecan',
  'Mandarine',
  'Quince',
  'Pear Monster',
  'Cauliflower',
  'Apple Golden 3',
  'Redcurrant',
  'Pepper Yellow',
  'Apple Red Yellow 1',
  'Chestnut',
  'Cherry Rainier',
  'Cucumber Ripe 2',
  'Banana Red',
  'Plum',
  'Physalis',
  'Grape Pink',
  'Maracuja',
  'Cucumber Ripe',
  'Potato Sweet',
  'Hazelnut',
  'Nectarine',
  'Cantaloupe 2',
  'Mangostan',


In [None]:
import os

def find_directories_with_subdirectories(base_path):
    # This will store directories that contain other directories
    directories_with_subdirectories = []

    for root, dirs, files in os.walk(base_path):
        # If this directory contains other directories, note it down
        if dirs:  # 'dirs' is a list of directory names within 'root'
            directories_with_subdirectories.append(root)

    return directories_with_subdirectories

# Set the base directory path
base_path = '/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training'  # Adjust to your base directory
directories_with_subdirectories = find_directories_with_subdirectories(base_path)

# Print out the directories that contain other directories
print("Directories containing other directories:")
for directory in directories_with_subdirectories:
    print(directory)


Directories containing other directories:
/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training


In [None]:
import json

# Assuming 'data.json' is your JSON file
with open('/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training/Training_data.json', 'r') as file:
    dataset = json.load(file)


# List to hold entries that are not in the second level
incorrect_paths = []

for item in dataset:
    # Count the number of slashes in the file_path
    num_slashes = item['file_path'].count('/')

    # Check if there are more than two slashes (adjust this number based on your base path)
    if num_slashes > 2:
        incorrect_paths.append(item)

# Print out or process entries with incorrect paths
for item in incorrect_paths:
    print(item)

print(incorrect_paths)


[]


March 31, 2024: Updating the train json for fruits 360 to rectify bugs.

In [None]:
import json, os

base_dir = f"/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training"

# Load in the json file
with open('/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training/Training_data.json', 'r') as file:
    dataset = json.load(file)

for item in dataset:
  if "Peach Flat (1)" in item["file_path"]:
    item["file_path"] = item["file_path"].replace("Peach Flat (1)", "Peach Flat")

  if "peach flat ()" in item["labels"]:
    item["labels"]= [label.replace("peach flat ()", "peach flat") for label in item["labels"]]

json_file_path = os.path.join(base_dir,  'Training_data_updated.json')

with open(json_file_path, 'w') as json_file:
    json.dump(dataset, json_file)


April 11, 2024: Creating new jsons for training composite fruits-360 and webscraped images

In [3]:
import os, json, re
def create_json_for_dataset_fruits360_new(dataset_type):
  num_imgs = 0
  dataset = []

  base_dir = f"/content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/{dataset_type}"

  # iterate through all the individual folders and find the dataset_type
  categories = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]
  # categories_paths = [os.path.join(base_dir, x) for x in categories]

  # print(categories)
  # return

  for category in categories:
    category_path = os.path.join(base_dir, category)
    # print(category_path)
    # return

    if os.path.exists(category_path):
      for file_name in os.listdir(category_path):
        num_imgs += 1
        if not file_name.endswith(".jpg"): # filter out non-jpg images
          continue
        # print(file_path)

        cleaned_category = re.sub(r'\d+', '', category).strip().replace("_", " ").lower()
        image_id = os.path.splitext(file_name)[0]
        data_entry = {
            "image_id" : image_id,
            "file_path": os.path.join('./',os.path.relpath(category_path, base_dir),file_name),
            "labels" : [cleaned_category] # just the directory name
        }
        dataset.append(data_entry)
        # print(data_entry)
    # print(category)

  # Write out json
  json_file_path = os.path.join(base_dir,  f'{dataset_type}_data_april_11.json')
  with open(json_file_path, 'w') as json_file:
    json.dump(dataset, json_file)

  print(f"Created {json_file_path}")
  print(num_imgs)


In [4]:
create_json_for_dataset_fruits360_new('Test')

Created /content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Test/Test_data_april_11.json
23027


In [None]:
create_json_for_dataset_fruits360_new('Training')

Created /content/drive/MyDrive/Applied CV Project/fruits-360_dataset/fruits-360/Training/Training_data_april_11.json
69167
