# Count the number of images in each folder

In [8]:
import os

def count_images(folder_name):
  """Return how many entries in ``folder_name`` carry an image extension.

  Entry names are compared case-insensitively against ``.jpg``, ``.jpeg``
  and ``.png``. Only the name is inspected; the entry's type and contents
  are not checked.
  """
  image_suffixes = (".jpg", ".jpeg", ".png")
  return sum(
    1
    for entry in os.listdir(folder_name)
    if entry.lower().endswith(image_suffixes)
  )

# Tally the images in the Fish folder and the Not Fish folder (in that order)
fish_image_count, not_fish_image_count = (
  count_images(folder) for folder in ("Dataset/Fish", "Dataset/Not Fish")
)

# Report both totals, one per line
print(
  f"Number of images in Fish folder: {fish_image_count}",
  f"Number of images in Not Fish folder: {not_fish_image_count}",
  sep="\n",
)

Number of images in Fish folder: 1380
Number of images in Not Fish folder: 2879


# Rename Images

In [9]:
def rename_images(folder_name, prefix):
  """Rename every image in ``folder_name`` to ``<prefix>_image_<n><ext>``.

  Entries whose name ends (case-insensitively) in ``.jpg``, ``.jpeg`` or
  ``.png`` are numbered from 1 in sorted name order; each keeps its original
  extension. All other entries are left untouched.

  The rename happens in two phases so that it is safe to run on a folder
  that already contains names of the target form (for example when the
  cell is executed a second time). Renaming directly could target a name
  still owned by a not-yet-processed file, which ``os.rename`` silently
  overwrites on POSIX and rejects with ``FileExistsError`` on Windows.

  Args:
    folder_name: Directory whose images are renamed in place.
    prefix: Text placed before ``_image_<n>`` in every new name.

  Raises:
    OSError: If the directory cannot be listed or a rename fails. A failure
      part-way through can leave some files under their temporary staging
      names.
  """
  import uuid  # local import: only needed to build collision-free staging names

  image_suffixes = (".jpg", ".jpeg", ".png")
  # Sort so the numbering does not depend on the arbitrary order of os.listdir.
  image_names = sorted(
    name for name in os.listdir(folder_name)
    if name.lower().endswith(image_suffixes)
  )

  # Phase 1: move every image to a unique staging name, freeing all
  # "<prefix>_image_<n>" names before any of them is assigned.
  staging_tag = uuid.uuid4().hex
  staged = []
  for count, filename in enumerate(image_names, start=1):
    extension = os.path.splitext(filename)[1]
    staged_name = f"{staging_tag}_{count}{extension}"
    os.rename(os.path.join(folder_name, filename), os.path.join(folder_name, staged_name))
    staged.append((staged_name, f"{prefix}_image_{count}{extension}"))

  # Phase 2: give each staged file its final sequential name.
  for staged_name, new_filename in staged:
    os.rename(os.path.join(folder_name, staged_name), os.path.join(folder_name, new_filename))

# Rename the fish images first, then the non-fish images
for folder, name_prefix in (("Dataset/Fish", "fish"), ("Dataset/Not Fish", "not_fish")):
  rename_images(folder, name_prefix)

print("Images renamed successfully!")

Images renamed successfully!


# Split Dataset and Manage Folders

In [11]:
import os
import shutil
import random

# Root of the dataset and the two class folders it contains
dataset_dir = 'Dataset'
fish_dir, not_fish_dir = (
    os.path.join(dataset_dir, class_name) for class_name in ('Fish', 'Not Fish')
)

# Roots of the two splits
train_dir, test_dir = (
    os.path.join(dataset_dir, split_name) for split_name in ('Train', 'Test')
)

# Make sure every split has a folder per class (Train first, then Test)
for split_root in (train_dir, test_dir):
    for class_name in ('Fish', 'Not Fish'):
        os.makedirs(os.path.join(split_root, class_name), exist_ok=True)

# Function to split the dataset
def split_dataset(source_dir, train_dir, test_dir, split_ratio=0.8, seed=None):
    """Copy the files of ``source_dir`` into a train and a test directory.

    The regular files directly inside ``source_dir`` are listed in sorted
    order, shuffled, and cut at ``int(len(files) * split_ratio)``: the first
    part is copied to ``train_dir`` and the remainder to ``test_dir``.
    Sub-directories are skipped, since ``shutil.copy`` cannot copy them.
    The source files are left in place.

    Args:
        source_dir: Directory holding the files to split.
        train_dir: Destination for the training share; created if missing.
        test_dir: Destination for the test share; created if missing.
        split_ratio: Fraction of files, between 0 and 1, sent to ``train_dir``.
        seed: When given, the shuffle uses a private ``random.Random(seed)``
            so the same files land in the same split on every run. When
            ``None`` the module-level ``random`` generator is used, as before.

    Raises:
        ValueError: If ``split_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    # Sort first: os.listdir order is arbitrary, and a seeded shuffle is only
    # reproducible when it starts from the same sequence.
    all_files = sorted(
        name for name in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, name))
    )
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(all_files)

    split_point = int(len(all_files) * split_ratio)
    partitions = (
        (train_dir, all_files[:split_point]),
        (test_dir, all_files[split_point:]),
    )
    for destination_dir, file_names in partitions:
        os.makedirs(destination_dir, exist_ok=True)
        for file_name in file_names:
            shutil.copy(
                os.path.join(source_dir, file_name),
                os.path.join(destination_dir, file_name),
            )

# Split the Fish images first, then the Not Fish images, each into the
# matching class folder under Train and Test
for class_source, class_name in ((fish_dir, 'Fish'), (not_fish_dir, 'Not Fish')):
    split_dataset(
        class_source,
        os.path.join(train_dir, class_name),
        os.path.join(test_dir, class_name),
    )

print("Dataset split completed!")

Dataset split completed!


# Merge folders

In [20]:
import os
import shutil

def merge_images(source_dirs, target_dir):
    """Move the files of several directories into ``target_dir``.

    Every regular file directly inside each source directory is moved with
    ``shutil.move``. A source directory is deleted afterwards only when it
    is empty. If anything is left in it (for example a sub-directory, which
    is never moved), it is kept and a warning is issued, instead of being
    destroyed together with its contents.

    Args:
        source_dirs: Iterable of directories whose files are moved.
        target_dir: Directory receiving the files; created if missing.

    Raises:
        shutil.Error: If a file with the same name already exists in
            ``target_dir`` (raised by ``shutil.move``).
    """
    import warnings  # local import: only used to report directories that are kept

    os.makedirs(target_dir, exist_ok=True)

    for source_dir in source_dirs:
        for filename in os.listdir(source_dir):
            source_file = os.path.join(source_dir, filename)
            if os.path.isfile(source_file):
                shutil.move(source_file, target_dir)

        # Only regular files were moved, so anything still listed here would
        # be lost by an unconditional recursive delete.
        leftovers = os.listdir(source_dir)
        if leftovers:
            warnings.warn(
                f"{source_dir} was not removed: {len(leftovers)} entries were not moved"
            )
        else:
            os.rmdir(source_dir)

# Class sub-folders of each split and the split root they are flattened into
train_target = 'Dataset/Train'
train_fish = 'Dataset/Train/Fish'
train_not_fish = 'Dataset/Train/Not Fish'

test_target = 'Dataset/Test'
test_fish = 'Dataset/Test/Fish'
test_not_fish = 'Dataset/Test/Not Fish'

# Merge the train images first, then the test images; removal of the class
# folders is handled inside merge_images
for class_dirs, merged_dir in (
    ([train_fish, train_not_fish], train_target),
    ([test_fish, test_not_fish], test_target),
):
    merge_images(class_dirs, merged_dir)

Moved: Dataset/Train/Fish\fish_image_1.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_10.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1000.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1001.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1002.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1004.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1005.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1006.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1008.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1009.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_101.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1010.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1011.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1012.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1013.jpg to Dataset/Train
Moved: Dataset/Train/Fish\fish_image_1014.jpg to Dataset/Train

# Process Train set and labels

In [21]:
import pandas as pd
import numpy as np
import os

# Collect the path of every file found anywhere under the training directory
Id = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('Dataset/Train/')
    for name in names
]
Id[:10]

['Dataset/Train/fish_image_1.jpg',
 'Dataset/Train/fish_image_10.jpg',
 'Dataset/Train/fish_image_1000.jpg',
 'Dataset/Train/fish_image_1001.jpg',
 'Dataset/Train/fish_image_1002.jpg',
 'Dataset/Train/fish_image_1004.jpg',
 'Dataset/Train/fish_image_1005.jpg',
 'Dataset/Train/fish_image_1006.jpg',
 'Dataset/Train/fish_image_1008.jpg',
 'Dataset/Train/fish_image_1009.jpg']

In [22]:
# Convert the collected image paths (Id) to a pandas DataFrame with a single
# 'filename' column, one row per path
train = pd.DataFrame().assign(filename=Id)
# Preview the first rows
train.head()

Unnamed: 0,filename
0,Dataset/Train/fish_image_1.jpg
1,Dataset/Train/fish_image_10.jpg
2,Dataset/Train/fish_image_1000.jpg
3,Dataset/Train/fish_image_1001.jpg
4,Dataset/Train/fish_image_1002.jpg


In [23]:
# Preview the last rows of the training frame
train.tail()

Unnamed: 0,filename
3402,Dataset/Train/not_fish_image_994.jpg
3403,Dataset/Train/not_fish_image_995.jpg
3404,Dataset/Train/not_fish_image_996.jpg
3405,Dataset/Train/not_fish_image_997.jpg
3406,Dataset/Train/not_fish_image_999.jpg


In [24]:
# Add new column 'label' and derive each image's class from its path.
# Boolean masks with .loc replace the row-by-row chained assignment
# (train['label'][i] = ...), which raises SettingWithCopyWarning and never
# updates the frame when pandas Copy-on-Write is active.
train['label'] = train['filename']

train_paths = train['filename'].astype(str)
# 'not_fish' contains 'fish', so the more specific mask is applied last and
# wins. Rows matching neither keep the path as their label.
train.loc[train_paths.str.contains('fish', regex=False), 'label'] = 'fish'
train.loc[train_paths.str.contains('not_fish', regex=False), 'label'] = 'not_fish'

In [25]:
# Preview the first rows, now including the 'label' column
train.head()

Unnamed: 0,filename,label
0,Dataset/Train/fish_image_1.jpg,fish
1,Dataset/Train/fish_image_10.jpg,fish
2,Dataset/Train/fish_image_1000.jpg,fish
3,Dataset/Train/fish_image_1001.jpg,fish
4,Dataset/Train/fish_image_1002.jpg,fish


In [26]:
# Preview the last rows, now including the 'label' column
train.tail()

Unnamed: 0,filename,label
3402,Dataset/Train/not_fish_image_994.jpg,not_fish
3403,Dataset/Train/not_fish_image_995.jpg,not_fish
3404,Dataset/Train/not_fish_image_996.jpg,not_fish
3405,Dataset/Train/not_fish_image_997.jpg,not_fish
3406,Dataset/Train/not_fish_image_999.jpg,not_fish


In [27]:
# Number of training rows per label
train['label'].value_counts()

label
not_fish    2303
fish        1104
Name: count, dtype: int64

# Process Test set and labels

In [28]:
# Collect the path of every file found anywhere under the test directory
# (this rebinds Id, which previously held the training paths)
Id = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('Dataset/Test/')
    for name in names
]
Id[:10]

['Dataset/Test/fish_image_100.jpg',
 'Dataset/Test/fish_image_1003.jpg',
 'Dataset/Test/fish_image_1007.jpg',
 'Dataset/Test/fish_image_1015.jpg',
 'Dataset/Test/fish_image_1018.jpg',
 'Dataset/Test/fish_image_1027.jpg',
 'Dataset/Test/fish_image_1030.jpg',
 'Dataset/Test/fish_image_1033.jpg',
 'Dataset/Test/fish_image_1036.jpg',
 'Dataset/Test/fish_image_1038.jpg']

In [29]:
# Convert the collected image paths (Id) to a pandas DataFrame with a single
# 'filename' column, one row per path
test = pd.DataFrame().assign(filename=Id)
# Preview the first rows
test.head()

Unnamed: 0,filename
0,Dataset/Test/fish_image_100.jpg
1,Dataset/Test/fish_image_1003.jpg
2,Dataset/Test/fish_image_1007.jpg
3,Dataset/Test/fish_image_1015.jpg
4,Dataset/Test/fish_image_1018.jpg


In [30]:
# Preview the last rows of the test frame
test.tail()

Unnamed: 0,filename
847,Dataset/Test/not_fish_image_984.jpg
848,Dataset/Test/not_fish_image_989.jpg
849,Dataset/Test/not_fish_image_990.jpg
850,Dataset/Test/not_fish_image_992.jpg
851,Dataset/Test/not_fish_image_998.jpg


In [31]:
# Add new column 'label' and derive each image's class from its path.
# Boolean masks with .loc replace the row-by-row chained assignment
# (test['label'][i] = ...), which raises SettingWithCopyWarning and never
# updates the frame when pandas Copy-on-Write is active.
test['label'] = test['filename']

test_paths = test['filename'].astype(str)
# 'not_fish' contains 'fish', so the more specific mask is applied last and
# wins. Rows matching neither keep the path as their label.
test.loc[test_paths.str.contains('fish', regex=False), 'label'] = 'fish'
test.loc[test_paths.str.contains('not_fish', regex=False), 'label'] = 'not_fish'

In [32]:
# Preview the first rows, now including the 'label' column
test.head()

Unnamed: 0,filename,label
0,Dataset/Test/fish_image_100.jpg,fish
1,Dataset/Test/fish_image_1003.jpg,fish
2,Dataset/Test/fish_image_1007.jpg,fish
3,Dataset/Test/fish_image_1015.jpg,fish
4,Dataset/Test/fish_image_1018.jpg,fish


In [33]:
# Preview the last rows, now including the 'label' column
test.tail()

Unnamed: 0,filename,label
847,Dataset/Test/not_fish_image_984.jpg,not_fish
848,Dataset/Test/not_fish_image_989.jpg,not_fish
849,Dataset/Test/not_fish_image_990.jpg,not_fish
850,Dataset/Test/not_fish_image_992.jpg,not_fish
851,Dataset/Test/not_fish_image_998.jpg,not_fish


In [34]:
# Number of test rows per label
test['label'].value_counts()

label
not_fish    576
fish        276
Name: count, dtype: int64

In [35]:
# Persist both labelled frames as CSV (without the index column) for future use
for frame, csv_path in ((train, 'train.csv'), (test, 'test.csv')):
    frame.to_csv(csv_path, index=False)