# Splitting the data into pairs of images

For Siamese networks, the model takes pairs of images as input, so these pairs have to be built beforehand. In this notebook, we will split the dataset into 3 files, as was done for the training of the CNN models.

In [23]:
import pandas as pd
import os
import numpy as np
from itertools import combinations, product

In [8]:
# Folder (relative to the current working directory) that holds the three
# CSV splits; each CSV lists image paths in a `file_path` column.
split_dir = os.path.join(os.getcwd(), "Split_Tanker_Bulk_Container_frugal_vv")

train_df = pd.read_csv(os.path.join(split_dir, "train.csv"))
val_df = pd.read_csv(os.path.join(split_dir, "validation.csv"))
test_df = pd.read_csv(os.path.join(split_dir, "test.csv"))

# NOTE(review): not referenced by the cells shown here; presumably the root
# folder that the relative `file_path` entries are resolved against - confirm.
image_path = '../OpenSARShip/Categories/'

## Generate pairs of images

In [27]:
def generate_all_pairs(dataframe):
    """Build every similar and dissimilar image pair from one dataset split.

    The category of an image is the first component of its ``file_path``,
    i.e. the folder name that precedes the first path separator (backslash
    or forward slash).

    Args:
        dataframe: DataFrame with a ``file_path`` column of path strings.

    Returns:
        A ``(pairs, labels)`` tuple of NumPy arrays. ``pairs`` always has
        shape ``(n_pairs, 2)`` (also when ``n_pairs`` is 0) and holds the two
        file paths of each pair. ``labels`` has shape ``(n_pairs,)`` with 1
        for a pair from the same category and 0 for a pair from two different
        categories. Similar pairs come first, category by category in order
        of first appearance, followed by the dissimilar pairs.
    """
    # Group the paths by their exact category once. Matching the category as
    # a substring of the whole path would also catch files of another class
    # whose file name happens to contain that category name.
    images_by_category = {}
    for file_path in dataframe['file_path']:
        category = file_path.replace('/', '\\').split('\\', 1)[0]
        images_by_category.setdefault(category, []).append(file_path)

    pairs = []
    labels = []

    # All unique combinations inside one class -> similar pairs.
    for images in images_by_category.values():
        similar = list(combinations(images, 2))
        pairs.extend(similar)
        labels.extend([1] * len(similar))

    # Full cross product between every two distinct classes -> dissimilar pairs.
    for first_images, second_images in combinations(images_by_category.values(), 2):
        dissimilar = list(product(first_images, second_images))
        pairs.extend(dissimilar)
        labels.extend([0] * len(dissimilar))

    # reshape keeps the (n, 2) layout even when no pair could be built, so
    # callers can always index the two columns.
    return np.array(pairs, dtype=str).reshape(-1, 2), np.array(labels, dtype=int)

In [28]:
def save_pairs_to_csv(pairs, labels, csv_path, random_state=None):
    """Shuffle the labelled pairs and write them to a CSV file.

    Args:
        pairs: Array-like of shape ``(n_pairs, 2)`` holding the two image
            paths of each pair. An empty input is accepted.
        labels: Array-like of length ``n_pairs`` with the label of each pair.
        csv_path: Destination path of the CSV file.
        random_state: Optional seed (or random state) forwarded to
            ``DataFrame.sample`` so the row order can be reproduced. The
            default ``None`` gives a different order on every call.

    The file has the columns ``image_1``, ``image_2`` and ``label`` and no
    index column.
    """
    # Normalise to a two-column array so that lists of tuples and empty
    # inputs can be sliced by column as well.
    pair_array = np.asarray(pairs).reshape(-1, 2)
    df = pd.DataFrame({
        'image_1': pair_array[:, 0],
        'image_2': pair_array[:, 1],
        'label': labels,
    })
    # Shuffle the rows; an empty frame has nothing to shuffle.
    if not df.empty:
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    df.to_csv(csv_path, index=False)


In [5]:
def load_pairs_from_csv(csv_path):
    """Read a pairs CSV back into a ``(pairs, labels)`` tuple of arrays.

    Args:
        csv_path: Path of a CSV file with the columns ``image_1``,
            ``image_2`` and ``label``.

    Returns:
        ``pairs``: array with one row per pair and the two image columns.
        ``labels``: one-dimensional array with the ``label`` column.
    """
    frame = pd.read_csv(csv_path)
    image_columns = frame[['image_1', 'image_2']]
    label_column = frame['label']
    return image_columns.values, label_column.values

In [9]:
# Quick look at the first rows of the training split to check the
# `file_path` layout (category folder, then the image file name).
print(train_df.head())


                                           file_path
0  Bulk Carrier\BulkCarrier_Visual_Cargo_x1902_y8...
1  Bulk Carrier\BulkCarrier_Visual_Cargo_x5067_y1...
2  Bulk Carrier\BulkCarrier_Visual_Cargo_x2412_y9...
3  Bulk Carrier\BulkCarrier_Visual_Cargo_x2084_y6...
4  Container Ship\ContainerShip_Visual_Cargo_x114...


In [29]:
# Build the labelled pairs of each split separately, so that no pair mixes
# images coming from two different splits.
(train_pairs, train_labels), (val_pairs, val_labels), (test_pairs, test_labels) = (
    generate_all_pairs(split_df) for split_df in (train_df, val_df, test_df)
)

In [30]:
# Write one shuffled pairs file per split into the current working directory.
for split_pairs, split_labels, out_csv in (
    (train_pairs, train_labels, 'train_pairs.csv'),
    (val_pairs, val_labels, 'val_pairs.csv'),
    (test_pairs, test_labels, 'test_pairs.csv'),
):
    save_pairs_to_csv(split_pairs, split_labels, out_csv)