# Splitting the data into pairs of images

For Siamese Networks, it is necessary to build pairs of images as the input to the model. In this notebook, we will split the dataset into 3 files, as was done for the training of the CNN models.

In [1]:
import pandas as pd
import os
import numpy as np
from itertools import combinations, product

In [2]:
# Load the train / validation / test splits; every CSV is looked up relative
# to the current working directory.
train_df, val_df, test_df = (
    pd.read_csv(os.getcwd() + relative_csv)
    for relative_csv in (
        "/Split_Tanker_Bulk_Container_frugal_vv/train.csv",
        "/Split_Tanker_Bulk_Container_frugal_vv/validation.csv",
        "/Split_Tanker_Bulk_Container_frugal_vv/test.csv",
    )
)

# Root folder of the category sub-directories (not used by the cells in this file).
image_path = '../OpenSARShip/Categories/'

## Generate pairs of images

In [3]:
def generate_all_pairs(dataframe):
    """Build every similar and dissimilar image pair from one dataset split.

    The category of an image is the first component of its ``file_path``,
    i.e. the text before the first path separator (backslash or forward
    slash). Images are grouped by that exact category, so a category name
    that merely appears somewhere inside another image's file name no longer
    causes that image to be counted in the wrong class.

    Args:
        dataframe: DataFrame with a ``file_path`` column of strings shaped
            like ``<category><separator><file name>``.

    Returns:
        A ``(pairs, labels)`` tuple. ``pairs`` is a string array of shape
        ``(n_pairs, 2)`` and ``labels`` an integer array of shape
        ``(n_pairs,)`` holding 1 for a same-category pair and 0 for a
        cross-category pair. All similar pairs come first (categories in
        order of first appearance), followed by the dissimilar pairs. When
        no pair can be formed the arrays are empty but keep those shapes.
    """
    # Single pass over the column: bucket every path under its category.
    # A dict keeps insertion order, so categories stay in first-seen order
    # and paths stay in DataFrame order inside each bucket.
    images_by_category = {}
    for path in dataframe['file_path'].tolist():
        category = path.replace('\\', '/').split('/', 1)[0]
        images_by_category.setdefault(category, []).append(path)

    pairs = []
    labels = []

    # Similar pairs: every unordered pair of images inside one category.
    for images in images_by_category.values():
        before = len(pairs)
        pairs.extend(combinations(images, 2))
        labels.extend([1] * (len(pairs) - before))

    # Dissimilar pairs: full cross product for every pair of categories.
    for first_images, second_images in combinations(images_by_category.values(), 2):
        before = len(pairs)
        pairs.extend(product(first_images, second_images))
        labels.extend([0] * (len(pairs) - before))

    # reshape(-1, 2) keeps the result two-dimensional even when it is empty,
    # so callers can always index pairs[:, 0] and pairs[:, 1].
    return np.array(pairs, dtype=str).reshape(-1, 2), np.array(labels, dtype=int)

In [3]:
def save_pairs_to_csv(pairs, labels, csv_path, random_state=None):
    """Shuffle the labelled pairs and write them to a CSV file.

    The file has the columns ``image_1``, ``image_2`` and ``label`` and no
    index column.

    Args:
        pairs: Array of shape ``(n_pairs, 2)`` with the two image paths of
            each pair. An empty array of any shape is accepted and produces
            a header-only file.
        labels: Sequence of ``n_pairs`` labels aligned with ``pairs``.
        csv_path: Destination path of the CSV file.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            row order can be reproduced. The default ``None`` keeps the
            previous behaviour of a different order on every call.
    """
    pairs = np.asarray(pairs)
    if pairs.size == 0:
        # np.array([]) is one-dimensional; give it two columns so the
        # column indexing below works when there are no pairs.
        pairs = pairs.reshape(0, 2)

    df = pd.DataFrame({'image_1': pairs[:, 0], 'image_2': pairs[:, 1], 'label': labels})

    # Shuffle the rows (there is nothing to shuffle in an empty frame).
    if len(df) > 0:
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Save the shuffled DataFrame to a CSV file
    df.to_csv(csv_path, index=False)


In [5]:
def load_pairs_from_csv(csv_path):
    """Read a pairs CSV back into arrays.

    Args:
        csv_path: Path of a CSV file with ``image_1``, ``image_2`` and
            ``label`` columns.

    Returns:
        A ``(pairs, labels)`` tuple: the two image columns as an array of
        shape ``(n_rows, 2)`` and the ``label`` column as a 1-D array.
    """
    table = pd.read_csv(csv_path)
    image_columns = ['image_1', 'image_2']
    return table[image_columns].values, table['label'].values

In [6]:
# Preview the first rows of the training split to check the file_path format.
print(train_df.head())


                                           file_path
0  Bulk Carrier\BulkCarrier_Visual_Cargo_x1902_y8...
1  Bulk Carrier\BulkCarrier_Visual_Cargo_x5067_y1...
2  Bulk Carrier\BulkCarrier_Visual_Cargo_x2412_y9...
3  Bulk Carrier\BulkCarrier_Visual_Cargo_x2084_y6...
4  Container Ship\ContainerShip_Visual_Cargo_x114...


In [29]:
# Enumerate all similar/dissimilar pairs independently inside each split.
(train_pairs, train_labels), (val_pairs, val_labels), (test_pairs, test_labels) = map(
    generate_all_pairs, (train_df, val_df, test_df)
)

In [30]:
# Write one shuffled pairs CSV per split into the current working directory.
for split_pairs, split_labels, output_csv in (
    (train_pairs, train_labels, 'train_pairs.csv'),
    (val_pairs, val_labels, 'val_pairs.csv'),
    (test_pairs, test_labels, 'test_pairs.csv'),
):
    save_pairs_to_csv(split_pairs, split_labels, output_csv)

## Generate pairs of 1/4 of the dataset

Generating pairs of images for 1/4 of the dataset.

In [7]:
def generate_and_save_pairs(dataframe, csv_path, frac=0.25, random_state=42):
    """Sample part of a split, build all its image pairs and save them.

    Args:
        dataframe: DataFrame of one split; it is passed, after sampling, to
            ``generate_all_pairs``.
        csv_path: Destination path handed to ``save_pairs_to_csv``.
        frac: Fraction of the rows to keep before pairing. Defaults to
            ``0.25`` (a quarter of the split), the value that used to be
            hard-coded.
        random_state: Seed for the row sampling. Defaults to ``42`` so that
            repeated runs select the same rows.
    """
    # Sample rows first: the number of pairs grows quadratically with the
    # number of images, so pairing only a fraction keeps the output small.
    sampled_df = dataframe.sample(frac=frac, random_state=random_state)

    # Generate all pairs from the sampled DataFrame
    pairs, labels = generate_all_pairs(sampled_df)

    # Save the pairs to a CSV file
    save_pairs_to_csv(pairs, labels, csv_path)

# Locations of the three split CSVs, resolved against the working directory.
train_csv_path = os.getcwd() + "/Split_Tanker_Bulk_Container_frugal_vv/train.csv"
val_csv_path = os.getcwd() + "/Split_Tanker_Bulk_Container_frugal_vv/validation.csv"
test_csv_path = os.getcwd() + "/Split_Tanker_Bulk_Container_frugal_vv/test.csv"

# Read the splits again from disk, in train / validation / test order.
train_df, val_df, test_df = map(pd.read_csv, (train_csv_path, val_csv_path, test_csv_path))

# Build and store the reduced pair files, one per split.
# NOTE(review): these output names are the same as the ones written by the
# full-dataset cell earlier in this file, so running this cell replaces those
# files -- confirm that this is intended.
for subset_df, output_csv in (
    (train_df, 'train_pairs.csv'),
    (val_df, 'val_pairs.csv'),
    (test_df, 'test_pairs.csv'),
):
    generate_and_save_pairs(subset_df, output_csv)


## Split mstar in pairs

In [1]:
import os
from itertools import combinations, product
import numpy as np
import pandas as pd

def generate_pairs(data_root):
    """Build every similar and dissimilar image pair from a folder tree.

    ``data_root`` is expected to contain one sub-directory per category,
    each holding the image files of that category. Entries of ``data_root``
    that are not directories are ignored.

    Args:
        data_root: Path of the directory that contains the category folders.

    Returns:
        A ``(pairs, labels)`` tuple. ``pairs`` is a string array of shape
        ``(n_pairs, 2)`` with full image paths and ``labels`` an integer
        array of shape ``(n_pairs,)`` holding 1 for a same-category pair and
        0 for a cross-category pair. Similar pairs come first, followed by
        the dissimilar ones. When no pair can be formed the arrays are empty
        but keep those shapes.
    """
    # List every category folder exactly once. os.listdir returns entries in
    # arbitrary order, so both levels are sorted to make the output
    # deterministic across runs and machines.
    images_by_category = {}
    for category in sorted(os.listdir(data_root)):
        category_path = os.path.join(data_root, category)
        if not os.path.isdir(category_path):
            # Stray files next to the category folders are not categories.
            continue
        images_by_category[category] = [
            os.path.join(category_path, img)
            for img in sorted(os.listdir(category_path))
        ]

    pairs = []
    labels = []

    # Similar pairs: every unordered pair of images inside one category.
    for images in images_by_category.values():
        before = len(pairs)
        pairs.extend(combinations(images, 2))
        labels.extend([1] * (len(pairs) - before))

    # Dissimilar pairs: full cross product for every pair of categories.
    for first_images, second_images in combinations(images_by_category.values(), 2):
        before = len(pairs)
        pairs.extend(product(first_images, second_images))
        labels.extend([0] * (len(pairs) - before))

    # reshape(-1, 2) keeps the result two-dimensional even when it is empty.
    return np.array(pairs, dtype=str).reshape(-1, 2), np.array(labels, dtype=int)

def save_pairs_to_csv(pairs, labels, csv_filename):
    """Write the labelled pairs to a CSV file, keeping their given order.

    Args:
        pairs: Two-dimensional array whose first two columns are the image
            paths of each pair.
        labels: Labels aligned with the rows of ``pairs``.
        csv_filename: Destination path of the CSV file.
    """
    first_images = pairs[:, 0]
    second_images = pairs[:, 1]
    table = pd.DataFrame(
        {
            'image_1': first_images,
            'image_2': second_images,
            'label': labels,
        }
    )
    # index=False: the row number is not written as a column.
    table.to_csv(csv_filename, index=False)

def split_dataset(pairs, labels, split_ratio=0.5, random_state=None):
    """Shuffle the pairs and cut them into two parts.

    Args:
        pairs: Array whose first axis indexes the pairs.
        labels: Array of labels aligned with ``pairs``.
        split_ratio: Fraction of the shuffled rows that goes into the
            SECOND returned part; the rest goes into the first part.
        random_state: Optional integer seed. When given, the shuffle is
            reproducible and the global NumPy random state is left
            untouched. The default ``None`` keeps the previous behaviour of
            shuffling with the global ``np.random`` state.

    Returns:
        ``(rest_pairs, rest_labels, head_pairs, head_labels)``: the head is
        the first ``int(n * split_ratio)`` shuffled rows, the rest is
        everything after them. Note the order: the remainder comes first.
    """
    total = pairs.shape[0]

    if random_state is None:
        # Legacy path: in-place shuffle driven by the global random state.
        indices = np.arange(total)
        np.random.shuffle(indices)
    else:
        # Private generator so that seeding does not affect other code.
        indices = np.random.RandomState(random_state).permutation(total)

    # Apply the same permutation to both arrays to keep rows aligned.
    pairs = pairs[indices]
    labels = labels[indices]

    split_point = int(total * split_ratio)
    return (pairs[split_point:], labels[split_point:], pairs[:split_point], labels[:split_point])

# Generate pairs for the train dataset
train_pairs, train_labels = generate_pairs('../mstar/TRAIN')
save_pairs_to_csv(train_pairs, train_labels, 'train_pairs.csv')

# Generate pairs for the test dataset and split into validation and test
test_pairs, test_labels = generate_pairs('../mstar/TEST')
test_pairs, test_labels, validation_pairs, validation_labels = split_dataset(test_pairs, test_labels, split_ratio=0.5)
save_pairs_to_csv(test_pairs, test_labels, 'test_pairs.csv')
save_pairs_to_csv(validation_pairs, validation_labels, 'validation_pairs.csv')


In [2]:
def save_quarter_to_csv(pairs, labels, csv_filename):
    """Write a random quarter of the labelled pairs to a CSV file.

    Args:
        pairs: Two-dimensional array whose first two columns are the image
            paths of each pair.
        labels: Array of labels aligned with the rows of ``pairs``.
        csv_filename: Destination path of the CSV file.

    The number of rows written is ``len(pairs) // 4``. The rows are chosen
    with ``np.random.permutation``, i.e. using the global NumPy random state.
    """
    total = len(pairs)
    sample_size = total // 4

    # Random order of all row positions; the leading slice is the sample.
    order = np.random.permutation(total)
    chosen = order[:sample_size]

    columns = {
        'image_1': pairs[chosen, 0],
        'image_2': pairs[chosen, 1],
        'label': labels[chosen],
    }
    # index=False: the row number is not written as a column.
    pd.DataFrame(columns).to_csv(csv_filename, index=False)

# Assuming generate_pairs and other necessary functions are defined as before

# Generate pairs for the train dataset and save a quarter
# (the quarter is a random sample of the pairs, not of the images)
train_pairs, train_labels = generate_pairs('../mstar/TRAIN')
save_quarter_to_csv(train_pairs, train_labels, 'train_pairs_quarter.csv')

# Generate pairs for the test dataset
test_pairs, test_labels = generate_pairs('../mstar/TEST')
# Split into validation and test, then save a quarter of each
# NOTE(review): this is a fresh random split, so it is not guaranteed to match
# the validation/test partition produced by any earlier split_dataset call.
test_pairs, test_labels, validation_pairs, validation_labels = split_dataset(test_pairs, test_labels, split_ratio=0.5)
save_quarter_to_csv(test_pairs, test_labels, 'test_pairs_quarter.csv')
save_quarter_to_csv(validation_pairs, validation_labels, 'validation_pairs_quarter.csv')
