In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pylab as plt

import data_preprocessing
import utils
from vit_pipeline import vit_model_names, cwd

In [2]:
# Show the `coarse_grain_classes` attribute exposed by the data_preprocessing module.
data_preprocessing.coarse_grain_classes

In [5]:
# get_fine_to_coarse() returns a pair; the first item is displayed below.
# NOTE(review): "course" in `fine_to_course_idx` looks like a typo for "coarse". The name is also
# used by the following cell, so rename both places together.
fine_to_coarse, fine_to_course_idx = data_preprocessing.get_fine_to_coarse()
fine_to_coarse

In [4]:
# Show the second value returned by data_preprocessing.get_fine_to_coarse().
fine_to_course_idx

# Class Distribution

In [6]:
# NOTE(review): `granularities` is neither defined nor imported in the cells above - presumably it
# lives in data_preprocessing or vit_pipeline; import it explicitly so a fresh kernel can run this cell.


def calculate_class_distribution(dataset: data_preprocessing.ImageFolderWithName):
    """
    Count how many samples of each class a dataset holds.

    :param dataset: Dataset exposing `classes` and `imgs`; each entry of `imgs` is a pair whose
        second item is the class index
    :return: Integer array with one count per entry of `dataset.classes`
    """
    class_counts = np.zeros(len(dataset.classes), dtype=int)

    for _, y in dataset.imgs:
        class_counts[y] += 1

    return class_counts


# The datasets do not depend on the loop variable, so they are loaded once instead of once per
# granularity. The helper above is likewise defined once instead of inside the loop.
datasets, n = data_preprocessing.get_datasets(cwd=cwd)
model_name = vit_model_names[0]

for i, granularity in enumerate(granularities.values()):
    train_folder_name = f'train_{granularity}'
    test_folder_name = f'test_{granularity}'

    train_dataset = datasets[f'{model_name}_{train_folder_name}']
    test_dataset = datasets[f'{model_name}_{test_folder_name}']

    num_train_examples = len(train_dataset)
    num_test_examples = len(test_dataset)
    train_ratio = round(num_train_examples / (num_train_examples + num_test_examples) * 100, 2)
    test_ratio = round(num_test_examples / (num_train_examples + num_test_examples) * 100, 2)

    # The totals are printed only once, on the first granularity
    if i == 0:
        print(f"Total num of examples: train: {num_train_examples} ({train_ratio}%), "
              f"test: {num_test_examples} ({test_ratio}%)")

    # Create a single plot for one model, with the test class distribution inside the train bar
    plt.figure(figsize=(15, 6))
    width = 0.35
    # One bar position per class of *this* granularity, so the positions always match the number
    # of counts returned by calculate_class_distribution
    x = np.arange(len(train_dataset.classes))

    plt.bar(x,
            calculate_class_distribution(dataset=train_dataset),
            width=width,
            label='Train',
            alpha=0.7)
    # Drawn at the same positions and from the same baseline, so the test bar overlays the train bar
    plt.bar(x,
            calculate_class_distribution(dataset=test_dataset),
            width=width,
            label='Test',
            bottom=0,
            alpha=0.7)

    plt.title(f'{granularity.capitalize()}-Grain Class Distribution')
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.xticks(x, train_dataset.classes, rotation=45)
    plt.legend()
    plt.show()

In [7]:
# Results are read from Google Drive when running in Colab, and from a path relative to the
# current working directory otherwise.
if utils.is_running_in_colab():
    files_path = '/content/drive/My Drive/'
else:
    files_path = ''

results_path = fr'{files_path}results/'

# Array stored in the `test_true_coarse.npy` file of the results folder
coarse_test_true_data = np.load(fr"{results_path}test_true_coarse.npy")

def get_counts(arr: np.ndarray, num_classes: int = None) -> dict:
    """
    Count how often each value occurs in `arr`.

    :param arr: Array-like of class indices
    :param num_classes: Number of classes to report; the keys 0..num_classes-1 are always present,
        with a count of 0 when they do not occur in `arr`. When omitted, the number of distinct
        values of the notebook-level `true_data` is used (the original behaviour), which requires
        `true_data` to be defined before the call
    :return: A dict mapping each value to its number of occurrences in `arr`
    """
    if num_classes is None:
        # Backward-compatible default that depends on notebook state; pass `num_classes` explicitly
        # to make the call independent of cell execution order
        num_classes = np.unique(true_data).shape[0]

    counts = {k: 0 for k in range(num_classes)}
    values, occurrences = np.unique(arr, return_counts=True)
    # tolist() converts numpy scalars to plain Python values so the dict prints and compares cleanly
    counts.update(zip(values.tolist(), occurrences.tolist()))

    return counts

def plot_bars(arr: np.ndarray, title: str = None):
    """
    Draw a bar chart of how often each value occurs in `arr` on the current matplotlib figure.

    :param arr: Array-like of class indices, counted with get_counts
    :param title: Title of the plot. The previous implementation derived it with
        `f'{arr=}'.split('=')[0]`, which always evaluates to the literal string 'arr' and never to
        the caller's variable name; a descriptive default is used instead when no title is given
    """
    value_counts = get_counts(arr)
    unique_values = list(value_counts.keys())
    occurrences = list(value_counts.values())

    plt.bar(unique_values, occurrences, color='skyblue')
    plt.xlabel('Value')
    plt.ylabel('Occurrences')
    plt.title('Occurrences per value' if title is None else title)

    # One tick per counted value so every bar is labelled
    plt.xticks(unique_values)

In [9]:
# The path contains no placeholders, so a plain raw string is enough.
data_file_path = r'data/WEO_Data_Sheet.xlsx'

# sheet_name=None makes pandas load every sheet and return them in a dict keyed by sheet name
dataframes_by_sheet = pd.read_excel(data_file_path, sheet_name=None)

dataframes_by_sheet.keys()

In [10]:
# 'Training' sheet of the workbook; its image names are kept so that a later cell can check
# them against the test image names.
train_df = dataframes_by_sheet['Training']
train_image_names = train_df['Image Name']
train_df.shape

In [11]:
# Preview the first rows of the 'Training' sheet.
train_df.head()

In [12]:
# 'Fine-Grain Results' sheet of the workbook; list its columns.
fine_grain_results_df = dataframes_by_sheet['Fine-Grain Results']
fine_grain_results_df.columns

In [13]:
# Coarse-grain class names, taken from the 'Class Name' column of the 'Coarse-Grain Results' sheet.
coarse_grain_results_df = dataframes_by_sheet['Coarse-Grain Results']
coarse_grain_classes = coarse_grain_results_df['Class Name'].values
coarse_grain_classes

In [14]:
# Index -> class name, following the row order of the 'Class Name' column of the fine-grain results.
fine_grain_classes = dict(enumerate(fine_grain_results_df['Class Name'].values))

# Number of fine-grain classes
n = len(fine_grain_classes)

fine_grain_classes

In [15]:
# Build the fine-grain -> coarse-grain class mapping from the training sheet.
fine_to_coarse = {}

for fine_grain_class in fine_grain_classes.values():
    # Training rows labelled with the current fine-grain class
    fine_grain_values = train_df.loc[train_df['Fine-Grain Ground Truth'] == fine_grain_class]

    # The coarse label of the first matching row is used for the whole class; this raises an
    # IndexError when the training sheet holds no row for the class
    coarse_grain_class = fine_grain_values['Course-Grain Ground Truth'].iloc[0]
    fine_to_coarse[fine_grain_class] = coarse_grain_class

fine_to_coarse

In [16]:
import os
import shutil

# Fine-grain image folders (copied from)
train_folder, test_folder = 'train_fine', 'test_fine'

# Coarse-grain image folders (copied to)
train_coarse_folder, test_coarse_folder = 'train_coarse', 'test_coarse'

# Create both destination folders; existing ones are left in place
for coarse_folder in (train_coarse_folder, test_coarse_folder):
    os.makedirs(coarse_folder, exist_ok=True)

# Running file number of each coarse class, keyed by coarse class name
coarse_class_counters = {}

# Function to copy images to coarse-grained class folders with sequential filenames
def copy_images_with_sequential_filenames(source_folder, destination_folder):
    """
    Copy the files of every fine-grain class folder into the folder of its coarse-grain class.

    Copied files are renamed to a zero-padded running number per coarse class (tracked in the
    notebook-level `coarse_class_counters`) and keep their original extension. Folders and files
    are processed in sorted order: os.listdir returns names in arbitrary order, so without sorting
    the number assigned to each image could change from one run or machine to the next.

    :param source_folder: Folder holding one sub-folder per fine-grain class
    :param destination_folder: Folder that receives one sub-folder per coarse-grain class
    :raises AssertionError: If the number of non-hidden entries of `source_folder` differs from `n`
    :raises KeyError: If an entry of `source_folder` is not a key of `fine_to_coarse`
    """
    content = sorted(d for d in os.listdir(source_folder) if not d.startswith("."))
    assert len(content) == n, (f"Expected {n} fine-grain class folders in {source_folder!r}, "
                               f"found {len(content)}")

    for fine_class in content:
        coarse_class = fine_to_coarse[fine_class]
        destination_class_path = os.path.join(destination_folder, coarse_class)
        os.makedirs(destination_class_path, exist_ok=True)

        source_class_path = os.path.join(source_folder, fine_class)

        # Initialize the file counter for this coarse class if it's not set
        coarse_class_counters.setdefault(coarse_class, 0)

        # Hidden files (names starting with a dot) are not copied
        visible_files = sorted(f for f in os.listdir(source_class_path) if not f.startswith("."))

        for file in visible_files:
            source_path = os.path.join(source_class_path, file)

            # The counter is shared by all fine classes of the same coarse class, so files coming
            # from different fine-grain folders never get the same number
            file_counter = coarse_class_counters[coarse_class]

            destination_filename = f"{file_counter:04d}{os.path.splitext(file)[-1]}"
            destination_path = os.path.join(destination_class_path, destination_filename)

            shutil.copy(source_path, destination_path)

            # Increment the file counter for this coarse class
            coarse_class_counters[coarse_class] += 1

# Copy each fine-grain folder (train, then test) into its coarse-grain counterpart
folder_pairs = zip((train_folder, test_folder), (train_coarse_folder, test_coarse_folder))

for source_folder, destination_folder in folder_pairs:
    copy_images_with_sequential_filenames(source_folder, destination_folder)

print("Coarse-grained class folders created, and images copied with sequential filenames.")


In [17]:
def count_images_in_folder(folder_path):
    """
    Count the non-hidden files found anywhere below `folder_path`.

    Files and directories whose name starts with a dot (e.g. `.DS_Store`, `.ipynb_checkpoints`)
    are ignored, matching the copy step of this notebook, which skips hidden entries too. Counting
    them would make the source and destination counts differ even when every image was copied.

    :param folder_path: Root folder to scan recursively
    :return: Number of non-hidden files; 0 when the folder does not exist
    """
    count = 0
    for _, dirs, files in os.walk(folder_path):
        # Pruning `dirs` in place stops os.walk from descending into hidden directories
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        count += sum(1 for file in files if not file.startswith('.'))
    return count

# Count the number of images in the original train and test folders
original_train_count = count_images_in_folder(train_folder)
original_test_count = count_images_in_folder(test_folder)

# Count the number of images in the new train_coarse and test_coarse folders
new_train_count = count_images_in_folder(train_coarse_folder)
new_test_count = count_images_in_folder(test_coarse_folder)

# Assert that the counts are consistent
# NOTE(review): the train check below is disabled and the reason is not recorded here, so only the
# test folders are actually verified - TODO confirm why it was turned off and whether it can be restored.
# assert original_train_count == new_train_count, f"Train image count mismatch: {original_train_count} != {new_train_count}"
assert original_test_count == new_test_count, f"Test image count mismatch: {original_test_count} != {new_test_count}"

# Printed after the test check only; the train counts are not compared while the assert above is disabled
print("Image counts are consistent.")

In [18]:
# Debugging - print class names and image counts
for fine_class in os.listdir(train_folder):
    source_class_path = os.path.join(train_folder, fine_class)

    # Only visible sub-folders are reported; hidden entries and plain files are ignored
    if os.path.isdir(source_class_path) and not fine_class.startswith('.'):
        # Folders without an entry in the mapping are reported under "other"
        coarse_class = fine_to_coarse.get(fine_class, "other")
        num_images = len(os.listdir(source_class_path))
        print(f"Fine class: {fine_class}, Coarse class: {coarse_class}, Image count: {num_images}")


In [19]:
# The '1s_0s_Sheet' sheet is used as the test data by the cells below.
# NOTE(review): presumably it holds one 0/1 indicator column per class - confirm against the workbook.
test_df = dataframes_by_sheet['1s_0s_Sheet']
test_df.shape

In [20]:
# Preview the first rows of the test sheet.
test_df.head()

In [21]:
# List the column names of the test sheet; the lookup helpers below index it by column name.
test_df.columns

In [22]:
# Image names that occur in both the test sheet and the training sheet (an empty set means no overlap)
test_image_names = test_df['Image Name'].values
set(test_image_names) & set(train_image_names)

In [23]:
def get_example_info(image_name: str) -> pd.DataFrame:
    """
    :param image_name: The image name of the example to consider
    :return: The rows of `test_df` whose 'Image Name' equals `image_name`. Filtering with a boolean
        mask yields a DataFrame (not a Series); it is empty when no row matches
    """

    is_requested_image = test_df['Image Name'] == image_name

    return test_df[is_requested_image]


def get_example_fine_grain_one_hot_classes(image_name: str):
    """
    :param image_name: The image name of the example to consider
    :return: The values of the columns selected by `fine_grain_classes` for the matching example
    """

    # NOTE(review): in this notebook `fine_grain_classes` is built as a dict {index: class name};
    # selecting columns with it probably does not pick the class-name columns - confirm which
    # columns are intended before relying on this helper
    example_info = get_example_info(image_name)
    selected_columns = example_info[fine_grain_classes]

    return selected_columns.values


def get_class_name(cls: str,
                   ground_truth: bool) -> str:
    """
    :param cls: The class name to convert
    :param ground_truth: Whether the name is needed for ground truth data or not
    :return: 'SPA' for 'Self Propelled Artillery', 'Air Defence' for 'Air Defense' when ground truth
        is requested, and the unchanged class name otherwise
    """

    # The abbreviation applies regardless of the ground truth flag
    if cls == 'Self Propelled Artillery':
        return 'SPA'

    # Only the ground truth data uses the 'Defence' spelling
    if cls == 'Air Defense' and ground_truth:
        return 'Air Defence'

    return cls


def get_class_index(image_name: str,
                    ground_truth: bool,
                    granularity: str = 'fine') -> int:
    """
    :param image_name: The image name of the example to consider
    :param ground_truth: Whether to get ground truth data or not
    :param granularity: Fine or coarse label
    :return: The index of the class whose column holds the largest value for the example
    :raises ValueError: If no example with the given image name exists
    """

    def column_name_generator(cls: str) -> str:
        """Name of the sheet column holding the value of class `cls` for the requested data."""
        class_name = get_class_name(cls, ground_truth)

        if ground_truth:
            return class_name

        return f"pred_{class_name}" if granularity == 'fine' else f"Exp 2 Prediction ({class_name})"

    classes = fine_grain_classes if granularity == 'fine' else coarse_grain_classes
    # `fine_grain_classes` is a dict {index: class name}: iterating over it directly yields the
    # integer keys, which would produce column names such as `0` or `pred_0`. The names are needed.
    class_names = list(classes.values()) if isinstance(classes, dict) else list(classes)

    w_info = get_example_info(image_name)

    if len(w_info) == 0:
        raise ValueError(f"No example with image name {image_name!r} was found")

    # Read the first matching row only, so the argmax runs over exactly one value per class even
    # when the image name appears on several rows
    class_values = np.array([w_info[column_name_generator(cls)].iloc[0] for cls in class_names])
    class_index = int(class_values.argmax())

    return class_index


def get_fine_grain_predicted_index(image_name: str) -> int:
    """
    :param image_name: The image name of the example to consider
    :return: The index of the predicted fine grain class of the example
    """

    predicted_index = get_class_index(image_name=image_name,
                                      ground_truth=False,
                                      granularity='fine')

    return predicted_index


def get_fine_grain_true_index(image_name: str) -> int:
    """
    :param image_name: The image name of the example to consider
    :return: The index of the ground truth fine grain class of the example
    """

    true_index = get_class_index(image_name=image_name,
                                 ground_truth=True,
                                 granularity='fine')

    return true_index


def get_class(image_name: str,
              ground_truth: bool,
              granularity: str = 'fine') -> str:
    """
    :param image_name: The image name of the example to consider
    :param ground_truth: Whether to get ground truth data or not
    :param granularity: Fine or coarse label
    :return: The entry of the class collection of the requested granularity found at the
        example's class index
    """

    # Any granularity other than 'fine' is treated as coarse
    if granularity == 'fine':
        classes = fine_grain_classes
    else:
        classes = coarse_grain_classes

    return classes[get_class_index(image_name=image_name,
                                   ground_truth=ground_truth,
                                   granularity=granularity)]


In [24]:
# Predicted and ground truth fine-grain class indices, one per test image and in the same order
pred_data = list(map(get_fine_grain_predicted_index, test_image_names))
true_data = list(map(get_fine_grain_true_index, test_image_names))

accuracy_score(y_true=true_data, y_pred=pred_data)

In [28]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can execute code
# when the file comes from an untrusted source; drop the flag if "vit_pred.npy" stores a plain
# numeric array - TODO confirm.
vit_pred_data = np.load("vit_pred.npy", allow_pickle=True)
# Accuracy of the array loaded from "vit_pred.npy" against `true_data`
accuracy_score(y_true=true_data, y_pred=vit_pred_data)

In [29]:
# Bar chart of how often each class index occurs in `pred_data`.
plot_bars(pred_data)

In [30]:
# Bar chart of how often each class index occurs in `true_data`.
plot_bars(true_data)