In [2]:
import pandas as pd
import pathlib
from pathlib import PurePath, Path
import numpy as np
import os
from tqdm import tqdm

In [3]:
def has_parent_X(path: Path, X: str) -> bool:
    """Tell whether any ancestor directory of ``path`` is named ``X``.

    Only the entries of ``path.parents`` are inspected, so the final
    component of ``path`` itself is never compared against ``X``.

    Args:
        path: Path whose ancestors are searched.
        X: Directory name to look for (exact, case-sensitive match).

    Returns:
        True if at least one ancestor's ``name`` equals ``X``, else False.
    """
    return any(ancestor.name == X for ancestor in path.parents)

In [4]:
# Define the base directory
data_dir = Path('training_datasets')

# Every .fits file below data_dir, at any depth. A single "**" already matches
# any number of directory levels (including zero), so the previous stacked
# "**/**/**/*.fits" pattern was redundant. Sorting gives a reproducible row
# order and lets tqdm show a total.
file_list = sorted(data_dir.rglob("*.fits"))

# Labelling rules, checked in order; the first rule that matches wins.
# (dataset directory, split directory, class label, skip paths containing "HSC-I")
CLASS_RULES = [
    ("dual_AGN_datasets", "final_train_data", "dual_agn", False),
    ("single_AGN_datasets", "confirmed_single_AGN", "single_agn", True),
    ("merger_datasets", "new_train_data", "merger", False),
    ("offset_AGN_datasets", "train_data", "offset_agn", True),
    ("empty_space_dataset", "empty_space_train_data", "empty_space", False),
]


def has_parent_X(path, parent_dir):
    """Return True if ``parent_dir`` is one of the components of ``path``.

    Note: this shadows the ``has_parent_X`` defined in an earlier cell and,
    unlike that version, tests membership in ``path.parts`` (which also
    contains the final component of the path).
    """
    return parent_dir in path.parts


def classify_fits_path(path):
    """Return the class label for ``path`` according to CLASS_RULES.

    A rule matches when both its dataset directory and its split directory
    appear in the path and, for rules flagged to skip HSC-I, the path has no
    "HSC-I" component. A path rejected by one rule is still tested against
    the following rules.

    Returns:
        The label of the first matching rule, or None if no rule matches.
    """
    for dataset_dir, split_dir, label, skip_hsc_i in CLASS_RULES:
        if not (has_parent_X(path, dataset_dir) and has_parent_X(path, split_dir)):
            continue
        if skip_hsc_i and has_parent_X(path, "HSC-I"):
            continue
        return label
    return None


# Collect plain dicts and build the DataFrame once at the end: appending with
# df.loc[len(df)] inside the loop gets slower as the frame grows.
records = []
for fl in tqdm(file_list):
    label = classify_fits_path(fl)
    if label is None:
        # Files that match no rule are left out of the manifest.
        continue
    # Remove the 'training_datasets/' prefix from the path
    relative_path = fl.relative_to(data_dir).as_posix()
    records.append({"file_name": relative_path, "classes": label})

# Passing the columns explicitly keeps the schema even when nothing matched.
df = pd.DataFrame(records, columns=["file_name", "classes"])

276266it [07:22, 624.67it/s] 


In [5]:
# Persist the manifest as CSV (with pandas' default settings the index is
# written as the first column), then show the frame as the cell output.
df.to_csv(path_or_buf="training_datasets/info.csv")
df

Unnamed: 0,file_name,classes
0,dual_AGN_datasets/final_train_data/object10000...,dual_agn
1,dual_AGN_datasets/final_train_data/object10000...,dual_agn
2,dual_AGN_datasets/final_train_data/object10001...,dual_agn
3,dual_AGN_datasets/final_train_data/object10003...,dual_agn
4,dual_AGN_datasets/final_train_data/object10003...,dual_agn
...,...,...
108801,single_AGN_datasets/confirmed_single_AGN/UNK_1...,single_agn
108802,single_AGN_datasets/confirmed_single_AGN/UNK_1...,single_agn
108803,single_AGN_datasets/confirmed_single_AGN/UNK_1...,single_agn
108804,single_AGN_datasets/confirmed_single_AGN/UNK_1...,single_agn


In [10]:
import pandas as pd
# Count the occurrences of each class
class_counts = df['classes'].value_counts()

# Find the minimum count
min_count = class_counts.min()
print(class_counts)

# Fixed seed so the balanced subset (and therefore remaining_df) is identical
# on every run instead of changing with each execution of this cell.
BALANCE_SEED = 42

# Sample min_count instances from each class
balanced_df = (
    df.groupby('classes')
    .sample(n=int(min_count), random_state=BALANCE_SEED)
    .reset_index(drop=True)
)
# Everything that was not drawn into the balanced subset, matched by file name.
remaining_df = df[~df["file_name"].isin(balanced_df["file_name"])]

print(balanced_df)
print(remaining_df)

dual_agn       55921
offset_agn     54457
merger         11009
empty_space     9336
single_agn      5079
Name: classes, dtype: int64
                                               file_name     classes
0      dual_AGN_datasets/final_train_data/object26599...    dual_agn
1      dual_AGN_datasets/final_train_data/object3647_...    dual_agn
2      dual_AGN_datasets/final_train_data/object23089...    dual_agn
3      dual_AGN_datasets/final_train_data/object19495...    dual_agn
4      dual_AGN_datasets/final_train_data/object23402...    dual_agn
...                                                  ...         ...
25390  single_AGN_datasets/HSC-I/confirmed_single_AGN...  single_agn
25391  single_AGN_datasets/confirmed_single_AGN/QSO_1...  single_agn
25392  single_AGN_datasets/confirmed_single_AGN/QSO_0...  single_agn
25393  single_AGN_datasets/confirmed_single_AGN/QSO_0...  single_agn
25394  single_AGN_datasets/confirmed_single_AGN/QSO_0...  single_agn

[25395 rows x 2 columns]
             

In [11]:
# Write the manifest to disk again.
# NOTE(review): this repeats the earlier export of `df` to the same path, while
# balanced_df / remaining_df are not saved in the cells visible here. Confirm
# which frame was meant to be written.
df.to_csv(path_or_buf="training_datasets/info.csv")