In [2]:
import pandas as pd
import os
from collections import Counter
import shutil
from sklearn.model_selection import StratifiedShuffleSplit

def calculate_oversampling_ratios(total_distribution, max_samples_per_class):
    """Compute, per class, the factor by which its sample count must grow to reach a target.

    Args:
        total_distribution: Mapping of class id -> current sample count
            (for example a ``collections.Counter``).
        max_samples_per_class: Target number of samples for every class.

    Returns:
        dict: class id -> multiplier, in the iteration order of
        ``total_distribution``. Classes below the target get
        ``max_samples_per_class / count``; classes at or above it get ``1``.
        Classes with a non-positive count also get ``1``: there are no rows to
        replicate, and dividing by the count would raise ``ZeroDivisionError``.
    """
    ratios = {}
    for class_id, count in total_distribution.items():
        # The lower bound guards against zero/negative counts (a Counter may hold them).
        if 0 < count < max_samples_per_class:
            ratios[class_id] = max_samples_per_class / count
        else:
            ratios[class_id] = 1
    return ratios

def random_oversample(folder, output_folder, seed=42, val_size=0.2,
                      regions=None, passthrough_regions=('frh04',)):
    """Split each region CSV into train/validation and randomly oversample the training part.

    For every region, ``{folder}/{region}.csv`` is processed as follows:

    * Regions listed in ``passthrough_regions`` are copied unchanged to
      ``{output_folder}/{region}.csv`` and take no part in the statistics.
    * All other regions are split with a stratified shuffle split on the
      ``classid`` column. The training part is oversampled so that every class
      reaches the row count of the largest training class: one row per unique
      ``id`` is kept, and the remainder is drawn with replacement from the
      class's training rows. Drawn rows get their ``idx`` value replaced by a
      string made of the prefix ``9999999`` and a counter that runs across all
      classes and regions. The result is shuffled and written to
      ``{output_folder}/{region}.csv``; the untouched validation part is
      written to ``{output_folder}/{region}_val.csv``.

    A text report of the class distributions is written to
    ``{output_folder}/class_distribution_info.txt``.

    Args:
        folder: Directory containing the input ``{region}.csv`` files. Each
            processed file must have ``classid`` and ``id`` columns.
        output_folder: Directory for all outputs; created if missing.
        seed: Random state used for the split, the sampling and the shuffle.
        val_size: ``test_size`` passed to ``StratifiedShuffleSplit``.
        regions: Iterable of region names to process. ``None`` (default) means
            ``['frh01', 'frh02', 'frh03', 'frh04']``.
        passthrough_regions: Region name(s) copied verbatim instead of being
            split and oversampled. Defaults to ``('frh04',)``.

    Returns:
        None. All results are written to ``output_folder``.
    """
    if regions is None:
        regions = ['frh01', 'frh02', 'frh03', 'frh04']
    if isinstance(passthrough_regions, str):
        # A bare string would otherwise be treated as a collection of characters.
        passthrough_regions = (passthrough_regions,)
    passthrough_regions = set(passthrough_regions)

    class_distribution_info = []
    total_distribution_original = Counter()
    total_distribution_oversampled = Counter()
    total_distribution_validation = Counter()
    os.makedirs(output_folder, exist_ok=True)

    duplicate_id_prefix = '9999999'
    duplicate_id_counter = 0  # shared across classes and regions so marked idx values never repeat

    for region in regions:
        csv_file_path = os.path.join(folder, f'{region}.csv')
        if region in passthrough_regions:
            shutil.copy(csv_file_path, os.path.join(output_folder, f'{region}.csv'))
            continue

        df = pd.read_csv(csv_file_path)
        original_distribution = Counter(df['classid'])
        total_distribution_original.update(original_distribution)

        splitter = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=seed)
        train_indices, val_indices = next(splitter.split(df, df['classid']))
        df_train = df.iloc[train_indices]
        df_val = df.iloc[val_indices]

        # Count each split once; these counters feed the ratios and the report.
        train_distribution = Counter(df_train['classid'])
        validation_distribution = Counter(df_val['classid'])

        max_samples_per_class = max(train_distribution.values())
        oversampling_ratios = calculate_oversampling_ratios(train_distribution, max_samples_per_class)

        oversampled_dfs = []
        for class_id, ratio in oversampling_ratios.items():
            # Filter the class rows once and reuse them for dedup, sizing and sampling.
            class_rows = df_train[df_train['classid'] == class_id]
            initial_df = class_rows.drop_duplicates(subset=['id'])
            remaining_sample_count = int(round(len(class_rows) * ratio)) - len(initial_df)

            if remaining_sample_count > 0:
                # Mark duplicates and assign unique indices. ``assign`` builds a new
                # frame, so nothing is written into a slice of ``df_train``.
                additional_samples = class_rows.sample(
                    remaining_sample_count, replace=True, random_state=seed
                ).assign(
                    idx=[
                        f"{duplicate_id_prefix}{duplicate_id_counter + offset}"
                        for offset in range(remaining_sample_count)
                    ]
                )
                duplicate_id_counter += remaining_sample_count
                oversampled_class_df = pd.concat([initial_df, additional_samples], ignore_index=True)
            else:
                oversampled_class_df = initial_df

            oversampled_dfs.append(oversampled_class_df)

        df_oversampled = (
            pd.concat(oversampled_dfs, ignore_index=True)
            .sample(frac=1, random_state=seed)
            .reset_index(drop=True)
        )
        new_distribution = Counter(df_oversampled['classid'])

        # Save the training set
        train_csv_file_path = os.path.join(output_folder, f'{region}.csv')
        df_oversampled.to_csv(train_csv_file_path, index=False)

        # Save the validation set
        val_csv_file_path = os.path.join(output_folder, f'{region}_val.csv')
        df_val.to_csv(val_csv_file_path, index=False)

        total_distribution_oversampled.update(new_distribution)
        total_distribution_validation.update(validation_distribution)

        # The validation split is never oversampled, so the report's "before" and
        # "after" validation entries intentionally carry the same counts.
        class_distribution_info.append(
            (region, original_distribution, new_distribution,
             validation_distribution, validation_distribution)
        )

    distribution_info_path = os.path.join(output_folder, 'class_distribution_info.txt')
    with open(distribution_info_path, 'w') as file:
        file.write(f"Seed used for oversampling: {seed}\n\n")
        for region, original, new, val_original, val_after_oversample in class_distribution_info:
            file.write(f"Original distribution for {region}:\n{original}\n")
            file.write(f"New distribution after oversampling for {region}:\n{new}\n")
            file.write(f"Validation distribution before oversampling for {region}:\n{val_original}\n")
            file.write(f"Validation distribution after oversampling for {region}:\n{val_after_oversample}\n\n")
        file.write("Total original distribution across all regions:\n")
        file.write(str(total_distribution_original))
        file.write("\n\nTotal oversampled distribution across all regions:\n")
        file.write(str(total_distribution_oversampled))
        file.write("\n\nTotal validation distribution across all regions:\n")
        file.write(str(total_distribution_validation))


In [3]:
# Example usage: oversample the region CSVs found under `input_folder` and write
# the training CSVs, validation CSVs and the class-distribution report to
# `output_folder`.
input_folder = 'main/1'  # Replace with your input folder path
output_folder = 'main/1-RO'  # Replace with your desired output folder path
random_oversample(input_folder, output_folder)  # uses the defaults seed=42, val_size=0.2