In [1]:
import os

import pandas as pd

#######################################################
########               DEFINE             #############
#######################################################

# Input dataset: name of the CSV file inside ./source/ (swap the active line to change dataset)
#source = 'd_full_clean.csv'
source = 'd_clean_remove_small_samples.csv'

# Outlier-removal mode -> 'iqr': 25th and 75th percentiles, 'ipr': 10th and 90th percentiles
outlier_removal = 'iqr'

# Sample sizes to generate, one output file per size.
# More sizes are generated here than were used: only 5, 10, 25, 50, 100 and 150 were used.
generate_sample_sizes = [
    5, 10, 25, 50,
    75, 100, 150, 250,
]

#######################################################
########          END OF DEFINE           #############
#######################################################

## Load the selected source file from ./source/ into dataframe df
df = pd.read_csv(f'./source/{source}')

## Lower quantile for each outlier-removal mode; the upper quantile is taken as (1 - value),
## i.e. 25th/75th percentiles for 'iqr' and 10th/90th for 'ipr' (no need to change anything here)
removal_range = dict(iqr=0.25, ipr=0.10)

# Indices of df rows that have already been drawn. The set lives outside the
# sample-size loop, so a row is never sampled twice -- not even for a different size.
sampled_indices = set()

def get_representative_samples(group, proportion=0.1, random_seed=42, n_samples=None):
    """Draw outlier-free, not-yet-sampled rows from one ``target_material`` group.

    Reads the module-level ``removal_range`` / ``outlier_removal`` settings and
    records every returned row in the module-level ``sampled_indices`` set.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to sample from (the caller passes one ``target_material`` group of ``df``).
    proportion : float, default 0.1
        Fraction of ``group`` to draw. Only used when ``n_samples`` is None.
    random_seed : int, default 42
        Seed passed to ``DataFrame.sample``.
    n_samples : int, optional
        Exact number of rows to draw. Preferred over ``proportion``: converting a
        count to a fraction and back (``k / len(group) * len(group)``) can land
        just below ``k`` in floating point and be truncated to ``k - 1``.

    Returns
    -------
    pandas.DataFrame
        At most ``max(1, n_samples)`` rows of ``group``, keeping their original
        index. Fewer rows (possibly none) are returned when not enough unused,
        non-outlier rows are left.
    """
    # Resolve the requested row count; at least one row is requested from every group.
    if n_samples is None:
        n_samples = int(len(group) * proportion)
    n_samples = max(1, int(n_samples))

    # Outlier detection is done on the numeric columns only
    numeric_group = group.select_dtypes(include=['number'])

    # Fences: a row is an outlier if ANY numeric column lies outside
    # [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], with Q1/Q3 chosen by `outlier_removal`.
    lower_quantile = removal_range[outlier_removal]
    Q1 = numeric_group.quantile(lower_quantile)
    Q3 = numeric_group.quantile(1 - lower_quantile)
    IQR = Q3 - Q1
    is_outlier = ((numeric_group < (Q1 - 1.5 * IQR)) | (numeric_group > (Q3 + 1.5 * IQR))).any(axis=1)
    filtered_group = group[~is_outlier]

    # Filter out previously sampled indices
    filtered_group = filtered_group.loc[~filtered_group.index.isin(sampled_indices)]

    # If there are not enough rows left, return what we have; otherwise sample
    if len(filtered_group) < n_samples:
        sampled = filtered_group
    else:
        sampled = filtered_group.sample(n=n_samples, random_state=random_seed, replace=False)

    sampled_indices.update(sampled.index)
    return sampled


# Relative frequency of each material in the source data (identical for every sample size)
material_share = df['target_material'].value_counts(normalize=True)

for ss in generate_sample_sizes:
    # Determine the proportionate number of samples for each group
    total_samples = ss  # Total number of samples you want in the final dataset
    proportions = (material_share * total_samples).astype(int)

    # Apply the function to each group, passing the integer quota directly
    representative_samples_list = pd.concat([
        get_representative_samples(group, n_samples=proportions[material])
        for material, group in df.groupby('target_material')
    ]).reset_index(drop=True)

    # Ensure the total number of samples meets the required size
    if len(representative_samples_list) < total_samples:
        # Top up with random unused rows.
        # NOTE: these rows come from the whole of df, i.e. without the per-group outlier filter.
        remaining_indices = df.loc[~df.index.isin(sampled_indices)].index
        additional_samples_needed = total_samples - len(representative_samples_list)
        if additional_samples_needed > len(remaining_indices):
            raise ValueError(
                f'Cannot build a sample of size {total_samples}: '
                f'{additional_samples_needed} more rows are needed but only '
                f'{len(remaining_indices)} unsampled rows remain in the source data.'
            )
        additional_samples = df.loc[remaining_indices].sample(n=additional_samples_needed, random_state=42, replace=False)
        sampled_indices.update(additional_samples.index)
        representative_samples_list = pd.concat([representative_samples_list, additional_samples]).reset_index(drop=True)
    elif len(representative_samples_list) > total_samples:
        # Too many rows (every group contributes at least one): trim at random.
        # NOTE: the rows dropped here stay in sampled_indices and are not offered to later sizes.
        representative_samples_list = representative_samples_list.sample(n=total_samples, random_state=42, replace=False).reset_index(drop=True)

    # Save the representative samples to a new CSV file, e.g. ./org_samples/d_clean_remove_small_samples_iqr/rs_size_50.csv
    # (This file contains 50 unique representative rows of the original source data, based on target_material frequencies)
    output_dir = f'./org_samples/{source.split(".")[0]}_{outlier_removal}'
    os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories
    representative_samples_list.to_csv(f'{output_dir}/rs_size_{total_samples}.csv', index=False)

    # Display the representative samples
    print(f'Representative samples for size {total_samples}:\n', representative_samples_list)


Representative samples for size 5:
   target_material  target_thickness  pulse_width   energy  spot_size  \
0       aluminium             6.000           60    7.000     15.000   
1            gold             4.000         2280   46.737      2.870   
2   polypropylene             0.100          500  135.270      4.458   
3         plastic             0.634           30    2.378      3.300   
4         plastic             0.503           30    2.331      3.300   

                intensity            power  cutoff_energy  
0    45780000000000000000  116700000000000           8.00  
1   219700000000000000000   20500000000000          17.60  
2  1202000000000000000000  270500000000000          49.25  
3   642800000000000000000   79280000000000           4.50  
4   630000000000000000000   77700000000000           4.00  
Representative samples for size 10:
   target_material  target_thickness  pulse_width   energy  spot_size  \
0         plastic             0.700          279    2.348     

In [2]:
### Sanity check: the relative target_material frequencies should broadly agree with those of the original source data
### (e.g. plastic majority, followed by gold and then polystyrene).
### representative_samples_list holds the sample produced by the last loop iteration above.
representative_samples_list.loc[:, 'target_material'].value_counts(normalize=True)

target_material
plastic        0.920
gold           0.048
polystyrene    0.032
Name: proportion, dtype: float64

In [3]:
# Absolute number of rows per target_material in the last generated sample
representative_samples_list.loc[:, 'target_material'].value_counts()

target_material
plastic        230
gold            12
polystyrene      8
Name: count, dtype: int64