In [1]:
import pandas as pd
import random, os, shutil

In [2]:
ganfd = pd.read_csv('GANFD_Data_V1.csv')
men_df = ganfd[(ganfd['gender_fem'] < 0.10)]
women_df = ganfd[(ganfd['gender_fem'] > 0.90)]

In [3]:
def process_group(group):
    """
    Process each group to ensure it contains at most two rows based on the given conditions.
    Assign a 'skintone' value: 0 for lower race_ba, 1 for higher race_ba.

    Parameters:
    group (DataFrame): Grouped DataFrame for a specific 'set'.

    Returns:
    DataFrame: Processed group with two rows max and 'skintone' column.
    """
    # Filter for Black or Multiple in race_guess
    group = group[group['race_guess'].isin(['Black', 'Multiple'])]

    # Initialize selected_rows
    selected_rows = pd.DataFrame()

    # If there are more than two Blacks, keep the highest and lowest race_ba
    black_rows = group[group['race_guess'] == 'Black']
    if len(black_rows) > 2:
        selected_rows = pd.concat([black_rows.nlargest(1, 'race_ba'), black_rows.nsmallest(1, 'race_ba')])

    # If there is one Black and one or more Multiple
    elif len(black_rows) == 1 and len(group[group['race_guess'] == 'Multiple']) >= 1:
        multiple_rows = group[group['race_guess'] == 'Multiple']
        highest_multiple = multiple_rows.nlargest(1, 'race_ba')
        selected_rows = pd.concat([black_rows, highest_multiple])

    # If there is only Multiple, keep the two with the highest race_ba
    elif len(black_rows) == 0 and len(group[group['race_guess'] == 'Multiple']) > 1:
        multiple_rows = group[group['race_guess'] == 'Multiple']
        selected_rows = multiple_rows.nlargest(2, 'race_ba')

    # Ensure the result has exactly two rows
    if len(selected_rows) != 2:
        return pd.DataFrame()  # Return empty DataFrame if conditions aren't met

    # Assign 'skintone': 0 for lower race_ba, 1 for higher race_ba
    selected_rows = selected_rows.sort_values(by='race_ba').reset_index(drop=True)
    selected_rows['skintone'] = [0, 1]

    return selected_rows

In [4]:
# Apply the processing function to each 'set'
sampled_men_df = (
    men_df.groupby('set', group_keys=False)
    .apply(process_group)
    .reset_index(drop=True)
)
sampled_men_df['gender'] = 'men'

In [5]:
# Apply the processing function to each 'set'
sampled_women_df = (
    women_df.groupby('set', group_keys=False)
    .apply(process_group)
    .reset_index(drop=True)
)
sampled_women_df['gender'] = 'women'

In [6]:
sampled_df = pd.concat([sampled_men_df, sampled_women_df])

In [7]:
trimmed_df = sampled_df[['full_ID', 'set', 'gender', 'skintone', 'race_guess', 'race_ba']]
trimmed_df['condition'] = trimmed_df['gender'].str.lower().str[0] + trimmed_df['skintone'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trimmed_df['condition'] = trimmed_df['gender'].str.lower().str[0] + trimmed_df['skintone'].astype(str)


In [8]:
random.seed(1048596)

In [9]:
men_sets = trimmed_df.loc[trimmed_df.gender == 'men'].set.unique().tolist()
men_sets_samples = random.sample(men_sets, 10)

In [10]:
women_sets = trimmed_df.loc[trimmed_df.gender == 'women'].set.unique().tolist()
women_sets_samples = random.sample(women_sets, 10)

In [11]:
# Filter the original DataFrame to include all rows belonging to the sampled sets
sampled_sets = men_sets_samples + women_sets_samples
balanced_df = trimmed_df[trimmed_df['set'].isin(sampled_sets)]

In [12]:
# Source folder
source_folder = 'GANFD - Cropped Grey Backgrounds'

# Destination folders
dest_folders = {
    'Images': balanced_df['full_ID'].to_list()
}

# Create destination folders if they don't exist
for folder in dest_folders.keys():
    os.makedirs(folder, exist_ok=True)

# Iterate through the files in the source folder
for filename in os.listdir(source_folder):
    if filename.endswith('.jpg'):  # Only process .jpg files
        file_id = os.path.splitext(filename)[0]  # Extract the ID (without .jpg)
        
        # Check which group the file belongs to and copy it
        for dest_folder, id_list in dest_folders.items():
            if file_id in id_list:
                source_path = os.path.join(source_folder, filename)
                dest_path = os.path.join(dest_folder, filename)
                shutil.copy(source_path, dest_path)
                break  # Move to the next file once copied

In [13]:
balanced_df.to_csv('image_lookup.csv', index=False)