In [86]:
# Importing necessary libraries
import pandas as pd

# Read both source tables from the local data/ directory
# (adjust these paths if your copies are stored elsewhere).
compas_subset = pd.read_csv('data/broward_clean.csv')
chicago_faces = pd.read_csv('data/CFD.csv')

# Preview the first rows of each table to confirm it loaded correctly.
for heading, frame in (
    ("chicago_faces:", chicago_faces),
    ("\ncompas_subset:", compas_subset),
):
    print(heading)
    display(frame.head())


chicago_faces:


Unnamed: 0,Target,Race,Gender,Age
0,AF-200,A,F,32.571429
1,AF-201,A,F,23.666667
2,AF-202,A,F,24.448276
3,AF-203,A,F,22.758621
4,AF-204,A,F,30.137931



compas_subset:


Unnamed: 0,block_num,id,race,sex,age
0,1,8307,2,1,53
1,1,10406,2,0,43
2,1,8267,2,0,46
3,1,5593,1,0,37
4,1,61,2,1,51


We want to convert every attribute to a numeric code, using the following scheme:
<!-- Age:
0: (0, 21]
1: (21, 28]
2: (28, 35]
3: (35, 42]
4: (42, Inf]

Gender:
0: Male
1: Female 

Race:
1: Caucasian
2: African-American
3: Latino
4: Asian
5: Other -->


Changes to the Compas Subset

In [87]:
#Changes to the COMPAS Subset Dataset
def categorize_age(age):
    """Return the age-bracket code (0-4) for a numeric age.

    Brackets are closed on the right: up to 21 -> 0, up to 28 -> 1,
    up to 35 -> 2, up to 42 -> 3, anything above 42 -> 4.
    Negative ages and NaN fall in no bracket and yield None.
    """
    # Negative values and NaN fail this comparison and get no bracket.
    if not age >= 0:
        return None
    # The first upper bound that the age does not exceed decides the code.
    for code, upper_bound in enumerate((21, 28, 35, 42)):
        if age <= upper_bound:
            return code
    return 4

# Recode the COMPAS table in one chain:
#   - age_range: bracket code derived from 'age' via categorize_age
#   - race: every code above 5 is collapsed into 5
#   - id: renamed to def_id
compas_subset = (
    compas_subset
    .assign(
        age_range=compas_subset['age'].apply(categorize_age),
        race=compas_subset['race'].clip(upper=5),
    )
    .rename(columns={'id': 'def_id'})
)

print("\nCompas Subset after age mapping:")
display(compas_subset.head())



Compas Subset after age mapping:


Unnamed: 0,block_num,def_id,race,sex,age,age_range
0,1,8307,2,1,53,4
1,1,10406,2,0,43,4
2,1,8267,2,0,46,4
3,1,5593,1,0,37,3
4,1,61,2,1,51,4


In [88]:
# df1 is a second name for compas_subset (no copy is made), so the column
# changes below are visible through both names.
df1 = compas_subset

# Cast the three coded attributes to text so they can be joined with '-'.
for part in ('race', 'sex', 'age_range'):
    df1[part] = df1[part].astype(str)

# Composite matching key of the form "<race>-<sex>-<age_range>".
df1['feature'] = df1['race'].str.cat([df1['sex'], df1['age_range']], sep='-')

# Show the first rows, now including the 'feature' column.
print(df1.head())

   block_num  def_id race sex  age age_range feature
0          1    8307    2   1   53         4   2-1-4
1          1   10406    2   0   43         4   2-0-4
2          1    8267    2   0   46         4   2-0-4
3          1    5593    1   0   37         3   1-0-3
4          1      61    2   1   51         4   2-1-4


Changes to the Chicago Faces Dataset

In [89]:
# Cast the three text columns to pandas' dedicated 'string' dtype.
text_columns = ['Target', 'Race', 'Gender']
chicago_faces[text_columns] = chicago_faces[text_columns].astype('string')

def categorize_gender(gender):
    """Map a gender label to its numeric code.

    Parameters
    ----------
    gender : str or missing value
        Label such as 'M' or 'F'; case and surrounding whitespace are ignored.

    Returns
    -------
    int or None
        0 for male, 1 for female, None for any other label or for a
        missing / non-string value.
    """
    # Missing values (None, NaN, pd.NA) have no .lower(); treat them like any
    # other unrecognised category instead of raising AttributeError.
    if not isinstance(gender, str):
        return None
    return {'m': 0, 'f': 1}.get(gender.strip().lower())
    
def categorize_age(age):
    """Return the age-bracket code (0-4) for a numeric age.

    Codes: 0 for ages up to 21, 1 up to 28, 2 up to 35, 3 up to 42,
    and 4 above 42. Negative ages and NaN yield None.

    NOTE(review): this redefines the categorize_age from the COMPAS cell
    with the same bracket boundaries.
    """
    # Negative values and NaN fail this comparison and get no bracket.
    if not (age >= 0):
        return None
    if age <= 21:
        return 0
    if age <= 28:
        return 1
    if age <= 35:
        return 2
    if age <= 42:
        return 3
    return 4
    
# Numeric codes for the single-letter race labels in the Chicago Faces table.
race_mapping = {
    'A': 4,  # Asian
    'B': 2,  # Black (African American)
    'L': 3,  # Latino
    'W': 1   # White (Caucasian)
}

def categorize_race(race):
    """Map a single-letter race label to its numeric code.

    Parameters
    ----------
    race : str or missing value
        Label such as 'A', 'B', 'L' or 'W'; case and surrounding whitespace
        are ignored.

    Returns
    -------
    int or None
        The code from race_mapping, 5 ("Other") for any label that is not in
        the mapping, or None when the value is missing / not a string.
    """
    # A missing value stays missing rather than being counted as "Other".
    if not isinstance(race, str):
        return None
    # Unlisted labels fall back to 5, the "Other" code of the notebook's
    # coding scheme and the same code the COMPAS recode collapses into,
    # instead of None, which matches none of the numeric COMPAS race codes.
    return race_mapping.get(race.strip().upper(), 5)

# Replace the Race and Gender labels with their numeric codes, add the age
# bracket, then switch to the lower-case column names used by the COMPAS table.
chicago_faces = (
    chicago_faces
    .assign(
        Race=chicago_faces['Race'].apply(categorize_race),
        Gender=chicago_faces['Gender'].apply(categorize_gender),
        Age_Range=chicago_faces['Age'].apply(categorize_age),
    )
    .rename(columns={
        'Target': 'image_id',
        'Race': 'race',
        'Gender': 'sex',
        'Age': 'age',
        'Age_Range': 'age_range',
    })
)

In [91]:
# df2 is a second name for chicago_faces (no copy is made), so the column
# changes below are visible through both names.
df2 = chicago_faces

# Cast the three coded attributes to text so they can be joined with '-'.
for part in ('race', 'sex', 'age_range'):
    df2[part] = df2[part].astype(str)

# Composite matching key of the form "<race>-<sex>-<age_range>".
df2['feature'] = df2['race'].str.cat([df2['sex'], df2['age_range']], sep='-')

# Show the first rows, now including the 'feature' column.
print(df2.head())


  image_id race sex        age age_range feature
0   AF-200    4   1  32.571429         2   4-1-2
1   AF-201    4   1  23.666667         1   4-1-1
2   AF-202    4   1  24.448276         1   4-1-1
3   AF-203    4   1  22.758621         1   4-1-1
4   AF-204    4   1  30.137931         2   4-1-2


In [93]:
# Trim each table down to its identifier column(s) plus the matching key.
df1 = df1.loc[:, ['block_num', 'def_id', 'feature']]
df2 = df2.loc[:, ['image_id', 'feature']]

display(df1.head(), df2.head())

Unnamed: 0,block_num,def_id,feature
0,1,8307,2-1-4
1,1,10406,2-0-4
2,1,8267,2-0-4
3,1,5593,1-0-3
4,1,61,2-1-4


Unnamed: 0,image_id,feature
0,AF-200,4-1-2
1,AF-201,4-1-1
2,AF-202,4-1-1
3,AF-203,4-1-1
4,AF-204,4-1-2


In [95]:
# Left-join df2 onto df1 via the shared 'feature' key: every df1 row is kept
# and is repeated once for each df2 row carrying the same key.
merged_df = df1.merge(df2, how='left', on='feature')
#merged_df.to_csv('merged_df.csv', index=False)

In [96]:
# Order the rows by block, then defendant, then image id, so the first row of
# each (block_num, def_id) group is the one with the smallest image_id.
merged_df_sorted = merged_df.sort_values(['block_num', 'def_id', 'image_id'])

def assign_first_available_image(df):
    """Keep the first row of every (def_id, block_num) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'def_id', 'block_num' and 'image_id' columns; its row
        order decides which row counts as "first".

    Returns
    -------
    pandas.DataFrame
        The columns 'def_id', 'block_num' and 'image_id' of the retained
        rows, with their original index labels.

    NOTE(review): rows are de-duplicated per defendant/block pair only, so the
    same image_id can be returned for several defendants.
    """
    pair_keys = ['def_id', 'block_num']
    # True for every row whose pair has already appeared further up.
    is_repeat = df.duplicated(subset=pair_keys, keep='first')
    return df.loc[~is_repeat, pair_keys + ['image_id']]

# Collapse the sorted candidates to one row per defendant/block pair.
first_available_mapping = merged_df_sorted.pipe(assign_first_available_image)

# Preview the resulting mapping.
print(first_available_mapping.head())


     def_id  block_num image_id
22       61          1   BF-203
927     855          1   BM-004
854     921          1   BM-011
816    1022          1   BF-001
275    1024          1   BM-001
