# Processing the 1k Image dataset for DALL-E & CNN

In [2]:
import pandas as pd
import numpy as np
import json

### combining the CloudResearch datasets into one DF

In [3]:
respondent_df = pd.read_csv("1k_dataset_1.csv")

for i in np.arange(1, 2):
    csv = "1k_dataset_" + str(i) + ".csv"
    csv_df = pd.read_csv(csv)
    respondent_df = pd.concat([respondent_df, csv_df])

respondent_df

Unnamed: 0,ParticipantId,StartTime (America/Chicago),CompletionTime (America/Chicago),CompletionType,CompletionCode,Payment,Bonus,Fees,Status,Task Data,...,Education,Sex,Occupation Field,Relationship/Marital Status,Political Party,Gender,Country Of Residence,Household Income,Race,Employment Status
0,21C99EDBFCD34AA1B3538677865FA3C4,05/06/2024 06:34:40 PM,,,,0,0,0,Returned,,...,"Some college, but no degree",Female,Business Management & Administration,In a relationship,Democrat,Woman,United States,Prefer not to say,Prefer not to say,Unemployed
1,AB4D929E25A942249E8D34299746B1A6,05/06/2024 06:34:40 PM,,,,0,0,0,Returned,,...,"Bachelor's degree (for example: BA, AB, BS)",Male,Arts,In a relationship,Democrat,Man,United States,"$60,000-$69,999",An ethnicity not listed here,Full-time
2,3CA5DB2C4DAC403EA4D1A0AC45D33054,05/06/2024 06:34:51 PM,,,,0,0,0,Returned,,...,"Bachelor's degree (for example: BA, AB, BS)",Male,Information Technology,Married,Democrat,Man,United States,"$200,000-$224,999",White,Full-time
3,259269483C7F499FB657D6B8BD0C1C97,05/06/2024 06:34:53 PM,05/06/2024 06:36:48 PM,Template,,0,0,0,Pending,"{""RowData"":[{""CellData"":""4.0"",""ColumnHeader"":""...",...,"Some college, but no degree",Male,Other,Divorced,Democrat,Man,United States,"$30,000-$39,999",White,Full-time
4,5E46EF4B2865489D9A79F709596A315F,05/06/2024 06:34:54 PM,05/06/2024 06:42:59 PM,Template,,0,0,0,Pending,"{""RowData"":[{""CellData"":""4.0"",""ColumnHeader"":""...",...,"Master's degree (for example: MA, MS, MEng, ME...",Female,Education & Training,Married,Independent,Woman,United States,"$100,000-$124,999",White,Full-time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,48CF0836208947F3A9C1303E7ED49A0A,05/06/2024 06:34:13 PM,05/06/2024 06:38:03 PM,Template,,0,0,0,Pending,"{""RowData"":[{""CellData"":""5.0"",""ColumnHeader"":""...",...,"Associate degree (for example: AA, AS)",Female,Medicine,Widowed,Independent,Woman,United States,"$50,000-$59,999",Black or African American,Full-time
1350,61DC9B9B1D064196A5DF2B7F6F6ADC80,05/06/2024 06:34:25 PM,,,,0,0,0,Returned,,...,"Master's degree (for example: MA, MS, MEng, ME...",Female,Arts,Single,Democrat,Woman,United States,"Less than $10,000",White,Unemployed
1351,FCC8D9FC3CAD44BC9AA129B5A96241D4,05/06/2024 06:34:28 PM,,,,0,0,0,Returned,,...,"Some college, but no degree",Female,Information Technology,Single,Democrat,Woman,United States,"$30,000-$39,999",Black or African American,Full-time
1352,128BAFEE1C27463DACD32EBFB0AA0F80,05/06/2024 06:34:29 PM,05/06/2024 06:37:19 PM,Template,,0,0,0,Pending,"{""RowData"":[{""CellData"":""1.0"",""ColumnHeader"":""...",...,"Bachelor's degree (for example: BA, AB, BS)",Female,Arts,Married,Democrat,Woman,United States,"$250,000 or more",White,Full-time


### cleaning and formatting df

In [4]:
respondent_df.columns

Index(['ParticipantId', 'StartTime (America/Chicago)',
       'CompletionTime (America/Chicago)', 'CompletionType', 'CompletionCode',
       'Payment', 'Bonus', 'Fees', 'Status', 'Task Data', 'Submitted Data',
       'Age', 'Education', 'Sex', 'Occupation Field',
       'Relationship/Marital Status', 'Political Party', 'Gender',
       'Country Of Residence', 'Household Income', 'Race',
       'Employment Status'],
      dtype='object')

In [5]:
# want only actual responses (responses that are "Pending")

respondent_df = respondent_df[respondent_df["Status"] == "Pending"]

In [6]:
relevant_columns = ['ParticipantId', 'Task Data', 'Submitted Data',
       'Age', 'Education', 'Sex', 'Occupation Field',
       'Relationship/Marital Status', 'Political Party', 'Gender',
       'Country Of Residence', 'Household Income', 'Race',
       'Employment Status']

respondent_df = respondent_df[relevant_columns]
respondent_df = respondent_df.reset_index(drop = True)

In [7]:
respondent_demographics_df = respondent_df.drop(columns = ["Task Data", "Submitted Data"])

In [8]:
respondent_data_df = respondent_df[["Task Data", "Submitted Data"]]

In [9]:
respondent_data_df["Submitted Data"][0]

'{"Data":{"taskData":{"age":"40s","jaw":"Rounded","eyes":"Hazel","hair":"Brown short","nose":"Smaller sharp","race":"white","beard":"None","gender":"female","eyebrows":"Brown","expression":"smiling"}}}'

In [10]:
def get_data_from_json(task_json: str, task: bool):
    """
    Returns the value associated with the given field attribute (ex: Race, Memorable)
    Args:
        field: the specific field that is being retrieved from the JSON 
        task: Boolean True if it is task data, False if it is respondent data (accessing JSON differs)
    """
    json_object = json.loads(task_json)
    if task: 
        json_object_rows = json_object["RowData"]
        task_columns = [row["ColumnHeader"] for row in json_object_rows]
        task_columns = [column + "_og_img" for column in task_columns]
        json_object_row_headers_values = [row["CellData"] for row in json_object_rows]
        res_dict = {task_columns[i]: json_object_row_headers_values[i] for i in range(len(task_columns))}
        return res_dict
    else:
        json_object_rows = json_object["Data"]
        description_data = json_object_rows["taskData"]
        description_headers = ([*description_data.keys()])
        description_headers = [column + "_description" for column in description_headers]
        description_values = ([*description_data.values()])
        res_dict = {description_headers[i]: description_values[i] for i in range(len(description_values))}
        return res_dict


In [11]:
def reconstruct_output_df(respondent_df):
    """
    Returns the value associated with the given field attribute (ex: Race, Memorable)
    Args:
        field: the specific field that is being retrieved from the JSON 
        task: Boolean True if it is task data, False if it is respondent data (accessing JSON differs)
    """
    
    # creating the new columns for each row of the task data
    expanded_task_data = respondent_df.apply(lambda row: get_data_from_json(row["Task Data"], task = True), axis = 1, result_type = 'expand')
    respondent_df = pd.concat([expanded_task_data, respondent_df], axis = "columns")

    # getting the columns associated with one respondent description (same for all respondents so generalizable to other rows)
    expanded_description_data = respondent_df.apply(lambda row: get_data_from_json(row["Submitted Data"], task = False), axis = 1, result_type = 'expand')
    respondent_df = pd.concat([expanded_description_data, respondent_df], axis = "columns")
    
    return respondent_df


In [12]:
race_testing_df = reconstruct_output_df(respondent_df)

In [13]:
race_testing_df.to_csv("1k_dataset_output.csv", mode="w", index=False)

In [14]:
race_testing_df

Unnamed: 0,age_description,jaw_description,eyes_description,hair_description,nose_description,race_description,beard_description,gender_description,eyebrows_description,expression_description,...,Education,Sex,Occupation Field,Relationship/Marital Status,Political Party,Gender,Country Of Residence,Household Income,Race,Employment Status
0,40s,Rounded,Hazel,Brown short,Smaller sharp,white,,female,Brown,smiling,...,"Some college, but no degree",Male,Other,Divorced,Democrat,Man,United States,"$30,000-$39,999",White,Full-time
1,mid 30s,"pointed, soft","small, brown, friendly","short, brown, straight","wide, small nostrils",white,"brown, scruffy, mustash",male,"brown, whispy",smiling,...,"Master's degree (for example: MA, MS, MEng, ME...",Female,Education & Training,Married,Independent,Woman,United States,"$100,000-$124,999",White,Full-time
2,90's,a round jaw,light blue eyes,short cropped hair,wide nose,black,no beard,female,thinning light brows,smiling,...,High school graduate - high school diploma or ...,Female,Retail,In a relationship,Prefer not to say,Woman,United States,Prefer not to say,White,Full-time
3,42,Roundish,Hazel,Gray,Average,white,Slight gray,male,Gray,neutral,...,High school graduate - high school diploma or ...,Male,Other,Single,Democrat,Man,United States,"$20,000-$29,999",Black or African American,Student
4,37,Wide,Brown,Brown,Narrow,white,Shaved,male,Narrow,smiling,...,"Bachelor's degree (for example: BA, AB, BS)",Male,Retail,Single,Something else,Man,United States,"$10,000-$19,999",White,Student
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1649,40s to early 50s,Rounded jaw,Dark,Brown or black,Medium sized nostrils,asian,No facial hair,female,Thick black,smiling,...,"Master's degree (for example: MA, MS, MEng, ME...",Male,Education & Training,Married,Democrat,Man,United States,"$100,000-$124,999",White,Full-time
1650,50s,Round,small green eyes,"Short-thin, receding hairline",Small nostrils big bridge,white,Thick bushy mustache with beard trim,male,Thin darkish blonde,smiling,...,High school graduate - high school diploma or ...,Male,Other,Married,Democrat,Man,United States,"$70,000-$79,999",Filipino,Full-time
1651,28-32,round jaw line,blue eyes left eye shaped differently than rig...,Blonde with brown highlights,long bridge sharp tip,white,none,female,arched bark brown thin,smiling,...,"Associate degree (for example: AA, AS)",Female,Medicine,Widowed,Independent,Woman,United States,"$50,000-$59,999",Black or African American,Full-time
1652,Mid40s,Round,Medium sized brown,Pulled back dark brown straight hair,"Medium, larger nostrils",other,,female,Thin brown not very large,neutral,...,"Bachelor's degree (for example: BA, AB, BS)",Female,Arts,Married,Democrat,Woman,United States,"$250,000 or more",White,Full-time


In [28]:
files_to_drop = race_testing_df["Filename_og_img"].values
files_to_drop

array(['Google_1_Jane Standifer_11_oval.jpg',
       'Google_1_Louis Sung_1_oval.jpg',
       'Google_1_Lorraine Byrd_9_oval.jpg', ...,
       'Google_1_Chad Brister_11_oval.jpg',
       'Google_1_Tamara Pharr_9_oval.jpg',
       'Google_1_Eric Croft_1_oval.jpg'], dtype=object)

In [24]:
images_1k_df = pd.read_csv("1k-images.csv").
images_1k_df

TypeError: 'numpy.ndarray' object is not callable

In [32]:
# dropping these images from the 1k dataset for new survey 
images_1k_df = pd.read_csv("1k-images.csv")
second_survey_df = images_1k_df[~images_1k_df["Filename"].isin(files_to_drop)]


In [33]:
second_survey_df

Unnamed: 0,Attractive,Race,Memorable,Filename,AWSFile
592,3.0,1.0,3.0,Google_1_Sean Bissonnette_1_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...
677,2.0,1.0,2.0,Google_1_Larry Perryman_13_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...
726,3.0,1.0,4.0,Google_1_Samuel Chartier_17_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...
748,2.0,1.0,2.0,Google_1_Jack Province_9_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...
758,3.0,1.0,3.0,Google_1_Wayne Barrios_9_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...
...,...,...,...,...,...
995,1.0,2.0,1.0,Google_1_Gregory Given_1_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...
996,3.0,1.0,3.0,Google_1_Mike Spruill_3_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...
997,2.0,2.0,4.0,Google_1_Danielle Christenson_15_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...
998,2.0,1.0,3.0,Google_1_Catherine Fiske_17_oval.jpg,https://mturk-1k-images.s3.amazonaws.com/Googl...


In [35]:
second_survey_df.to_csv("1k_survey_2.csv", mode="w", index=False)

In [3]:
# getting the list of participants who have taken the survey already 
first_survey = pd.read_csv("1k_dataset_1.csv")
first_survey

Unnamed: 0,ParticipantId,StartTime (America/Chicago),CompletionTime (America/Chicago),CompletionType,CompletionCode,Payment,Bonus,Fees,Status,Task Data,...,Education,Sex,Occupation Field,Relationship/Marital Status,Political Party,Gender,Country Of Residence,Household Income,Race,Employment Status
0,21C99EDBFCD34AA1B3538677865FA3C4,05/06/2024 06:34:40 PM,,,,0,0,0,Returned,,...,"Some college, but no degree",Female,Business Management & Administration,In a relationship,Democrat,Woman,United States,Prefer not to say,Prefer not to say,Unemployed
1,AB4D929E25A942249E8D34299746B1A6,05/06/2024 06:34:40 PM,,,,0,0,0,Returned,,...,"Bachelor's degree (for example: BA, AB, BS)",Male,Arts,In a relationship,Democrat,Man,United States,"$60,000-$69,999",An ethnicity not listed here,Full-time
2,3CA5DB2C4DAC403EA4D1A0AC45D33054,05/06/2024 06:34:51 PM,,,,0,0,0,Returned,,...,"Bachelor's degree (for example: BA, AB, BS)",Male,Information Technology,Married,Democrat,Man,United States,"$200,000-$224,999",White,Full-time
3,259269483C7F499FB657D6B8BD0C1C97,05/06/2024 06:34:53 PM,05/06/2024 06:36:48 PM,Template,,0,0,0,Pending,"{""RowData"":[{""CellData"":""4.0"",""ColumnHeader"":""...",...,"Some college, but no degree",Male,Other,Divorced,Democrat,Man,United States,"$30,000-$39,999",White,Full-time
4,5E46EF4B2865489D9A79F709596A315F,05/06/2024 06:34:54 PM,05/06/2024 06:42:59 PM,Template,,0,0,0,Pending,"{""RowData"":[{""CellData"":""4.0"",""ColumnHeader"":""...",...,"Master's degree (for example: MA, MS, MEng, ME...",Female,Education & Training,Married,Independent,Woman,United States,"$100,000-$124,999",White,Full-time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,48CF0836208947F3A9C1303E7ED49A0A,05/06/2024 06:34:13 PM,05/06/2024 06:38:03 PM,Template,,0,0,0,Pending,"{""RowData"":[{""CellData"":""5.0"",""ColumnHeader"":""...",...,"Associate degree (for example: AA, AS)",Female,Medicine,Widowed,Independent,Woman,United States,"$50,000-$59,999",Black or African American,Full-time
1350,61DC9B9B1D064196A5DF2B7F6F6ADC80,05/06/2024 06:34:25 PM,,,,0,0,0,Returned,,...,"Master's degree (for example: MA, MS, MEng, ME...",Female,Arts,Single,Democrat,Woman,United States,"Less than $10,000",White,Unemployed
1351,FCC8D9FC3CAD44BC9AA129B5A96241D4,05/06/2024 06:34:28 PM,,,,0,0,0,Returned,,...,"Some college, but no degree",Female,Information Technology,Single,Democrat,Woman,United States,"$30,000-$39,999",Black or African American,Full-time
1352,128BAFEE1C27463DACD32EBFB0AA0F80,05/06/2024 06:34:29 PM,05/06/2024 06:37:19 PM,Template,,0,0,0,Pending,"{""RowData"":[{""CellData"":""1.0"",""ColumnHeader"":""...",...,"Bachelor's degree (for example: BA, AB, BS)",Female,Arts,Married,Democrat,Woman,United States,"$250,000 or more",White,Full-time


In [6]:
exclude_participants_ids = first_survey["ParticipantId"].values

In [7]:
with open('exclude_participants.txt', 'w') as f:
    for line in exclude_participants_ids:
        f.write(f"{line}\n")