# Christmas Paper 2024: In-silico Celebrity Survey

In [1]:
# Provide the OpenAI API key through the environment instead of writing it
# into the notebook. The previous `%env OPENAI_API_KEY = "..."` form stored the
# surrounding double quotes as part of the value (visible in the echoed
# output), so a pasted key would not have matched the real key.
import os
from getpass import getpass

# Only prompt when the key has not already been exported in the shell/kernel
# environment, so non-interactive runs with a preset key keep working.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

env: OPENAI_API_KEY="PUT OPEN AI KEY HERE!"


In [1]:
CELEBRITY = "Britney Spears"  # name inserted into every survey question and into the results file name
N = 5  # number of samples per demographic and time point
YEAR = 'tbd'  # placeholder interpolated into the demographic prompts; the survey loop below covers 2018, 2021, 2024

In [4]:
from openai import OpenAI

import numpy as np
import pandas as pd

from tqdm import tqdm
import os

In [4]:
# API client used by the survey loop below. No key is passed explicitly;
# presumably the client picks up OPENAI_API_KEY from the environment (confirm).
client = OpenAI()

In [5]:
# define survey questions
# Every question has the same three-line layout: the question itself, the
# 5-point answer scale, and a reminder to reply with the number only.
_OPINION_SCALE = "1) Very negative 2) Negative 3) Neutral 4) Positive 5) Very positive"
_INTENSITY_SCALE = "1) Not at all 2) Slightly 3) Moderately 4) Quite a bit 5) Very much"
_ANSWER_FORMAT = "Please answer only with 1, 2, 3, 4, or 5. This is very important."


def _likert_question(stem, scale):
    """Assemble one survey prompt from its question stem and answer scale."""
    return f"{stem} \n{scale} \n{_ANSWER_FORMAT}"


question1 = _likert_question(
    f"How would you describe your overall opinion of {CELEBRITY}?", _OPINION_SCALE
)
question2 = _likert_question(f"How relatable do you find {CELEBRITY}?", _INTENSITY_SCALE)
question3 = _likert_question(f"How inspiring do you find {CELEBRITY}?", _INTENSITY_SCALE)
question4 = _likert_question(f"How likable do you find {CELEBRITY}?", _INTENSITY_SCALE)
question5 = _likert_question(f"How relevant do you find {CELEBRITY}?", _INTENSITY_SCALE)

In [6]:
# define demographic groups
# NOTE: YEAR is interpolated right here, when this cell runs; assigning a new
# value to YEAR afterwards does not change the strings built below.
_personas = (
    "a male person between 20 and 30 years old",
    "a female person between 20 and 30 years old",
    "a male person between 50 and 60 years old",
    "a female person between 50 and 60 years old",
    "a queer person",
    "a straight person",
    "a white person",
    "a non-white person",
)
demo1, demo2, demo3, demo4, demo5, demo6, demo7, demo8 = [
    f"You are {persona} in January {YEAR}." for persona in _personas
]

In [7]:
# define additional GPT instructions to prevent faulty answers
# (e.g. the model declining to respond because it is an AI)
instructions = "You are taking part in a survey. Don't excuse yourself with being an AI."

In [8]:
# simulate survey
# for each demographic, year and survey question, simulate N instances
#
# The demographic prompts were already formatted with the module-level YEAR
# when they were defined, so reassigning YEAR inside the loop never reached
# the text sent to the model (every request said "in January tbd."). The
# survey year is therefore substituted into the prompt explicitly here, and
# YEAR itself is left untouched so this cell can be re-run safely.
year_slot = f"in January {YEAR}."

answers = []
for DEMO in tqdm((demo1, demo2, demo3, demo4, demo5, demo6, demo7, demo8)):
    if year_slot not in DEMO:
        # Happens when YEAR was changed after the demographic prompts were built.
        raise ValueError(
            f"Prompt {DEMO!r} does not contain {year_slot!r}; "
            "re-run the configuration and demographic cells first."
        )
    answers_temp = []
    for year in (2018, 2021, 2024):
        persona = DEMO.replace(year_slot, f"in January {year}.")
        for n in range(N):
            n_temp = []
            for QUESTION in (question1, question2, question3, question4, question5):
                completion = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": persona},
                        {"role": "user", "content": persona + " " + instructions + " " + QUESTION},
                    ],
                )
                # Store an empty string when no text content is returned, so
                # the later digit extraction sees a string for every answer.
                n_temp.append(completion.choices[0].message.content or "")
            # one row = five answers followed by the simulated year
            answers_temp.append(n_temp + [year])
    answers.append(answers_temp)

100%|█████████████████████████████████████████████| 8/8 [05:58<00:00, 44.80s/it]


In [9]:
def process_columns(demo_name, index):
    """Turn the collected answers of one demographic group into a DataFrame.

    `index` selects the group's entry in the module-level `answers` list.
    Its rows are labelled with the five question columns plus 'year', and
    `demo_name` is written to a constant 'demo' column.
    """
    column_names = ['overall', 'relatable', 'inspiring', 'likable', 'relevant', 'year']
    group_frame = pd.DataFrame(answers[index], columns=column_names)
    return group_frame.assign(demo=demo_name)

# populate dataframes per demographic
# (the position of each label matches the order of the groups in `answers`)
_demo_labels = ['male-young', 'female-young', 'male-old', 'female-old',
                'queer', 'non-queer', 'white', 'non-white']
(male_young, female_young, male_old, female_old,
 queer, non_queer, white, non_white) = [
    process_columns(label, position) for position, label in enumerate(_demo_labels)
]

# concatenate separate dataframes to one large dataframe with a fresh 0..n-1 index
_demo_frames = [male_young, female_young, male_old, female_old, queer, non_queer, white, non_white]
survey_results = pd.concat(_demo_frames).reset_index(drop=True)

# some data cleaning:
# extract digits from GPT's answers
# replace with np.nan if digits not in range 1 to 5
def _parse_rating(raw):
    """Return the rating contained in `raw` as an int in 1..5, else np.nan.

    For text answers all digit characters are concatenated before the range
    check, so "3" and "3) Neutral" give 3, while an answer with several
    digits such as "3 out of 5" becomes 35 and is rejected. Values that are
    already numeric (e.g. when this cell is re-run) are only range-checked.
    Anything else (None, NaN, ...) counts as missing.
    """
    if isinstance(raw, str):
        # isdecimal() rather than isdigit(): int() cannot parse some
        # characters that isdigit() accepts (e.g. superscript digits).
        digits = ''.join(filter(str.isdecimal, raw))
        value = int(digits) if digits else None
    elif isinstance(raw, (int, np.integer)):
        value = int(raw)
    elif isinstance(raw, (float, np.floating)) and float(raw).is_integer():
        value = int(raw)
    else:
        value = None
    return value if value is not None and 1 <= value <= 5 else np.nan


# Ratings are stored as numbers (not digit strings) so that means and other
# aggregations work on the in-memory frame as well as on the reloaded CSV.
for c in ['overall', 'relatable', 'inspiring', 'likable', 'relevant']:
    survey_results[c] = [_parse_rating(x) for x in survey_results[c]]

survey_results.head()

Unnamed: 0,overall,relatable,inspiring,likable,relevant,year,demo
0,3,3,3,3,3,2018,male-young
1,3,2,3,3,3,2018,male-young
2,3,3,3,3,3,2018,male-young
3,3,2,3,3,3,2018,male-young
4,3,2,3,4,3,2018,male-young


In [10]:
# store results to file
PATH = 'insilico-survey/'

# create the output directory on first use
os.makedirs(PATH, exist_ok=True)

results_file = PATH + f"{CELEBRITY}.csv"
survey_results.to_csv(results_file, index=False)

In [6]:
# Reload the stored results from the location the previous cell wrote them to
# (PATH + "<celebrity>.csv"); the bare file name would be looked up in the
# working directory, where no file was written.
survey_results = pd.read_csv(PATH + f"{CELEBRITY}.csv")

In [8]:
def create_dashboard(df):
    """Build a 2x2 figure summarising the survey scores in `df`.

    Panels (row, col):
      (1, 1) line plot of the yearly mean of each score column
      (1, 2) box plot of 'overall' grouped by 'demo'
      (2, 1) heatmap of the per-'demo' mean of every score column
      (2, 2) box plot of 'overall' grouped by 'year'

    `df` must provide the columns 'overall', 'relatable', 'inspiring',
    'likable', 'relevant', 'year' and 'demo'. Returns the figure created by
    `make_subplots`.

    NOTE(review): `make_subplots` and `go` are not imported anywhere in the
    visible notebook; presumably they are plotly.subplots.make_subplots and
    plotly.graph_objects. Confirm and add the import to the imports cell.
    """
    score_columns = ['overall', 'relatable', 'inspiring', 'likable', 'relevant']

    panel_titles = ('Trends Over Time', 'Demographic Comparison',
                    'Score Heatmap', 'Year Comparison')
    panel_specs = [
        [{"type": "scatter"}, {"type": "box"}],
        [{"type": "heatmap"}, {"type": "box"}],
    ]
    fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)

    # top-left: one line per score column, mean per survey year
    for column in score_columns:
        mean_per_year = df.groupby('year')[column].mean()
        trend_line = go.Scatter(
            x=mean_per_year.index,
            y=mean_per_year.values,
            name=column.capitalize(),
            mode='lines+markers',
        )
        fig.add_trace(trend_line, row=1, col=1)

    # top-right: distribution of the overall score per demographic group
    overall_by_demo = go.Box(
        x=df['demo'],
        y=df['overall'],
        name='Overall',
        marker_color='blue',
    )
    fig.add_trace(overall_by_demo, row=1, col=2)

    # bottom-left: mean of every score column per demographic group
    demo_means = df.pivot_table(values=score_columns, index='demo', aggfunc='mean')
    mean_matrix = demo_means.values
    score_heatmap = go.Heatmap(
        z=mean_matrix,
        x=demo_means.columns,
        y=demo_means.index,
        colorscale='RdBu',
        text=np.round(mean_matrix, 2),  # rounded means shown inside the cells
        texttemplate='%{text}',
        textfont={"size": 10},
        showscale=True,
    )
    fig.add_trace(score_heatmap, row=2, col=1)

    # bottom-right: distribution of the overall score per survey year
    overall_by_year = go.Box(
        x=df['year'],
        y=df['overall'],
        name='Overall',
        marker_color='green',
    )
    fig.add_trace(overall_by_year, row=2, col=2)

    fig.update_layout(height=1000, width=1200, title_text="Survey Results Dashboard")
    return fig

# Build the dashboard from the survey results loaded above and display it
dashboard = create_dashboard(survey_results)
dashboard.show()