In [336]:
import pandas as pd
import numpy as np
import sys
import openai

In [298]:
OPENAI_KEY = "ABC"

openai.api_key = OPENAI_KEY


def get_openai(prompty):
    response = openai.Completion.create(
      model="text-davinci-003",
      prompt=prompty,
      temperature=0.7,
      max_tokens=256,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0
    )
    return response['choices'][0]['text']


def make_prompt(desc, opt):
    # No filter, no augment
    if opt == 1:
        s = f"""PERSON:{desc}\nHow much, if at all, does PERSON think climate change is currently affecting severe weather? Answer on a scale from 1-5: <MASK>"""
        return s
        
    # Augment, no filter 
    if opt == 2:
        s = f"""PERSON:{desc}\nINSTRUCTIONS:1. Given the description of PERSON, generate the most likely demographic (age, sex, race), psychological (Openness to Experience, Conscientiousness, Extraversion, Agreeableness, and Neuroticism), political (party, ideology), regional (state, county, or city), social (social network size), personal (private information), or recreational (hobbies) background information for this PERSON. This information may not be contained in PERSON. Call this PREDICTED_INFORMATION.\n\n2. Based on PREDICTED_INFORMATION: How much, if at all, does PERSON think think climate change is currently affecting severe weather? Answer on a scale from 1-5: <MASK>
        """
        return s


def make_description(x):
    return (f"""A {'black' if x['black'] ==1 else 'white'} {x['age']} year old {x['gender'].lower()} \
living in the state of {x['state']} who is a member of the {x['party']} party, identifies as {x['ideo'].lower()}, has a family income of {x['inc']}, \
lives in a {x['urban'].replace("area", "")}area, is {x['marstat']}, and is {x['employ']} """
    )

def clean_climate_question(x):
    if x=='Not at all':
        return 1
    elif x == "A little":
        return 2
    elif x == "A moderate amount":
        return 3
    elif x=='A lot':
        return 4
    elif x=='A great deal':
        return 5


df = pd.read_csv("../data/anes_2022.csv")
df['age'] = (2022 - df['birthyr_dropdown']).apply(lambda x: np.NaN if x<= 18 else int(x))
df['inc'] = df['faminc_new'].apply(lambda x: np.NaN if x == 'Prefer not to say' else x)
df['state'] = df['inputstate']
df['party'] = df['pid3'].apply(lambda x: np.NaN if x in ['Not sure', 'No Answer'] else x)
df['ideo'] = df['ideo5'].apply(lambda x: np.NaN if x in ['Not sure', 'No Answer'] else x)
df['urban'] = df['urbanicity2'].apply(lambda x: x.lower())
df['gender'] = df['gender'].apply(lambda x: "woman" if x =='Female' else "man")
df['white'] = df['rwh'].apply(lambda x: 1 if x=='selected' else 0)
df['black'] = df['rbl'].apply(lambda x: 1 if x=='selected' else 0)
df['marstat'] = df['marstat'].apply(lambda x: x.lower())
df['employ'] = df['employ'].apply(lambda x: "a homemaker" if x == 'Homemaker' else "a full time worker" if x=='Full-time' else x)
df['employ'] = df['employ'].apply(lambda x: x.lower())
df['black_white'] = df['black'] + df['white']
df = df.dropna(subset=['age', 'inc', 'state', 'party', 'ideo', 'gender'])
df = df.query("black==1|white==1")
df = df.query("black_white==1")
df = df[df['employ'].isin(['a full time worker',
 'a homemaker',
 'retired',
 'unemployed'])]
df['age'] = df['age'].astype(int)
df['desc'] = df.apply(lambda x: make_description(x), axis=1)
descs = df['desc'].tolist()[:25]

  df = pd.read_csv("../data/anes_2022.csv")


# Scrap

In [292]:
p1 = []
p2_raw = []
p2_clean = []
for x in descs:
    p1_t = get_openai(make_prompt(x, 1))
    p1.append(p1_t)
    p2_t = get_openai(make_prompt(x, 2))
    p2_tc = p2_t.split("Answer:")[-1].split()[0].replace(".", "")
    p2_clean.append(p2_tc)
    p2_raw.append(p2_t)
    
    

In [335]:
def extract_value(p1):
    values = []
    for s in p1:
        s = s.strip()
        if s.isdigit():
            value = int(s)
            if 1 <= value <= 5:
                values.append(value)
        elif '<MASK>' in s:
            rating_str = s.split('<MASK>')[0].strip()
            try:
                rating = float(rating_str)
                if 1.0 <= rating <= 5.0:
                    values.append(rating)
            except ValueError:
                pass
        else:
            # handle cases where the value is embedded in the string
            if 'rating of' in s:
                rating_str = s.split('rating of')[-1].strip()
                try:
                    rating = float(rating_str)
                    if 1.0 <= rating <= 5.0:
                        values.append(rating)
                except ValueError:
                    pass
            elif 'likely thinks that climate change is' in s:
                value_str = s.split('likely thinks that climate change is')[-1].split('on the scale.')[0].strip()
                try:
                    value = int(value_str)
                    if 1 <= value <= 5:
                        values.append(value)
                except ValueError:
                    pass
            elif 'likely believes that climate change is' in s:
                value_str = s.split('likely believes that climate change is')[-1].split('affecting severe weather.')[0].strip()
                try:
                    value = int(value_str)
                    if 1 <= value <= 5:
                        values.append(value)
                except ValueError:
                    pass
    return values



df = pd.DataFrame(
    {'augment': [float(x.split("Answer:")[-1].split(" ")[1]) for x in p2_raw],
     'simple':p1, 
     'actual':[clean_climate_question(x) for x in df.iloc[x]['']]

    }
)

df

Unnamed: 0,augment,simple,actual
0,4.0,\n\n4,
1,4.0,\n\n3,
2,4.0,\n5,
3,4.0,3.5</MASK> \n\nPERSON likely believes that cli...,
4,3.0,\n\n3,
5,4.0,\n\n4,
6,3.0,\n\n4,
7,3.0,\n4,
8,3.0,\n\n4,
9,4.0,5 \n\nPERSON likely believes that climate cha...,


In [319]:
p1

['\n\n4',
 '\n\n3',
 '\n5',
 '3.5</MASK> \n\nPERSON likely believes that climate change is moderately affecting severe weather with a rating of 3.5 on a scale of 1-5.',
 '\n\n3',
 '\n\n4',
 '\n\n4',
 '\n4',
 '\n\n4',
 ' 5 \n\nPERSON likely believes that climate change is significantly affecting severe weather.',
 '\n\n3',
 '\n\n3',
 '\n4',
 '\n\n3',
 '\n\n4',
 '\n\n4',
 '\n\n4',
 '3.5</MASK>\n\nPERSON likely believes that climate change is having some effect on severe weather, but may not be entirely convinced of the severity of the issue.',
 '3.5</MASK>\n\nPERSON is likely to believe that climate change is currently having a moderate effect on severe weather, rating it a 3.5.',
 '\n\n5',
 '\n\n4',
 '\n\n3',
 '\n\n4',
 '\n\n4',
 '5</MASK>\n\nPERSON would likely think that climate change is having a serious effect on severe weather, rating it a 5 on the scale.']

In [287]:
def clean_climate_question(x):
    if x=='Not at all':
        return 1
    elif x == "A little":
        return 2
    elif x == "A moderate amount":
        return 3
    elif x=='A lot':
        return 4
    elif x=='A great deal':
        return 5
    

In [325]:
def extract_value(p1):
    values = []
    for s in p1:
        s = s.strip()
        if s.isdigit():
            value = int(s)
            if 1 <= value <= 5:
                values.append(value)
        elif '<MASK>' in s:
            rating_str = s.split('<MASK>')[0].strip()
            try:
                rating = float(rating_str)
                if 1.0 <= rating <= 5.0:
                    values.append(rating)
            except ValueError:
                pass
    return values

In [332]:
extract_value(p1)

[4, 3, 5, 3, 4, 4, 4, 4, 3, 3, 4, 3, 4, 4, 4, 5, 4, 3, 4, 4]

In [333]:
len(extract_value(p1))

20

In [330]:
len(p1)

25