# In [1]:
import pandas as pd

# In [2]:
# Column groups consumed by the preprocessing steps in this script.

# Columns recoded to 0/1 by convert_to_binary.
COLUMNS_TO_CONVERT = [
    'age_targeted',
    'ethnicity_targeted',
    'gender_targeted',
    'education_targeted',
    'religious_affiliation_targeted',
    'occupation_targeted',
    'geographic_location_targeted',
    'party_affiliation_targeted',
    'ideological_affiliation_targeted',
    'political_engagement_targeted',
]

# Columns expanded into indicator (dummy) columns by create_dummies.
COLUMNS_TO_DUMMY = [
    'treatment_condition',
    'number_attributes_targeted',
]

# Items averaged into the 'dv_response_mean' dependent-variable measure.
DV_RESPONSE_COLS = [
    'dv_response1',
    'dv_response2',
    'dv_response3',
    'dv_response4',
    'dv_response5',
]

def create_dummies(df, column, prefix=None):
    """Return ``df`` with indicator columns for ``column`` appended.

    The source column is kept; the columns produced by ``pd.get_dummies``
    are placed to the right of the existing ones.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to encode.
        prefix: Optional prefix forwarded to ``pd.get_dummies``.

    Returns:
        A new DataFrame made by concatenating ``df`` and the indicator
        columns side by side.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=prefix)
    widened = pd.concat((df, indicator_columns), axis='columns')
    return widened

def convert_to_binary(df, columns):
    """Recode the given columns in place: 1 where the value is 't', else 0.

    Any value other than the exact string 't' (including NaN) becomes 0.

    Args:
        df: DataFrame to recode. It is modified in place.
        columns: Iterable of column names to recode.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col in columns:
        # Vectorized comparison instead of a per-row Python lambda; the
        # explicit int64 cast keeps the result dtype stable even when the
        # column is empty.
        df[col] = df[col].eq('t').astype('int64')
    return df

# Load the raw data.
df = pd.read_csv('full_data_raw.csv')

# Keep only rows whose attention check is "pass".
# The explicit .copy() makes the filtered frame independent of the raw one,
# so the column assignment below does not raise SettingWithCopyWarning on
# pandas versions without Copy-on-Write.
df = df[df['attention_check'] == "pass"].copy()

# Correct a misspelled level in 'meta_perception'
# (the original note says about 10 values are affected).
df['meta_perception'] = df['meta_perception'].replace("somehwhat_different", "somewhat_different")

# Append indicator columns for each categorical column. Only
# 'number_attributes_targeted' gets a prefix ('a'); the other column is
# encoded with prefix=None.
for col in COLUMNS_TO_DUMMY:
    prefix = 'a' if col == 'number_attributes_targeted' else None
    df = create_dummies(df, col, prefix)

# Recode the 't'/other targeting flags to 1/0.
df = convert_to_binary(df, COLUMNS_TO_CONVERT)

# Reverse-score 'dv_response2' (100 minus the response), then average the
# columns in DV_RESPONSE_COLS row-wise into the dependent-variable measure.
# NOTE(review): 100 - x presumably assumes a 0-100 response scale - confirm.
df['dv_response2'] = df['dv_response2'].rsub(100)
df['dv_response_mean'] = df.loc[:, DV_RESPONSE_COLS].mean(axis='columns')

# Drop the timing/session bookkeeping columns and 'study_id'.
# NOTE(review): an earlier version of this comment also listed 'prolific_pid',
# but the code keeps that column and drops 'study_id' - confirm which is intended.
df = df.drop(columns=['start_time', 'end_time', 'total_time', 'attention_check', 'session_id', 'study_id'])

# Integer coding for 'ideological_affiliation': each tuple lists the labels
# that share one code, in ascending code order (0-4). 'other' and 'neutral'
# both map to 2.
# NOTE(review): the two highest levels are labelled "republican" on an
# ideology scale - verify these match the raw labels; labels missing from the
# mapping become NaN under Series.map.
political_ideology_mapping = {
    label: code
    for code, labels in enumerate([
        ('very_liberal',),
        ('moderately_liberal',),
        ('other', 'neutral'),
        ('moderate_republican',),
        ('very_republican',),
    ])
    for label in labels
}

df['political_ideology_coded'] = (
    df['ideological_affiliation'].map(political_ideology_mapping)
)

# Integer coding for 'party_affiliation': each tuple lists the labels that
# share one code, in ascending code order (0-4). 'other' and 'independent'
# both map to 2. Labels missing from the mapping become NaN under Series.map.
# The variable name's spelling ("partianship") is kept as-is in case other
# cells refer to it.
political_partianship_mapping = {
    label: code
    for code, labels in enumerate([
        ('strong_democrat',),
        ('moderate_democrat',),
        ('other', 'independent'),
        ('moderate_republican',),
        ('strong_republican',),
    ])
    for label in labels
}

df['political_party_coded'] = (
    df['party_affiliation'].map(political_partianship_mapping)
)

# Integer coding for 'political_engagement': labels are listed in ascending
# code order, so each label's code is its position in the list (0-3).
# Labels missing from the mapping become NaN under Series.map.
political_engagement_mapping = {
    label: code
    for code, label in enumerate([
        'highly_disengaged',
        'moderately_unengaged',
        'moderately_engaged',
        'highly_engaged',
    ])
}

df['political_engagement_coded'] = (
    df['political_engagement'].map(political_engagement_mapping)
)

# Write the cleaned data to 'GPTarget2024.csv', omitting the row index.
df.to_csv(path_or_buf='GPTarget2024.csv', index=False)