### Create a variable-label map, then output the results in Stata format

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import glob

import pandas as pd
import numpy as np
import unidecode

import mtglobals

### (a) Load older (non-2-stage) results (skip this if you only ran the 2-stage version)

In [3]:
# Read the results export dated 2021-11-02 into a DataFrame.
# NOTE(review): the section heading calls these the older non-2-stage results,
# yet the file is read from ../results_2stage/ — confirm the path is intended.
results_csv_path = "../results_2stage/results_2021-11-02.csv"
df = pd.read_csv(results_csv_path)

In [5]:
# Strip the "Answer." prefix from the age / onlinehrs / reason columns.
answer_col_renames = {
    'Answer.age': 'age',
    'Answer.onlinehrs': 'onlinehrs',
    'Answer.reason': 'reason',
}
df = df.rename(columns=answer_col_renames)

In [6]:
# Preview the loaded results frame.
# NOTE(review): the rendered preview includes worker IDs and IP addresses —
# consider clearing this output before sharing the notebook.
df

Unnamed: 0,hit_id,worker_id,submitted,reward,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),...,generated_jobtitle_22,generated_jobtitle_23,generated_jobtitle_24,generated_jobtitle_25,generated_jobtitle_26,generated_jobtitle_27,generated_jobtitle_28,generated_jobtitle_29,generated_jobtitle_30,_merge
0,3YCT0L9OMNAF5SRYGORITDH2MQ4SNX,A2I0RSK2Y87ETH,False,1.5,,,,,,,...,,,,,,,,,,left_only
1,3ULIZ0H1VB6BLJMYZ5GC3VU755S511,ACHEDJOIAGMP2,True,0.5,2021-11-01 20:38:47,2021-11-01 20:51:10,IP Address,185.186.60.87,100.0,743.0,...,Supervisor,Supervisor,Supervisor,Supervisor,Supervisor,Supervisor,Supervisor,Supervisor,Supervisor,both
2,3TFJJUELSIQ39PD9XJ3YMKAMPSS2CN,A18KS7Y1G3IUCM,False,1.5,,,,,,,...,,,,,,,,,,left_only
3,3E9ZFLPWOZTVN826DIKDND2DF6CXIU,AYUQBR5XZDJHZ,False,1.5,,,,,,,...,,,,,,,,,,left_only
4,3VZYA8PITPZBEU1G3MCCPQBX0DM50Y,A2VMHLKYJYO5TB,False,0.5,,,,,,,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,3P458N04Q2IRCNO4OFZL0XVWQAFX2J,A11F3MA5FWH6SJ,True,1.5,2021-11-01 14:15:35,2021-11-01 14:27:41,IP Address,24.88.146.255,100.0,725.0,...,Freelancer,Freelancer,Freelancer,Freelancer,Freelancer,Freelancer,Freelancer,Freelancer,Freelancer,both
88,3IH9TRB0FC0NX7HD6PZ6S4YCARXI1N,A2JFL3H254VGZ7,True,1.5,2021-11-01 13:57:36,2021-11-01 14:07:32,IP Address,63.75.245.171,100.0,596.0,...,WORK,WORK,WORK,WORK,WORK,WORK,WORK,WORK,WORK,both
89,3RWSQDNYLANUDV9VCG86E9PEV9OFFU,AE4P5KVUE9HST,True,1.5,2021-11-01 19:45:16,2021-11-01 19:54:46,IP Address,206.195.93.25,100.0,569.0,...,Retail Representive,Retail Representive,Retail Representive,Retail Representive,Retail Representive,Retail Representive,Retail Representive,Retail Representive,Retail Representive,both
90,3FDWKV9VCO3P2NY2TS7IVZAQ2XFUMF,A2VZGR99TJC2BZ,True,0.5,2021-11-01 13:17:18,2021-11-01 13:25:05,IP Address,72.53.179.173,100.0,466.0,...,IT Technician,IT Technician,IT Technician,IT Technician,IT Technician,IT Technician,IT Technician,IT Technician,IT Technician,both


Drop extraneous columns

In [40]:
# Columns to discard before export: the raw duration field, every
# name_* / cur_* / val_* column, and any column whose name contains "tense".
#
# For old survey versions, uncomment this list instead:
#drop_cols = ['Q11', 'Q22', 'Q12', 'Q90', 'Q13', 'Q14', 'Q15', 'Q20',
#             'Q19', 'Q16', 'Q17', 'Q18'] #, 'what_would_it_take', 'end_essay']
#drop_cols = drop_cols + [col for col in df.columns.values 
#                         if col.startswith("name_") or col.startswith("cur_") or col.startswith("val_")]
helper_prefixes = ("name_", "cur_", "val_")
drop_cols = ['Duration (in seconds)'] + [
    col
    for col in df.columns.values
    if col.startswith(helper_prefixes) or "tense" in col
]

In [41]:
# Remove the columns collected in drop_cols; rebinding df keeps the cell's
# effect explicit instead of mutating the frame in place.
df = df.drop(columns=drop_cols)

In [42]:
# Number of columns remaining after the drop.
df.shape[1]

517

In [43]:
# Variable labels for the Stata export, keyed by column name. Start with the
# respondent-level columns; the per-offer labels are added by the loop below.
labels = {
    'restdb_id': 'ID in the RestDB wage database',
    'worker_id': 'MTurk ID for the worker',
    'age': 'Respondent\'s specified age, in years',
    'onlinehrs': 'Number of hours per week the respondent spends doing online tasks for money',
    'reason': 'Respondent\'s primary reason for doing online tasks for money',
}
# Column-name stem -> label text. The offer number is appended to both the
# stem (to form the column name) and the text (to form the label).
prefix_map = {
    'response': 'Response to offer ',
    'generated_jobtitle_': 'Job title generated for offer ',
    'generated_wage_': 'Wage generated for offer ',
    'generated_hrsweek_': 'Weekly hours generated for offer ',
    'generated_controlhrs_': 'Control over weekly hours? for offer ',
    'generated_paidsick_': 'Paid sick leave generated for offer ',
    'generated_friends_': 'Work with friends, generated for offer ',
    'generated_commute_': 'Commute time generated for offer ',
    'generated_physical_': 'Requires physical exertion, generated for offer ',
    'generated_skills_': 'Learn transferrable skills, generated for offer ',
    'generated_vaccine_': 'Requires vaccination, generated for offer ',
    'generated_express_': 'Opportunities for expression, generated for offer ',
    'generated_coworkers_': 'Dependable coworkers, generated for offer ',
    'generated_suprespect_': 'Supervisor treats with respect, offer ',
    'generated_supfair_': 'Supervisor treats everyone fairly, offer ',
}
for offer_num in range(1, 31):
    offer_tag = f"{offer_num:02d}"
    for stem, label_text in prefix_map.items():
        # Only the "response" columns use an unpadded suffix (response1 ...
        # response30); every generated_* column is zero-padded (..._01).
        suffix = str(offer_num) if stem == "response" else offer_tag
        labels[stem + suffix] = label_text + offer_tag
len(labels)

455

In [44]:
# Sanity check: the respondent-level labels survived the loop above.
'age' in labels

True

In [45]:
# Peek at the first 20 column names.
# NOTE(review): the saved output already lists 'accepted_offer', a name that
# is only introduced by the rename in a later cell — the output looks like it
# came from an out-of-order run; re-check after Restart & Run All.
df.columns[:20].values

array(['hit_id', 'worker_id', 'accepted_offer', 'reward', 'StartDate',
       'EndDate', 'Status', 'IPAddress', 'Progress', 'Finished',
       'RecordedDate', 'ResponseId', 'RecipientLastName',
       'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
       'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
       'UserLanguage'], dtype=object)

In [46]:
# Columns present in the frame that have no entry in the label map.
not_labeled = set(df.columns) - set(labels)

In [47]:
# How many columns will be exported without a variable label.
len(not_labeled)

66

In [48]:
# List the unlabeled column names for inspection.
not_labeled

{'DistributionChannel',
 'EndDate',
 'ExternalReference',
 'Finished',
 'IPAddress',
 'LocationLatitude',
 'LocationLongitude',
 'Progress',
 'RecipientEmail',
 'RecipientFirstName',
 'RecipientLastName',
 'RecordedDate',
 'ResponseId',
 'StartDate',
 'Status',
 'UserLanguage',
 '_merge',
 'accepted_offer',
 'commute',
 'controlhrs',
 'coworkers',
 'currently_employed',
 'education',
 'entered_commute',
 'entered_controlhrs',
 'entered_coworkers',
 'entered_express',
 'entered_friends',
 'entered_hrs',
 'entered_jobtitle',
 'entered_physical',
 'entered_sickleave',
 'entered_skills',
 'entered_supfair',
 'entered_suprespect',
 'entered_vaccine',
 'entered_wage',
 'ever_employed',
 'express',
 'friends',
 'fulltime',
 'gender',
 'gender_4_TEXT',
 'hit_id',
 'how_easy',
 'hrs',
 'jobtitle',
 'lowest_wage',
 'most_recent_fulltime',
 'most_recent_job',
 'mturk_code',
 'num_tasks',
 'offer',
 'physical',
 'race',
 'race_6_TEXT',
 'reward',
 'savings',
 'sickleave',
 'skills',
 'state',
 'su

In [49]:
# The "submitted" flag is exported under the name "accepted_offer".
submitted_rename = {'submitted': 'accepted_offer'}
df = df.rename(columns=submitted_rename)

In [50]:
# Tally the accepted_offer flag; dropna=False keeps missing values as their
# own bucket so gaps would be visible here.
df['accepted_offer'].value_counts(dropna=False)

True     60
False    32
Name: accepted_offer, dtype: int64

In [51]:
# Swap missing values (NaN) for None wherever the frame has a null.
# NOTE(review): depending on the pandas version this may leave numeric columns
# as NaN or cast them to object dtype — check df.dtypes before exporting.
df = df.where(df.notna(), None)

In [52]:
# Confirm the accepted_offer column is unchanged by the NaN -> None step.
df['accepted_offer']

0     False
1      True
2     False
3     False
4     False
      ...  
87     True
88     True
89     True
90     True
91     True
Name: accepted_offer, Length: 92, dtype: bool

In [53]:
# Names of all object-dtype columns; these are cast to str before export.
obj_cols = df.select_dtypes(include=['object']).columns.tolist()

In [54]:
# Cast every object column to str so it can be written as a Stata string
# variable. Missing entries are written as "" (the empty string, which Stata
# treats as a missing string value) rather than being stringified into the
# literal text "None"/"nan", which would show up in the .dta as real data.
for cur_obj_col in obj_cols:
    col_values = df[cur_obj_col]
    is_present = col_values.notna()
    df[cur_obj_col] = col_values.astype(str).where(is_present, "")

In [55]:
# Write the cleaned frame as a Stata .dta file, attaching the variable labels.
clean_dta_path = "../results/results_clean.dta"
df.to_stata(clean_dta_path, variable_labels=labels)

### (b) Label the newer 2-stage results

In [3]:
# Collect every time-gap pilot pickle. glob.glob() returns matches in
# arbitrary (filesystem-dependent) order, so sort them: the file names embed a
# YYYYMMDD_HHMMSS timestamp, which makes lexicographic order chronological and
# keeps all_pkl_fpaths[-1] pointing at the most recent file.
all_pkl_fpaths = sorted(glob.glob("../results_2stage/pilot_results_timegaps_*.pkl"))
all_pkl_fpaths

['../results_2stage\\pilot_results_timegaps_20221014_150129.pkl',
 '../results_2stage\\pilot_results_timegaps_20221015_171141.pkl']

In [4]:
# Load the last path in the list of matching pickles.
# NOTE(review): unpickling can execute arbitrary code — only load .pkl files
# produced by this project's own pipeline.
chosen_pkl_fpath = all_pkl_fpaths[-1]
df = pd.read_pickle(chosen_pkl_fpath)

In [5]:
# Inspect the column names of the loaded 2-stage results.
df.columns

Index(['mt_id', 'wage_mturk', 'accepted', 'stage2_creation',
       'stage2_creation_str', 'stage2_accept_time', 'stage2_submit_time',
       'lwage', 'age', 'onlinehrs_str',
       ...
       'generated_jobtitle_26', 'generated_jobtitle_27',
       'generated_jobtitle_28', 'generated_jobtitle_29',
       'generated_jobtitle_30', 'qualtrics_source', '_merge', 'result',
       'stage1_accept_submit_gap', 'stage2_accept_submit_gap'],
      dtype='object', length=527)

In [6]:
# Variable labels for the 2-stage results, keyed by column name.
#
# Stata variable labels are limited to 80 characters and DataFrame.to_stata
# raises ValueError for a longer label on a column it writes, so every label
# here is kept within that limit. The full meaning of 'case' is:
#   'rejected' if the respondent rejected the offer, 'did_not_finish' if they
#   started but didn't finish the survey, 'completed_survey' otherwise.
#
# NOTE(review): the columns listed for the loaded frame include 'result' and
# 'onlinehrs_str' rather than 'case' / 'onlinehrs' — confirm these keys still
# match the column names; labels for absent columns appear to be ignored.
labels = {
    'reward_stage1': 'Reward offered for first-stage (3-question survey) HIT',
    'reward_stage2': 'Reward offered for the second-stage (full Qualtrics survey) HIT',
    'mt_id': 'MTurk ID for the worker',
    'age': 'Respondent\'s specified age, in years',
    'onlinehrs': 'Number of hours per week the respondent spends doing online tasks for money',
    'reason': 'Respondent\'s primary reason for doing online tasks for money',
    'submitted_hit': 'TRUE if respondent submitted the MTurk HIT, FALSE otherwise',
    'case': "Outcome: 'rejected', 'did_not_finish', or 'completed_survey'"
}

### (c) Output to Stata format

In [7]:
# Value interpolated into the output .dta file name in the export cell below.
# Its exact format is defined by mtglobals.gen_timestamp (not visible here).
timestamp = mtglobals.gen_timestamp()

In [8]:
# Preview the 2-stage results frame before converting it for export.
# NOTE(review): the rendered preview shows MTurk worker IDs — consider
# clearing this output before sharing the notebook.
df

Unnamed: 0,mt_id,wage_mturk,accepted,stage2_creation,stage2_creation_str,stage2_accept_time,stage2_submit_time,lwage,age,onlinehrs_str,...,generated_jobtitle_26,generated_jobtitle_27,generated_jobtitle_28,generated_jobtitle_29,generated_jobtitle_30,qualtrics_source,_merge,result,stage1_accept_submit_gap,stage2_accept_submit_gap
0,A101J0P2UYUKA1,1.05,1,2022-10-13 15:32:15,2022-10-13 15:32:15-07:00,2022-10-13 15:41:19-07:00,2022-10-13 15:48:31-07:00,0.048790,23,40,...,INFORMATION TECHNOLOGY,INFORMATION TECHNOLOGY,INFORMATION TECHNOLOGY,INFORMATION TECHNOLOGY,INFORMATION TECHNOLOGY,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,finished_survey,0 days 00:01:55,0 days 00:07:12
1,A10AKR84P1WXHL,0.98,1,2022-10-03 22:26:00,2022-10-03 22:26:00-07:00,2022-10-03 22:28:30-07:00,2022-10-03 23:05:39-07:00,-0.020203,27,36,...,Healthcare,Healthcare,Healthcare,Healthcare,Healthcare,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,finished_survey,0 days 00:09:33,0 days 00:37:09
2,A10W3EILX06DFO,0.50,1,2022-10-13 17:29:28,2022-10-13 17:29:28-07:00,2022-10-13 18:07:12-07:00,2022-10-13 19:03:30-07:00,-0.693147,35,40,...,IT,IT,IT,IT,IT,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,finished_survey,0 days 00:02:44,0 days 00:56:18
3,A114Q54Z648PKO,1.05,1,2022-10-13 18:37:57,2022-10-13 18:37:57-07:00,2022-10-13 19:41:48-07:00,2022-10-13 20:30:31-07:00,0.048790,25,8,...,,,,,,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,did_not_finish,0 days 00:24:13,0 days 00:48:43
4,A1198W1SPF1R4,1.02,0,2022-10-07 14:06:16,2022-10-07 14:06:16-07:00,NaT,NaT,0.019803,54,40,...,,,,,,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,did_not_finish,0 days 00:03:08,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,AZFXFW2ZDX54B,1.00,1,2022-10-14 18:56:27,2022-10-14 18:56:27-07:00,2022-10-15 10:20:48-07:00,2022-10-15 10:39:45-07:00,0.000000,25,120,...,Marketing Manager,Marketing Manager,Marketing Manager,Marketing Manager,Marketing Manager,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,finished_survey,0 days 00:09:51,0 days 00:18:57
796,AZH32KEBRPRES,0.98,1,2022-10-03 22:20:25,2022-10-03 22:20:25-07:00,2022-10-03 22:22:15-07:00,2022-10-03 22:47:16-07:00,-0.020203,32,25,...,ADMIN,ADMIN,ADMIN,ADMIN,ADMIN,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,finished_survey,0 days 00:04:01,0 days 00:25:01
797,AZNWVHY3AUSNT,0.95,1,2022-10-10 22:04:06,2022-10-10 22:04:06-07:00,2022-10-10 22:07:03-07:00,2022-10-10 22:32:03-07:00,-0.051293,51,48,...,Middle level manager,Middle level manager,Middle level manager,Middle level manager,Middle level manager,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,finished_survey,0 days 00:02:41,0 days 00:25:00
798,AZPXDIOTEH989,0.50,1,2022-10-13 15:32:47,2022-10-13 15:32:47-07:00,2022-10-13 15:37:33-07:00,2022-10-13 15:58:15-07:00,-0.693147,25,35,...,information technology,information technology,information technology,information technology,information technology,"Job+Quality+NonWM+13.0_October+15,+2022_18.07....",both,finished_survey,0 days 00:24:52,0 days 00:20:42


In [9]:
def _ascii_fold(cell):
    """Return unidecode's transliteration for plain-str cells; pass anything else through."""
    if type(cell) is not str:
        return cell
    return unidecode.unidecode(cell)


# Apply the transliteration to every cell of the frame.
# NOTE(review): presumably done so the text fits the .dta string encoding — confirm.
df = df.applymap(_ascii_fold)

In [10]:
# Check column dtypes before the string conversion below; the output shows
# datetime64 and timedelta64 columns.
df.dtypes

mt_id                                object
wage_mturk                           object
accepted                              int64
stage2_creation              datetime64[ns]
stage2_creation_str                  object
                                 ...       
qualtrics_source                     object
_merge                               object
result                               object
stage1_accept_submit_gap    timedelta64[ns]
stage2_accept_submit_gap    timedelta64[ns]
Length: 527, dtype: object

In [11]:
# Render the timestamp and accept/submit-gap columns as plain strings via
# str(), element by element (missing timestamps become the text "NaT").
time_cols = [
    'stage1_accept_time',
    'stage1_submit_time',
    'stage2_creation',
    'stage2_accept_time',
    'stage2_submit_time',
    'stage1_accept_submit_gap',
    'stage2_accept_submit_gap',
]
for time_col in time_cols:
    df[time_col] = df[time_col].apply(str)

In [12]:
# Re-check dtypes: the converted time and gap columns should now be object.
df.dtypes

mt_id                       object
wage_mturk                  object
accepted                     int64
stage2_creation             object
stage2_creation_str         object
                             ...  
qualtrics_source            object
_merge                      object
result                      object
stage1_accept_submit_gap    object
stage2_accept_submit_gap    object
Length: 527, dtype: object

In [13]:
# Write the labelled 2-stage results to a timestamped .dta file using
# Stata file-format version 117.
labeled_dta_path = f"../results_2stage/pilot_results_labels_{timestamp}.dta"
df.to_stata(labeled_dta_path, variable_labels=labels, version=117)