Basically take the stage2_launched_workers.csv and add in the remaining data we need:

* Their accept + submit time on stage 1
* Their answers on stage 1
* Whether they accepted the stage 2 HIT
* Their accept + submit time on stage 2
* The Qualtrics data for the accepters

In [56]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
import datetime

import pandas as pd
import numpy as np

import mtglobals

In [58]:
client, mturk_environment = mtglobals.gen_client()

Your account balance is 499927.16


In [59]:
all_hits = mtglobals.download_all_hits(client)

Downloading list of hits from 2021-11-05 00:00:00+00:00 onwards to ../results_2stage/all_hit_data.pkl
p2:gbkrVjGo4mv+y0SfXwW4SaU1yC4J+OHs3W6KIYqXiwVM9uomThEuSuzAvGlyHbw=


In [60]:
len(all_hits)

7

In [61]:
newest_hits = all_hits[:92]

In [62]:
# Split hits based on what stage they're from
stage1_hits = [h for h in newest_hits if not h['Title'].startswith('Custom workplace')]
stage2_hits = [h for h in newest_hits if h['Title'].startswith('Custom workplace')]
print(len(stage1_hits))
print(len(stage2_hits))

1
6


### Parse stage1 results

In [63]:
stage1_hit_id = stage1_hits[0]['HITId']
stage1_submissions = mtglobals.get_hit_submissions(client, stage1_hit_id)

In [64]:
len(stage1_submissions)

6

In [65]:
stage1_submissions[0]

{'AssignmentId': '3PS7W85Z803YIAYDUGI591EVJZM9T7',
 'WorkerId': 'A2BNQ8VAMP3ZYO',
 'HITId': '39KV3A5D198Q27ZB0HATXGVCMOT7SL',
 'AssignmentStatus': 'Approved',
 'AutoApprovalTime': datetime.datetime(2021, 12, 20, 10, 49, 17, tzinfo=tzlocal()),
 'AcceptTime': datetime.datetime(2021, 11, 20, 10, 48, 45, tzinfo=tzlocal()),
 'SubmitTime': datetime.datetime(2021, 11, 20, 10, 49, 17, tzinfo=tzlocal()),
 'ApprovalTime': datetime.datetime(2021, 11, 20, 10, 51, 9, tzinfo=tzlocal()),
 'Answer': '<?xml version="1.0" encoding="ASCII"?><QuestionFormAnswers xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionFormAnswers.xsd"><Answer><QuestionIdentifier>age</QuestionIdentifier><FreeText>28</FreeText></Answer><Answer><QuestionIdentifier>onlinehrs</QuestionIdentifier><FreeText>40</FreeText></Answer><Answer><QuestionIdentifier>reason</QuestionIdentifier><FreeText>money</FreeText></Answer></QuestionFormAnswers>'}

In [70]:
# Now extract the data we want for each submission
def extract_stage1_data(s):
    s_data = {'worker_id': s['WorkerId'], 'assignment_id': s['AssignmentId'],
              'stage1_accept_time':s['AcceptTime'], 'stage1_submit_time':s['SubmitTime'],
              'stage1_accept_submit_gap':(s['SubmitTime']-s['AcceptTime'])}
    s_answers = mtglobals.parse_stage1_answer(s['Answer'])
    # Add each answer as a field
    for cur_key, cur_val in s_answers.items():
        answer_var = "stage1_" + cur_key
        s_data[answer_var] = cur_val
    return s_data

stage1_data = {s['WorkerId']: extract_stage1_data(s) for s in stage1_submissions}

In [71]:
stage1_data

{'A2BNQ8VAMP3ZYO': {'worker_id': 'A2BNQ8VAMP3ZYO',
  'assignment_id': '3PS7W85Z803YIAYDUGI591EVJZM9T7',
  'stage1_accept_time': datetime.datetime(2021, 11, 20, 10, 48, 45, tzinfo=tzlocal()),
  'stage1_submit_time': datetime.datetime(2021, 11, 20, 10, 49, 17, tzinfo=tzlocal()),
  'stage1_accept_submit_gap': datetime.timedelta(seconds=32),
  'stage1_age': '28',
  'stage1_onlinehrs': '40',
  'stage1_reason': 'money'},
 'ADXJ0I0MRHLPX': {'worker_id': 'ADXJ0I0MRHLPX',
  'assignment_id': '3NL0RFNU0GOYIL2EJC8MR2GJEUHK4G',
  'stage1_accept_time': datetime.datetime(2021, 11, 20, 10, 50, 55, tzinfo=tzlocal()),
  'stage1_submit_time': datetime.datetime(2021, 11, 20, 10, 54, 7, tzinfo=tzlocal()),
  'stage1_accept_submit_gap': datetime.timedelta(seconds=192),
  'stage1_age': '30',
  'stage1_onlinehrs': '30',
  'stage1_reason': 'skills'},
 'A3HZFB2JLF3JMY': {'worker_id': 'A3HZFB2JLF3JMY',
  'assignment_id': '386PBUZZXGYZ593H7J7RC4IHLRCJL9',
  'stage1_accept_time': datetime.datetime(2021, 11, 20, 10,

### Now go through and get the result for each *stage2* hit

In [73]:
for n, h in enumerate(stage2_hits):
    print((n, str(h['CreationTime']), h['Title'], h['Reward'], h['NumberOfAssignmentsCompleted'],
     h['NumberOfAssignmentsPending'], h['NumberOfAssignmentsAvailable']))

(0, '2021-11-20 17:24:51-08:00', 'Custom workplace survey HIT for worker id A362MXJ7439Q76', '1.50', 0, 0, 1)
(1, '2021-11-20 11:32:34-08:00', 'Custom workplace survey HIT for worker id A2YO837C0O1E91', '0.50', 1, 0, 0)
(2, '2021-11-20 11:32:34-08:00', 'Custom workplace survey HIT for worker id A2JP9IKRHNLRPI', '1.50', 0, 0, 1)
(3, '2021-11-20 11:11:55-08:00', 'Custom workplace survey HIT for worker id A3HZFB2JLF3JMY', '0.50', 1, 0, 0)
(4, '2021-11-20 11:05:53-08:00', 'Custom workplace survey HIT for worker id ADXJ0I0MRHLPX', '1.50', 1, 0, 0)
(5, '2021-11-20 10:52:45-08:00', 'Custom workplace survey HIT for worker id A2BNQ8VAMP3ZYO', '0.50', 1, 0, 0)


In [81]:
# Now compile a dataset using mtglobals.get_hit_assignments() to determine whether
# or not a given HIT was submitted
outcome_data = []
for cur_hit in stage2_hits:
    cur_title = cur_hit['Title']
    print(f"Processing {cur_title}")
    cur_id = cur_hit['HITId']
    cur_worker_id = mtglobals.worker_id_from_title(cur_title)
    cur_reward = cur_hit['Reward']
    was_submitted = False
    hit_submissions = mtglobals.get_hit_submissions(client, cur_id)
    accept_time = None
    submit_time = None
    time_gap = None
    if len(hit_submissions) > 0:
        # Submitted
        was_submitted = True
        # And get data on the submission
        hit_submission = hit_submissions[0]
        accept_time = hit_submission['AcceptTime']
        submit_time = hit_submission['SubmitTime']
        time_gap = submit_time - accept_time
    # Now save the data
    cur_data = {'hit_id':cur_id, 'worker_id':cur_worker_id, 'submitted':was_submitted,
                'reward':cur_reward, 'stage2_accept_time': accept_time, 'stage2_submit_time': submit_time,
                'stage2_accept_submit_gap': time_gap}
    # But we also want to add in the stage1 data here
    cur_data.update(stage1_data[cur_worker_id])
    outcome_data.append(cur_data)

Processing Custom workplace survey HIT for worker id A362MXJ7439Q76
Processing Custom workplace survey HIT for worker id A2YO837C0O1E91
Processing Custom workplace survey HIT for worker id A2JP9IKRHNLRPI
Processing Custom workplace survey HIT for worker id A3HZFB2JLF3JMY
Processing Custom workplace survey HIT for worker id ADXJ0I0MRHLPX
Processing Custom workplace survey HIT for worker id A2BNQ8VAMP3ZYO


In [82]:
outcome_df = pd.DataFrame(outcome_data)
outcome_df

Unnamed: 0,hit_id,worker_id,submitted,reward,stage2_accept_time,stage2_submit_time,stage2_accept_submit_gap,assignment_id,stage1_accept_time,stage1_submit_time,stage1_accept_submit_gap,stage1_age,stage1_onlinehrs,stage1_reason
0,3I6NF2WGIHX8PYC1HO1FAIYD7755GL,A362MXJ7439Q76,False,1.5,NaT,NaT,NaT,3JAOYWH7VJ5AZXQ8L7GENA4EPM3L9J,2021-11-20 12:28:25-08:00,2021-11-20 12:31:19-08:00,0 days 00:02:54,30 1991,40,skills
1,309D674SH0MXX2D5GCW9NDDQJBYBCH,A2YO837C0O1E91,True,0.5,2021-11-20 11:33:11-08:00,2021-11-20 12:12:51-08:00,0 days 00:39:40,3WYP994K18S7HDV96F2SX8F880X6YJ,2021-11-20 11:04:58-08:00,2021-11-20 11:22:49-08:00,0 days 00:17:51,31,30,money
2,337F8MIIM0ETIBXGPHT2KY5J0UK40B,A2JP9IKRHNLRPI,False,1.5,NaT,NaT,NaT,3R2UR8A0IBH86EB31F2KVUUQRUWOXJ,2021-11-20 10:49:26-08:00,2021-11-20 11:12:57-08:00,0 days 00:23:31,26,20,money
3,3F6045TU7EPCBBR5688MUHHSPQS99O,A3HZFB2JLF3JMY,True,0.5,2021-11-20 11:12:46-08:00,2021-11-20 11:40:18-08:00,0 days 00:27:32,386PBUZZXGYZ593H7J7RC4IHLRCJL9,2021-11-20 10:48:44-08:00,2021-11-20 11:11:19-08:00,0 days 00:22:35,24,50,fun
4,3VDI8GSXAGURCGV3FM8IM2849X5G8X,ADXJ0I0MRHLPX,True,1.5,2021-11-20 11:06:48-08:00,2021-11-20 11:22:31-08:00,0 days 00:15:43,3NL0RFNU0GOYIL2EJC8MR2GJEUHK4G,2021-11-20 10:50:55-08:00,2021-11-20 10:54:07-08:00,0 days 00:03:12,30,30,skills
5,3UOMW19E6E7V8MWWF25WKRDY9FC5CA,A2BNQ8VAMP3ZYO,True,0.5,2021-11-20 10:53:16-08:00,2021-11-20 10:59:23-08:00,0 days 00:06:07,3PS7W85Z803YIAYDUGI591EVJZM9T7,2021-11-20 10:48:45-08:00,2021-11-20 10:49:17-08:00,0 days 00:00:32,28,40,money


In [77]:
high_df = outcome_df[outcome_df['reward'] == "1.50"].copy()
low_df = outcome_df[outcome_df['reward'] == "0.50"].copy()

In [78]:
high_df['submitted'].value_counts()

False    2
True     1
Name: submitted, dtype: int64

In [79]:
low_df['submitted'].value_counts()

True    3
Name: submitted, dtype: int64

### Now merge in the Qualtrics data

In [83]:
qual_fname = "Job+Quality+NonWM+12.0_November+21,+2021_16.06.xlsx"
qual_df = pd.read_excel(f"../results/qualtrics/{qual_fname}")

  warn("Workbook contains no default style, apply openpyxl's default")


In [84]:
qual_df = qual_df.iloc[1:].copy()

In [85]:
qual_df

Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,...,generated_jobtitle_21,generated_jobtitle_22,generated_jobtitle_23,generated_jobtitle_24,generated_jobtitle_25,generated_jobtitle_26,generated_jobtitle_27,generated_jobtitle_28,generated_jobtitle_29,generated_jobtitle_30
1,2021-10-23 14:33:01,2021-10-23 14:37:45,IP Address,67.172.249.152,100.0,283.0,True,2021-10-23 14:37:46.219000,R_OvtQX4YtEzYaUZX,,...,proofreader,proofreader,proofreader,proofreader,proofreader,proofreader,proofreader,proofreader,proofreader,proofreader
2,2021-10-23 14:40:47,2021-10-23 14:47:56,IP Address,73.123.179.39,100.0,429.0,True,2021-10-23 14:47:58.519000,R_pt5l94KDW0qHEmR,,...,associate,associate,associate,associate,associate,associate,associate,associate,associate,associate
3,2021-10-23 15:07:12,2021-10-23 15:15:33,IP Address,68.199.9.22,100.0,501.0,True,2021-10-23 15:15:34.119000,R_3MlnSGQYDb1YMvD,,...,Financial Controller,Financial Controller,Financial Controller,Financial Controller,Financial Controller,Financial Controller,Financial Controller,Financial Controller,Financial Controller,Financial Controller
4,2021-10-23 15:08:14,2021-10-23 15:18:51,IP Address,173.47.178.180,100.0,636.0,True,2021-10-23 15:18:52.414000,R_1FqutLiMQyEclGS,,...,dept. mgr.,dept. mgr.,dept. mgr.,dept. mgr.,dept. mgr.,dept. mgr.,dept. mgr.,dept. mgr.,dept. mgr.,dept. mgr.
5,2021-10-23 15:15:02,2021-10-23 15:19:08,IP Address,71.175.7.39,100.0,246.0,True,2021-10-23 15:19:09.109000,R_BzvuLorhwD3d009,,...,contractor,contractor,contractor,contractor,contractor,contractor,contractor,contractor,contractor,contractor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,2021-11-20 12:07:22,2021-11-20 12:22:20,IP Address,45.42.201.14,100.0,897.0,True,2021-11-20 12:22:21.014000,R_XTGonafhoVTmGzL,,...,IT,IT,IT,IT,IT,IT,IT,IT,IT,IT
137,2021-11-20 12:21:11,2021-11-20 12:39:46,IP Address,72.221.115.167,100.0,1114.0,True,2021-11-20 12:39:47.163000,R_1kHPDtD5JdFbRgz,,...,retail manager,retail manager,retail manager,retail manager,retail manager,retail manager,retail manager,retail manager,retail manager,retail manager
138,2021-11-20 13:08:37,2021-11-20 13:12:46,IP Address,206.54.198.154,100.0,248.0,True,2021-11-20 13:12:46.561000,R_1dc1vBYFP9bCtOo,,...,Data QA,Data QA,Data QA,Data QA,Data QA,Data QA,Data QA,Data QA,Data QA,Data QA
139,2021-11-20 12:46:54,2021-11-20 12:47:12,IP Address,98.212.187.205,7.0,17.0,False,2021-11-21 12:47:14.145000,R_1Ej5sBlSdiizamu,,...,,,,,,,,,,


In [86]:
qual_df.rename(columns={'mt_id':'worker_id'}, inplace=True)

In [87]:
qual_df['worker_id']

1      A1QUQ0TV9KVD4C
2       A5P12YJP805RG
3       AFFXVHHBUWW4D
4      A2MOKIEQZ0OF2M
5       AKQAI78JTXXC9
            ...      
136     ADXJ0I0MRHLPX
137    A3HZFB2JLF3JMY
138    A2YO837C0O1E91
139    A2JP9IKRHNLRPI
140    A2YO837C0O1E91
Name: worker_id, Length: 140, dtype: object

In [88]:
# Merge on mt_id
merged_df = outcome_df.merge(qual_df, on='worker_id', how='left', indicator=True)

In [89]:
merged_df['_merge'].value_counts()

both          6
left_only     1
right_only    0
Name: _merge, dtype: int64

In [90]:
to_drop = [c for c in merged_df.columns 
           if c.startswith("name_") or c.startswith("cur_") or c.startswith("val_") or "tense" in c]
merged_df.drop(columns=to_drop, inplace=True)

In [91]:
merged_df['stage2_accept_time']

0                         NaT
1   2021-11-20 11:33:11-08:00
2   2021-11-20 11:33:11-08:00
3                         NaT
4   2021-11-20 11:12:46-08:00
5   2021-11-20 11:06:48-08:00
6   2021-11-20 10:53:16-08:00
Name: stage2_accept_time, dtype: datetime64[ns, tzlocal()]

In [92]:
# Smh. If someone managed to do it twice, just take the most recent data
merged_df['worker_id']

0    A362MXJ7439Q76
1    A2YO837C0O1E91
2    A2YO837C0O1E91
3    A2JP9IKRHNLRPI
4    A3HZFB2JLF3JMY
5     ADXJ0I0MRHLPX
6    A2BNQ8VAMP3ZYO
Name: worker_id, dtype: object

In [93]:
merged_df.drop_duplicates(subset=['worker_id'], keep='last', inplace=True)

In [94]:
merged_df['worker_id']

0    A362MXJ7439Q76
2    A2YO837C0O1E91
3    A2JP9IKRHNLRPI
4    A3HZFB2JLF3JMY
5     ADXJ0I0MRHLPX
6    A2BNQ8VAMP3ZYO
Name: worker_id, dtype: object

In [95]:
#merged_df.to_stata("../results_2stage/results_qualtrics.dta")

In [104]:
dt_now = datetime.datetime.now()
dt_no_ms = str(dt_now).split(".")[0]
str(dt_no_ms)
dt_final = dt_no_ms.replace(" ","_").replace(":","")
dt_final

'2021-11-21_152937'

In [105]:
merged_df.to_csv(f"../results_2stage/results_qualtrics_{dt_final}.csv", index=False)