In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

from mtclient import MTClient
import mtglobals

### (a) Get all hits since `start_cutoff`

In [3]:
# Connect to MTurk and download the HITs from the cutoff date up to now
# (see the log output below for the exact window that was used).
# NOTE(review): the cutoff is a naive datetime while the log prints tz-aware
# times with an unusual -07:53 offset -- confirm how MTClient handles timezones.
client = MTClient()
all_hits = client.download_all_hits(start_cutoff=datetime.datetime(2022,11,10,0,0,0))

Loading environment variables from .env
Using LIVE MTurk requester API
Your account balance is 4683.18
Downloading list of hits from 2022-11-10 00:00:00 to 2022-11-22 11:42:22.528918-07:53
p2:VCoe28R6HyrZ6S03sHSgU5A2zb0MJOgjj1a4TqFW7P1SFRvFRQnUojCV7lZsbZo=
p2:dTf+PTfEEU50qltDhnFRNCOh8B9Fjn48fiIzRHEMUzlsTuA3OuvAfd9iZ4j72c8=
p2:G7kw7oS9T+lTtaMfNrRVQ31H4tJOL1LBUzc0p5uZZHLGIsDR0eeAH2A/9SgcYKo=
Creation 2022-11-07 13:59:32-08:00 before start_cutoff
Saving downloaded HIT data to ../results_2stage/all_hit_data.pkl


### (b) Get all workers with the qual for the current run

In [4]:
# Look up the qualification for the current run and display its name
current_qual = mtglobals.get_current_qual()
current_qual['qual_name']

'Workplace_Survey_18'

In [5]:
# Qualification name(s) whose workers are monitored below. Defaults to the
# current wave. To monitor one or more previous waves instead, replace the list
# with their qual names, e.g.
#   stage2_qual_names = ['Workplace_Survey_17']
#   stage2_qual_names = ['Workplace_Survey_16', 'Workplace_Survey_15']
stage2_qual_names = [current_qual['qual_name']]

In [6]:
# Gather the ids of every worker holding any of the selected qualifications.
# Ids are concatenated in qual order, without de-duplication.
worker_ids = [
    qualified_id
    for qual_name in stage2_qual_names
    for qualified_id in client.get_workers_with_qual(qual_name)
]

In [7]:
# Sanity check: how many workers hold the selected qualification(s)
len(worker_ids)

100

In [8]:
# Print titles for each HIT
#hit_list = [(r_num, r['Title'], r['NumberOfAssignmentsAvailable']) for r_num, r in enumerate(all_hits)]
#len(hit_list)
#hit_list[:20]

In [9]:
# Now load the launched worker HIT info
#launched_df = pd.read_csv(mtglobals.stage2_launched_fpath)
#len(launched_df)

### (c) Get the HITs for each worker id

In [10]:
def get_most_recent(worker_hits):
    """Return the HIT with the latest 'CreationTime' among `worker_hits`.

    Args:
        worker_hits: iterable of HIT dicts, each with a 'CreationTime' entry
            that can be compared with the others.

    Returns:
        The HIT dict with the greatest 'CreationTime'. When several share that
        value, the one appearing first in `worker_hits` is returned.

    Raises:
        IndexError: if `worker_hits` is empty.
    """
    # Sort a copy so the caller's list is left untouched
    newest_first = list(worker_hits)
    newest_first.sort(key=lambda hit: hit['CreationTime'], reverse=True)
    return newest_first[0]

def get_active_hits(all_hits, launched_df):
    """Pair every launched worker with the HIT that was created for them.

    A HIT is considered to belong to a worker when the worker id appears as a
    substring of the HIT's 'Title'. If several HITs match, the most recently
    created one (by 'CreationTime') is used. Workers without any matching HIT
    are reported on stdout and left out of the result.

    Args:
        all_hits: list of HIT dicts with at least a 'Title' entry (and a
            'CreationTime' entry whenever a worker has more than one HIT).
        launched_df: DataFrame with a 'worker_id' column. No other column is
            required.

    Returns:
        List of `(worker_id, hit)` tuples, in the row order of `launched_df`.
    """
    active_hits = []
    # Only the worker ids are needed, so iterate the column directly
    for cur_worker_id in launched_df['worker_id']:
        # Find the HIT(s) for this worker
        worker_hits = [h for h in all_hits if cur_worker_id in h['Title']]
        if not worker_hits:
            print(f"No HIT found for {cur_worker_id}!")
            continue
        if len(worker_hits) > 1:
            # Multiple HITs for this worker: keep the most recent one
            print(f"{len(worker_hits)} HITs for {cur_worker_id}")
            chosen_hit = get_most_recent(worker_hits)
        else:
            chosen_hit = worker_hits[0]
        active_hits.append((cur_worker_id, chosen_hit))
    return active_hits

In [11]:
def get_hit_for_worker(all_hits, worker_id, verbose=False):
    """Return the HIT that was created for `worker_id`.

    The HIT only carries the worker id inside its title, so the id is extracted
    from each title with `mtglobals.worker_id_from_title()` and compared.

    Args:
        all_hits: list of HIT dicts with at least a 'Title' entry (and a
            'CreationTime' entry whenever a worker has more than one HIT).
        worker_id: id of the worker whose HIT is wanted.
        verbose: when True, report on stdout if several HITs matched.

    Returns:
        The matching HIT dict. If several HITs match, the one with the latest
        'CreationTime' is returned.

    Raises:
        Exception: if no HIT matches `worker_id`.
    """
    vprint = print if verbose else (lambda *args, **kwargs: None)
    results = [h for h in all_hits if mtglobals.worker_id_from_title(h['Title']) == worker_id]
    if len(results) == 0:
        raise Exception(f"No HIT found for worker {worker_id}")
    if len(results) > 1:
        # Do not silently take an arbitrary match: prefer the newest HIT
        vprint(f"{len(results)} HITs for {worker_id}; using the most recent")
        return max(results, key=lambda h: h['CreationTime'])
    return results[0]

In [12]:
# Look up the HIT created for each worker holding the qual(s) specified above
stage2_hits = [
    get_hit_for_worker(all_hits, qualified_worker, verbose=True)
    for qualified_worker in worker_ids
]

In [13]:
#stage2_hits

In [14]:
#stage2_hits = [h for h in all_hits if h['Title'].startswith("Custom workplace survey HIT")]
#stage2_hit_info = get_active_hits(all_hits, launched_df)
#stage2_hits = [i[1] for i in stage2_hit_info]
#print(f"{len(stage2_hits)} stage-2 HITs found")
#stage2_ids = [h['HITId'] for h in stage2_hits]
#info_vars = ['HITId','HITTypeId','HITGroupId','HITLayoutId', #'Title', 'Description',
#            'Reward']

### (d) Get info about + approve the submissions (if any) for these HITs

In [15]:
def process_assignments(assignment_response):
    """Approve one assignment if its status is "Submitted".

    Assignments in any other status are left untouched. Uses the notebook-level
    `client` to perform the approval and prints the outcome.

    Args:
        assignment_response: assignment dict with 'AssignmentId' and
            'AssignmentStatus' entries.

    Returns:
        None.
    """
    assignment_id = assignment_response['AssignmentId']
    current_status = assignment_response['AssignmentStatus']
    if current_status != "Submitted":
        # Nothing to approve
        return
    print(f"Status: {current_status}")
    approval_result = client.approve_assignment(assignment_id)
    print(f"Approved assignment {assignment_id}")
    print(approval_result)

# Set to False to only inspect the submissions without approving them
approve_submitted = True
# Raw assignment records, one per HIT that has a submission. Used later to
# save the list of submitters.
submitted_list = []
# One summary record per HIT, submitted or not. Feeds the regression data.
all_submit_info = []
for hit_num, cur_hit in enumerate(stage2_hits):
    cur_title = cur_hit['Title']
    cur_creation = cur_hit['CreationTime']
    print(f"HIT #{hit_num}: {cur_title}, created {cur_creation}")
    cur_id = cur_hit['HITId']
    cur_reward = cur_hit['Reward']
    num_avail = cur_hit['NumberOfAssignmentsAvailable']
    assignment_responses = client.get_hit_submissions(cur_id)
    # Should have only one submission
    if len(assignment_responses) > 1:
        raise Exception(f"Multiple responses for HITId {cur_id}")
    elif len(assignment_responses) < 1:
        print("Custom HIT not yet submitted")
    else:
        assignment_response = assignment_responses[0]
        print(assignment_response)
        # Record the submission; without this the submitter list stays empty
        submitted_list.append(assignment_response)
        if approve_submitted:
            process_assignments(assignment_response)
    # Get submission info
    cur_worker_id = mtglobals.worker_id_from_title(cur_title)
    submit_info = {
        'worker_id': cur_worker_id,
        'wage': cur_reward,
        'num_avail': num_avail,
        # Treated as accepted once the assignment is no longer available
        # (assumes each custom HIT offers exactly one assignment -- TODO confirm)
        'accepted': 1-num_avail
    }
    all_submit_info.append(submit_info)

HIT #0: Custom workplace survey HIT for worker id A11JCTPPYRML49, created 2022-11-20 16:35:30-08:00
Custom HIT not yet submitted
HIT #1: Custom workplace survey HIT for worker id A122QSIA01PR3L, created 2022-11-20 17:07:46-08:00
{'AssignmentId': '3X0H8UUIT2P8F6Y6YRN7DGQLOYISWP', 'WorkerId': 'A122QSIA01PR3L', 'HITId': '309D674SH0MXX2D5GCW9NDDQPXACB8', 'AssignmentStatus': 'Submitted', 'AutoApprovalTime': datetime.datetime(2022, 12, 21, 6, 25, 54, tzinfo=tzlocal()), 'AcceptTime': datetime.datetime(2022, 11, 21, 5, 47, 18, tzinfo=tzlocal()), 'SubmitTime': datetime.datetime(2022, 11, 21, 6, 25, 54, tzinfo=tzlocal()), 'Answer': '<?xml version="1.0" encoding="ASCII"?><QuestionFormAnswers xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2005-10-01/QuestionFormAnswers.xsd"><Answer><QuestionIdentifier>surveyCode</QuestionIdentifier><FreeText>81996708</FreeText></Answer></QuestionFormAnswers>'}
Status: Submitted
Approved assignment 3X0H8UUIT2P8F6Y6YRN7DGQLOYISWP
{'ResponseM

In [16]:
# Sanity check: one summary record per stage-2 HIT
len(all_submit_info)

100

In [17]:
# Peek at the first summary record
all_submit_info[0]

{'worker_id': 'A11JCTPPYRML49', 'wage': '1.50', 'num_avail': 1, 'accepted': 0}

Select only the expired HITs here, if necessary

In [18]:
#accepted_info = accepted_info[:8]

### (e) Transform the downloaded data into a .dta for regression

(This is the .dta for *just* this run. The full .dta with all the data across all runs can be generated using 03a_CompileResults.ipynb)

In [19]:
# Transform this into a .dta for regression:
# one row per HIT, with columns worker_id, wage, num_avail and accepted
accepted_df = pd.DataFrame(all_submit_info)

In [20]:
# Log of the offered wage, used as the regressor below. 'wage' holds strings
# (e.g. '1.50'), so convert the whole column to float before taking logs.
accepted_df['lwage'] = np.log(accepted_df['wage'].astype(float))

In [21]:
# Save the data in Stata format (this run only).
# Note that 'wage' is still a string column here; 'lwage' is the numeric one.
accepted_df.to_stata('../results_2stage/accepted_reg_currun.dta')

### (f) But also run the regression here (for Jeff)

In [22]:
# Inspect the per-HIT records that go into the regression
accepted_df

Unnamed: 0,worker_id,wage,num_avail,accepted,lwage
0,A11JCTPPYRML49,1.50,1,0,0.405465
1,A122QSIA01PR3L,1.50,0,1,0.405465
2,A14CBWGILA1VG4,1.01,1,0,0.009950
3,A14FOFNUBU2P5D,1.00,1,0,0.000000
4,A165ZC634AIYVY,0.90,1,0,-0.105361
...,...,...,...,...,...
95,AVZQFHMC30ZN4,0.95,0,1,-0.051293
96,AWDFPDUDNCVOX,1.50,0,1,0.405465
97,AWXHYI0RU9SNV,0.95,1,0,-0.051293
98,AWYKYM16CB00Z,0.90,0,1,-0.105361


In [23]:
# Get just the info for the most recent HITs
#accepted_info_sub = accepted_info[-106:]
#accepted_info_sub;

In [24]:
# Tally offers and acceptances, both overall and per offered wage. Keys are the
# wage strings exactly as stored in all_submit_info, plus the key 'overall'.
totals = {'overall': 0}
accepts = {'overall': 0}
for submit_record in all_submit_info:
    wage_key = submit_record['wage']
    # Register the wage in both tallies so wages with zero accepts still appear
    totals.setdefault(wage_key, 0)
    accepts.setdefault(wage_key, 0)
    was_accepted = submit_record['accepted'] == 1
    for bucket in ('overall', wage_key):
        totals[bucket] += 1
        if was_accepted:
            accepts[bucket] += 1

In [25]:
# Number of offers, overall and per wage
totals

{'overall': 100,
 '1.50': 7,
 '1.01': 13,
 '1.00': 6,
 '0.90': 11,
 '0.50': 11,
 '1.10': 7,
 '0.99': 4,
 '1.05': 11,
 '1.02': 7,
 '0.98': 8,
 '0.95': 15}

In [26]:
# Number of accepted offers, overall and per wage
accepts

{'overall': 53,
 '1.50': 5,
 '1.01': 10,
 '1.00': 3,
 '0.90': 6,
 '0.50': 7,
 '1.10': 4,
 '0.99': 3,
 '1.05': 5,
 '1.02': 2,
 '0.98': 3,
 '0.95': 5}

In [27]:
# Report acceptance counts and rates per wage (ascending by amount), then overall.
# The wage keys are strings, so order them by numeric value: a plain string sort
# would put e.g. '10.00' before '2.00'. 'overall' always comes last.
ordered_keys = sorted(
    totals,
    key=lambda k: (k == "overall", 0.0 if k == "overall" else float(k)),
)
for cur_wage in ordered_keys:
    header = f"{cur_wage}:" if cur_wage == "overall" else f"${cur_wage}:"
    print(header)
    print(f"{accepts[cur_wage]} accepted out of {totals[cur_wage]} offers")
    accept_rate = accepts[cur_wage] / totals[cur_wage]
    print(f"Accept rate: {accept_rate:.4f}")

$0.50:
7 accepted out of 11 offers
Accept rate: 0.6364
$0.90:
6 accepted out of 11 offers
Accept rate: 0.5455
$0.95:
5 accepted out of 15 offers
Accept rate: 0.3333
$0.98:
3 accepted out of 8 offers
Accept rate: 0.3750
$0.99:
3 accepted out of 4 offers
Accept rate: 0.7500
$1.00:
3 accepted out of 6 offers
Accept rate: 0.5000
$1.01:
10 accepted out of 13 offers
Accept rate: 0.7692
$1.02:
2 accepted out of 7 offers
Accept rate: 0.2857
$1.05:
5 accepted out of 11 offers
Accept rate: 0.4545
$1.10:
4 accepted out of 7 offers
Accept rate: 0.5714
$1.50:
5 accepted out of 7 offers
Accept rate: 0.7143
overall:
53 accepted out of 100 offers
Accept rate: 0.5300


In [28]:
# Overall acceptance rate: accepted offers divided by all offers
accept_rate_overall = accepts['overall'] / totals['overall']
accept_rate_overall

0.53

### (g) And now compute the elasticity

In [29]:
# OLS of the acceptance indicator on log wage, with HC1
# (heteroskedasticity-robust) standard errors.
# NOTE(review): the lwage coefficient is the change in acceptance probability
# per unit of log wage; dividing it by the mean acceptance rate would give an
# elasticity -- confirm which quantity is intended.
result = smf.ols(formula='accepted ~ lwage', data=accepted_df).fit(cov_type='HC1')
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:               accepted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.010
Method:                 Least Squares   F-statistic:                   0.01510
Date:                Tue, 22 Nov 2022   Prob (F-statistic):              0.902
Time:                        11:43:11   Log-Likelihood:                -72.391
No. Observations:                 100   AIC:                             148.8
Df Residuals:                      98   BIC:                             154.0
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5287      0.052     10.245      0.0

In [30]:
# Save list of submitters: the worker id of every recorded submission.
# submitted_list is created in the approval cell of section (d).
submitter_list = [submission['WorkerId'] for submission in submitted_list]

In [31]:
# Single-column frame of submitter worker ids
submit_df = pd.DataFrame({'worker_id':submitter_list})

In [32]:
# Write the submitter ids to the path configured in mtglobals (no index column)
csv_fpath = mtglobals.stage2_submitted_fpath
submit_df.to_csv(csv_fpath, index=False)