In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime

import boto3
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

from mtclient import MTClient
import mtglobals

### (a) Get all hits since `date_cutoff`

In [3]:
# Build the project's MTurk client wrapper and download the HIT records.
# NOTE(review): judging by this cell's output, the call also reports the account
# balance and saves the downloaded data to a pickle file — confirm in mtclient.
client = MTClient()
all_hits = client.download_all_hits()

Your account balance is 2481.66
Downloading list of hits from 2021-10-30 00:00:00+00:00 to 2022-09-21 17:27:07.117372-07:53
p2:dPZUAoRE7HA7o3GLotDDQ3EXkCYTOBlfBg8RV9hP05Gb7NKhysqDHGjgGQ6kdJM=
Saving downloaded HIT data to ../results_2stage/all_hit_data.pkl


### (b) Get all workers with the qual for the current run

In [4]:
# Fetch the qualification record for the current run and display its name.
# Only the 'qual_name' entry of the record is used in this notebook.
current_qual = mtglobals.get_current_qual()
current_qual['qual_name']

'Workplace_Survey_00'

In [6]:
# Ask the client for the workers holding the current run's qualification,
# looked up by qualification name.
worker_ids = client.get_workers_with_qual(current_qual['qual_name'])

In [7]:
len(worker_ids)

0

In [46]:
# Print titles for each HIT
#hit_list = [(r_num, r['Title'], r['NumberOfAssignmentsAvailable']) for r_num, r in enumerate(all_hits)]
#len(hit_list)
#hit_list[:20]

In [47]:
# Now load the launched worker HIT info
#launched_df = pd.read_csv(mtglobals.stage2_launched_fpath)
#len(launched_df)

### (c) Get the HITs for each worker id

In [15]:
def get_most_recent(worker_hits):
    """Return the HIT with the latest 'CreationTime' among `worker_hits`.

    Every element must support ``hit['CreationTime']`` and those values must
    be mutually comparable. The input is not modified. An empty input raises
    IndexError.
    """
    def creation_time(hit):
        return hit['CreationTime']

    newest_first = list(worker_hits)
    # Stable descending sort: among equal timestamps the earlier input wins.
    newest_first.sort(key=creation_time, reverse=True)
    return newest_first[0]

def get_active_hits(all_hits, launched_df):
    """Pair every launched worker with the HIT created for them.

    Parameters
    ----------
    all_hits : iterable of dict-like
        HIT records. Each must provide 'Title'; 'CreationTime' is also needed
        whenever one worker matches more than one HIT.
    launched_df : pandas.DataFrame
        One row per launched worker. Only the 'worker_id' column is read.

    Returns
    -------
    list of (worker_id, hit) tuples in the row order of `launched_df`.
    Workers with no matching HIT are reported on stdout and left out.
    """
    active_hits = []
    for _, cur_row in launched_df.iterrows():
        cur_worker_id = cur_row['worker_id']
        # NOTE(review): this is a substring test on the title, so an id that is
        # contained in another id would also match that other worker's HIT.
        worker_hits = [h for h in all_hits if cur_worker_id in h['Title']]
        if not worker_hits:
            print(f"No HIT found for {cur_worker_id}!")
            continue
        if len(worker_hits) > 1:
            # Several HITs carry this worker id: keep the newest one.
            print(f"{len(worker_hits)} HITs for {cur_worker_id}")
            chosen_hit = get_most_recent(worker_hits)
        else:
            chosen_hit = worker_hits[0]
        active_hits.append((cur_worker_id, chosen_hit))
    return active_hits

In [16]:
def get_hit_for_worker(all_hits, worker_id, verbose=False):
    """Return the first HIT in `all_hits` whose title belongs to `worker_id`.

    A HIT record carries its worker id only inside 'Title', so every title is
    passed through mtglobals.worker_id_from_title() and compared for equality.
    `verbose` is kept for interface compatibility; nothing in the body prints.

    Raises
    ------
    Exception
        If no HIT matches `worker_id`.
    """
    matches = []
    for hit in all_hits:
        if mtglobals.worker_id_from_title(hit['Title']) == worker_id:
            matches.append(hit)
    if not matches:
        raise Exception(f"No HIT found for worker {worker_id}")
    return matches[0]

In [8]:
# Get the hits created for all workers with the qual specified above.
# get_hit_for_worker raises when a worker has no HIT, so a single missing HIT
# aborts the whole list.
stage2_hits = [get_hit_for_worker(all_hits, w, verbose=True) for w in worker_ids]

In [9]:
stage2_hits

[]

In [18]:
#stage2_hits = [h for h in all_hits if h['Title'].startswith("Custom workplace survey HIT")]
#stage2_hit_info = get_active_hits(all_hits, launched_df)
#stage2_hits = [i[1] for i in stage2_hit_info]
#print(f"{len(stage2_hits)} stage-2 HITs found")
#stage2_ids = [h['HITId'] for h in stage2_hits]
#info_vars = ['HITId','HITTypeId','HITGroupId','HITLayoutId', #'Title', 'Description',
#            'Reward']

### (d) Get info about + approve the submissions (if any) for these HITs

In [19]:
def check_submitted(hit_id):
    """Fetch the assignment listing for one HIT.

    Calls list_assignments_for_hit on the module-level `client` with only the
    HIT id (no AssignmentStatuses filter is passed) and returns the response
    unchanged.
    """
    return client.list_assignments_for_hit(HITId=hit_id)

In [20]:
# Per-HIT overview tuples: (position, creation time as text, title, reward,
# assignments completed, pending, available).
# The trailing ';' suppresses the notebook's display of the list; drop it to
# inspect the values.
[
    (n, str(h['CreationTime']), h['Title'], h['Reward'], h['NumberOfAssignmentsCompleted'],
     h['NumberOfAssignmentsPending'], h['NumberOfAssignmentsAvailable'])
    for n, h in enumerate(stage2_hits)
];

In [21]:
def process_assignments(assignment_response):
    """Record the first assignment of a HIT and approve it if still pending.

    `assignment_response` must provide 'NumResults' and 'Assignments'. When
    there is at least one result, the first assignment is appended to the
    module-level `submitted_list`; if its status is "Submitted" it is approved
    through the module-level `client` and the response is printed.
    Only the first assignment is ever looked at. Returns None.
    """
    result_count = assignment_response['NumResults']
    assignment_list = assignment_response['Assignments']
    if not result_count > 0:
        return

    first_assignment = assignment_list[0]
    # Recorded regardless of status, so already-approved work is listed too.
    submitted_list.append(first_assignment)
    assignment_id = first_assignment['AssignmentId']
    status = first_assignment['AssignmentStatus']
    if status != "Submitted":
        return

    print(f"Status: {status}")
    response = client.approve_assignment(
        AssignmentId=assignment_id,
        OverrideRejection=True,
    )
    print(f"Approved assignment {assignment_id}")
    print(response)


# Walk every stage-2 HIT: optionally approve its submission, then record
# (worker id, reward, assignments still available) for the analysis below.
approve_submitted = True
submitted_list = []   # filled by process_assignments()
all_submit_info = []
for hit_num, cur_hit in enumerate(stage2_hits):
    cur_title, cur_creation = cur_hit['Title'], cur_hit['CreationTime']
    print(f"HIT #{hit_num}: {cur_title}, created {cur_creation}")
    # Read every field up front so a malformed record fails before any API call.
    cur_id, cur_reward, num_avail = (
        cur_hit['HITId'],
        cur_hit['Reward'],
        cur_hit['NumberOfAssignmentsAvailable'],
    )
    assignment_response = check_submitted(cur_id)
    if approve_submitted:
        process_assignments(assignment_response)
    # The worker id is only available inside the HIT title.
    all_submit_info.append(
        (mtglobals.worker_id_from_title(cur_title), cur_reward, num_avail)
    )

HIT #0: Custom workplace survey HIT for worker id A101034V35PRT0, created 2022-02-02 15:33:35-08:00
HIT #1: Custom workplace survey HIT for worker id A101U3FTZYY87V, created 2022-02-02 15:32:07-08:00
HIT #2: Custom workplace survey HIT for worker id A103A7PRZURWVQ, created 2022-02-02 16:16:13-08:00
HIT #3: Custom workplace survey HIT for worker id A10994CPHWH2VP, created 2022-02-02 15:34:50-08:00
HIT #4: Custom workplace survey HIT for worker id A109ZF92TH9Y8J, created 2022-02-02 15:34:18-08:00
HIT #5: Custom workplace survey HIT for worker id A10HW8JXM17XLD, created 2022-02-02 15:40:24-08:00
HIT #6: Custom workplace survey HIT for worker id A10KKMVN3C5JK2, created 2022-02-02 16:13:31-08:00
HIT #7: Custom workplace survey HIT for worker id A10KRP4WDWI8PN, created 2022-02-02 15:34:20-08:00
HIT #8: Custom workplace survey HIT for worker id A10QHF22I8ON1O, created 2022-02-02 16:16:35-08:00
HIT #9: Custom workplace survey HIT for worker id A10STLCK650JHQ, created 2022-02-02 16:12:50-08:00


In [22]:
len(all_submit_info)

1000

In [23]:
all_submit_info[0]

('A101034V35PRT0', '0.99', 0)

In [24]:
# Flip the availability count into an acceptance flag: 0 assignments left
# becomes 1 (accepted) and 1 assignment left becomes 0.
accepted_info = [
    (worker_id, reward, 1 - num_avail)
    for worker_id, reward, num_avail in all_submit_info
]

Select only the expired HITs here, if necessary

In [25]:
#accepted_info = accepted_info[:8]

### (e) Transform the downloaded data into a .dta for regression

(This is the .dta for *just* this run. The full .dta with all the data across all runs can be generated using 03a_CompileResults.ipynb)

In [26]:
# Transform this into a .dta for regression.
# One row per entry of accepted_info: 'wage' is the HIT reward exactly as
# stored there and 'accepted' is the 0/1 flag computed above.
accepted_df = pd.DataFrame(accepted_info, columns=['worker_id','wage','accepted'])

In [27]:
# Natural log of the offered wage. Cast the whole 'wage' column to float once
# and take the log vectorised instead of converting row by row.
accepted_df['lwage'] = np.log(accepted_df['wage'].astype(float))

In [30]:
# Save the data in Stata format (this run only). The path is relative to the
# notebook's working directory.
# NOTE(review): to_stata is called with defaults, so the DataFrame index is
# presumably written as an extra column — confirm that downstream expects it.
accepted_df.to_stata('../results_2stage/accepted_reg_currun.dta')

### (f) But also run the regression here (for Jeff)

In [31]:
accepted_df

Unnamed: 0,worker_id,wage,accepted,lwage
0,A101034V35PRT0,0.99,1,-0.010050
1,A101U3FTZYY87V,1.02,1,0.019803
2,A103A7PRZURWVQ,0.95,1,-0.051293
3,A10994CPHWH2VP,1.05,0,0.048790
4,A109ZF92TH9Y8J,0.50,0,-0.693147
...,...,...,...,...
995,AZ4YEK3U8T3DY,1.50,1,0.405465
996,AZLKVYU5QZ2Z9,1.02,1,0.019803
997,AZQ4QIEBEFYQL,1.00,1,0.000000
998,AZVC4M3EXGIGI,1.01,0,0.009950


In [32]:
# Get just the info for the most recent HITs
#accepted_info_sub = accepted_info[-106:]
#accepted_info_sub;

In [33]:
# Tally offers and acceptances per wage level, plus an 'overall' bucket.
# Every wage seen gets an entry in both dicts, even with zero acceptances.
totals = {'overall':0}
accepts = {'overall':0}
for cur_info in accepted_info:
    cur_wage, cur_accepted = cur_info[1], cur_info[2]
    gained = 1 if cur_accepted == 1 else 0
    for bucket in ('overall', cur_wage):
        totals[bucket] = totals.get(bucket, 0) + 1
        accepts[bucket] = accepts.get(bucket, 0) + gained

In [34]:
totals

{'overall': 1000,
 '0.99': 75,
 '1.02': 92,
 '0.95': 102,
 '1.05': 93,
 '0.50': 88,
 '1.01': 84,
 '1.50': 93,
 '0.98': 93,
 '0.90': 84,
 '1.00': 100,
 '1.10': 96}

In [35]:
accepts

{'overall': 806,
 '0.99': 64,
 '1.02': 78,
 '0.95': 77,
 '1.05': 86,
 '0.50': 71,
 '1.01': 66,
 '1.50': 77,
 '0.98': 63,
 '0.90': 64,
 '1.00': 83,
 '1.10': 77}

In [49]:
# Report the acceptance rate for every wage level and for the whole sample.
# The keys are strings, so this ordering is lexicographic; 'overall' sorts
# after the numeric wage strings.
for cur_wage in sorted(totals.keys()):
    header = f"{cur_wage}:" if cur_wage == "overall" else f"${cur_wage}:"
    print(header)
    n_offers = totals[cur_wage]
    n_accepted = accepts[cur_wage]
    print(f"{n_accepted} accepted out of {n_offers} offers")
    # With no stage-2 HITs the 'overall' bucket stays at 0; report NaN rather
    # than raising ZeroDivisionError.
    accept_rate = n_accepted / n_offers if n_offers else float('nan')
    print(f"Accept rate: {accept_rate:.4f}")
#print(f"Regression intercept: {intercept:.4f}")

$0.50:
71 accepted out of 88 offers
Accept rate: 0.8068
$0.90:
64 accepted out of 84 offers
Accept rate: 0.7619
$0.95:
77 accepted out of 102 offers
Accept rate: 0.7549
$0.98:
63 accepted out of 93 offers
Accept rate: 0.6774
$0.99:
64 accepted out of 75 offers
Accept rate: 0.8533
$1.00:
83 accepted out of 100 offers
Accept rate: 0.8300
$1.01:
66 accepted out of 84 offers
Accept rate: 0.7857
$1.02:
78 accepted out of 92 offers
Accept rate: 0.8478
$1.05:
86 accepted out of 93 offers
Accept rate: 0.9247
$1.10:
77 accepted out of 96 offers
Accept rate: 0.8021
$1.50:
77 accepted out of 93 offers
Accept rate: 0.8280
overall:
806 accepted out of 1000 offers
Accept rate: 0.8060


### (g) And now compute the elasticity

In [50]:
# Regress the 0/1 acceptance flag on log wage by OLS, using the HC1
# covariance estimator for the standard errors.
acceptance_model = smf.ols(formula='accepted ~ lwage', data=accepted_df)
result = acceptance_model.fit(cov_type='HC1')
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:               accepted   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5364
Date:                Fri, 04 Feb 2022   Prob (F-statistic):              0.464
Time:                        12:45:29   Log-Likelihood:                -490.89
No. Observations:                1000   AIC:                             985.8
Df Residuals:                     998   BIC:                             995.6
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.8069      0.013     64.415      0.0

In [35]:
# Save list of submitters: the WorkerId of every assignment collected in
# submitted_list, in collection order.
submitter_list = [assignment['WorkerId'] for assignment in submitted_list]

In [36]:
# Single-column frame ('worker_id') holding the submitters collected above.
submit_df = pd.DataFrame({'worker_id':submitter_list})

In [37]:
# Write the submitter list to the path configured in mtglobals, without the
# DataFrame index. An existing file at that path is overwritten.
csv_fpath = mtglobals.stage2_submitted_fpath
submit_df.to_csv(csv_fpath, index=False)