In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import datetime

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import tqdm

import mtglobals

In [4]:
# Create the API client that is passed to the mtglobals helpers below. The
# account-balance message in this cell's output is printed while the call runs.
# NOTE(review): mt_env is not referenced again in the cells visible here.
client, mt_env = mtglobals.gen_client()

Your account balance is 498885.58


### (a) Get all HITs since `date_cutoff`

In [5]:
# Start of the download window: this date is handed to download_all_hits as
# start_cutoff (the printed message shows the window running from it to now).
date_cutoff = datetime.datetime(year=2021, month=10, day=1)
all_hits = mtglobals.download_all_hits(
    client,
    start_cutoff=date_cutoff,
)

Downloading list of hits from 2021-10-01 00:00:00 to 2022-02-04 12:48:17.035716-07:53 (to ../results_2stage/all_hit_data.pkl)
p2:WabWAzJp7PrhioSDhGdHSyIKP3gPdpTdRYmaOq2h1A44UWANXHyeitJcayDTLrY=
p2:mWhVIdbT15DUDx7FZiJBtx3f5GIeXIe9ARI0HkzUNFyZQ5Fa+WgcknU4reoQVPA=
p2:aOhhgH1dfOYgFHvA1CXvU2NUEnBEUR2RtQSZANXoD1YTVEvAsFFRmbsdFNrn4jg=
p2:Mgi3sQEx16r07deejZZutjqZcUlcKDRjDGNmRBWC67vxmxRuJDeYEa2qj+phpng=
p2:BKZd6kLUhZUwgZiD051XmAN52/0WrkblmO3jlOgaqpdKLz0FoP0r9JbkgbexJ60=
p2:WdzBCKZ+DJZCloxvp0ytgCWrRMzs+GYfpUbm95f9Dp+RcEL73+H7E8NrHu98PJI=
p2:80Y9NDRUuPNbDF+QBcbNyY26aTw89oogAT/Nel723kS1y9LoTos4iEM82pp9/F4=
p2:bb/e9wHnFt0mfumEqJT2mZ6FMnGuAjo1DV8eBIBNLJxO7P0D2mOkpQETeoU258I=
p2:NA5I3d+kPVGZUvBaEN0LIqgwh1RiMTFMEWZqBhy4xgl6KSyox9Qd5f9W4AAhvSHt
p2:mShJXqWcYsCvdYLUGtLdmwFKctBOZ47E8pkSGZk1HUiAlhx/VbvWuz31awJp1naH
p2:iSnBz5E+4mosqG3khjFDRbpoREDV0fSjUm0fWnQFD7QzOADwobzSs3/muJFyfHWw
p2:0XfpUCNUPaYtJvN1jWu1glMqVVWI70YhEXvZYSMsEUgGjujKjsuma69b92zCzxat
p2:Wx9o+5quU7sM3EFlcuq1RrMoTTonROlVf6qzi1QOAiqr3CCyU/T4cjA

In [6]:
# Sanity check: number of HITs returned by the download above.
len(all_hits)

3463

### (b) Specify the quals for each run you want to include in the compiled results

In [7]:
 qual_names = ['Workplace_Survey_Custom06','Workplace_Survey_Custom07',
               'Workplace_Survey_Custom08','Workplace_Survey_Custom09',
               'Workplace_Survey_Custom10','Workplace_Survey_Custom11',
               'Workplace_Survey_Custom12','Workplace_Survey_Custom13',
               'Workplace_Survey_Custom14']
#qual_names = ['Workplace_Survey_Custom12']

In [8]:
# Flatten the per-qualification worker lists into one list, in qual_names
# order. Nothing is de-duplicated here, so a worker returned for more than one
# qualification appears once per qualification.
all_worker_ids = [
    wid
    for qual_name in qual_names
    for wid in mtglobals.get_workers_with_qual(client, qual_name)
]

In [9]:
len(all_worker_ids)

3016

### (c) Get the HITs for each worker across the quals from the previous step

In [10]:
# Pair every worker ID with whatever get_hit_for_worker returns for it from
# all_hits, giving a list of (worker_id, hit) tuples.
worker_tuples = []
for wid in all_worker_ids:
    worker_tuples.append((wid, mtglobals.get_hit_for_worker(all_hits, wid)))

In [11]:
# Build one flat record per (worker_id, hit) pair; these records become the
# rows of the regression dataset.
all_submit_info = []
for worker_id, cur_hit in tqdm.tqdm(worker_tuples):
    creation = cur_hit['CreationTime']
    all_submit_info.append({
        'worker_id': worker_id,
        # Reward is stored as returned by the API; it is converted to a number
        # only when the log wage is computed.
        'wage': cur_hit['Reward'],
        # 0/1 accepted vs. not accepted, computed as
        # 1 - NumberOfAssignmentsAvailable.
        # NOTE(review): this is only 0/1 if each HIT offers exactly one
        # assignment -- confirm against how the HITs were created.
        'accepted': 1 - cur_hit['NumberOfAssignmentsAvailable'],
        # Timezone info is dropped (the wall-clock value is kept unchanged);
        # the original offset survives in the string column below.
        'stage2_creation': creation.replace(tzinfo=None),
        'stage2_creation_str': str(creation),
    })

100%|██████████| 3016/3016 [00:00<00:00, 88706.09it/s]


In [12]:
#cur_hit

In [13]:
# Inspect the first record to check the field names and value types.
all_submit_info[0]

{'worker_id': 'A1AHWC9TUBF2Z4',
 'wage': '1.50',
 'accepted': 0,
 'stage2_creation': datetime.datetime(2021, 12, 8, 19, 16, 16),
 'stage2_creation_str': '2021-12-08 19:16:16-08:00'}

### (d) Transform the downloaded data into .dta form for regression

In [15]:
# Collect the per-worker records into a DataFrame (one row per record, one
# column per dict key). This is the frame that is later exported to .dta for
# regression.
accepted_df = pd.DataFrame(all_submit_info)

In [16]:
# Row count of the compiled frame (should match the number of records built).
accepted_df.shape[0]

3016

In [17]:
# Log wage regressor. 'wage' holds the reward as text (e.g. '1.50'), so cast
# the whole column to float once and take the natural log in a single
# vectorized call instead of running a Python lambda per row.
accepted_df['lwage'] = np.log(accepted_df['wage'].astype(float))

In [19]:
# OLS of the 0/1 `accepted` indicator on log wage, fitted with the HC1
# covariance estimator for the reported standard errors.
ols_model = smf.ols(formula='accepted ~ lwage', data=accepted_df)
reg_result = ols_model.fit(cov_type='HC1')
print(reg_result.summary())

                            OLS Regression Results                            
Dep. Variable:               accepted   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     11.61
Date:                Fri, 04 Feb 2022   Prob (F-statistic):           0.000666
Time:                        12:52:29   Log-Likelihood:                -1823.8
No. Observations:                3016   AIC:                             3652.
Df Residuals:                    3014   BIC:                             3664.
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.7331      0.008     90.876      0.0

In [20]:
# Sample mean of the dependent variable, i.e. the share of rows with accepted == 1.
accepted_df['accepted'].mean()

0.730106100795756

### (e) Check and record any workers who are suspended

(A worker is treated as suspended when the log shows that the API failed to notify them.)

In [21]:
# Worker IDs in row order, as a plain Python list for the log search below.
wids = accepted_df['worker_id'].tolist()

In [22]:
# Read the log into memory as a list of lines, each stripped of surrounding
# whitespace and lower-cased so later substring searches are case-insensitive.
with open('./mt_log.txt', 'r', encoding='utf-8') as log_file:
    mt_log = []
    for raw_line in log_file:
        mt_log.append(raw_line.strip().lower())

In [23]:
def get_wid_lines(worker_id, log_lines=None):
    """Return the log lines that mention ``worker_id``.

    Matching is a plain substring test against the lower-cased worker ID, so
    the lines being searched are expected to be lower-cased already.

    Args:
        worker_id: Worker ID string to search for (any case).
        log_lines: Optional iterable of lower-cased log lines to search.
            When omitted, the module-level ``mt_log`` list is used.

    Returns:
        A list of the matching lines, in their original order.
    """
    if log_lines is None:
        log_lines = mt_log
    # Lower-case the ID once rather than once per log line.
    needle = worker_id.lower()
    return [line for line in log_lines if needle in line]
# Map each worker ID to the log lines that mention it (repeated IDs collapse
# to a single key).
wid_lines = dict((wid, get_wid_lines(wid)) for wid in wids)

In [24]:
# For every worker keep only the log lines containing "fail", then list the
# workers that have at least one such line as (worker_id, fail_lines) tuples.
fail_lines_by_wid = {
    wid: [line for line in lines if "fail" in line]
    for wid, lines in wid_lines.items()
}
failures = [
    (wid, fail_lines)
    for wid, fail_lines in fail_lines_by_wid.items()
    if fail_lines
]

In [25]:
# Worker IDs that have at least one failure line, in the order they were found.
fail_wids = [wid for wid, _fail_lines in failures]

In [26]:
# Full set of log lines (not just the failure lines) for each flagged worker.
fail_all = {wid: wid_lines[wid] for wid in wid_lines if wid in fail_wids}

In [27]:
# Number of distinct workers with at least one "fail" line in the log.
len(fail_all)

91

In [28]:
# And merge

In [29]:
# Flag rows whose worker ID has at least one failure line in the log.
# Series.isin yields the same boolean column as the per-row membership test,
# without a Python lambda or a list scan per row.
accepted_df['suspended'] = accepted_df['worker_id'].isin(fail_wids)

In [30]:
# Tally flagged vs. unflagged rows as a check on the merge above.
accepted_df['suspended'].value_counts()

False    2925
True       91
Name: suspended, dtype: int64

### (f) Export the dataset, including suspended user data, to .dta with current timestamp

In [18]:
# Compact local-time stamp (YYYYMMDD_HHMMSS) used to tag the export file name.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

In [32]:
# Write the compiled dataset to a timestamped Stata file so earlier exports
# are not overwritten.
# NOTE(review): to_stata is called with its defaults, which in pandas also
# write the DataFrame index as a column -- confirm that is wanted downstream.
output_fpath = f'../results_2stage/accepted_df_{timestamp}.dta'
accepted_df.to_stata(output_fpath)