In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime

import joblib
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import tqdm

from mtclient import MTClient
import mtglobals

In [3]:
client = MTClient()

Loading environment variables from .env
Using LIVE MTurk requester API
Your account balance is 3992.28


### (a) Get all HITs since `date_cutoff`

In [4]:
# Get the list of HITs you want to compile
all_hits = client.download_all_hits(start_cutoff=datetime.datetime(2022,10,1))

Downloading list of hits from 2022-10-01 00:00:00 to 2022-10-17 17:30:18.963463-07:53
p2:UiU+Nd3Nrzx4JMQCAyOQOsvKOUOZNJzB+3UWIkL9zQKWqmbhUKOPl/I7CL5Oa0w=
p2:w2o6nKmUyaW3utaujy1C5HwvMdRVtuPpHCMYczkjv0RETLvm5TWaWM2U36RTErw=
p2:b0xfgGLEaciecsX+Q8tUcOHdFPSDS9+RauyXZrY0ynPHz1zg+EMZxWfbXO2sVX0=
p2:/OlyXZe6LUdX1l9sok8dmdPeqblv1c9GrphhJvwCb9reKYYxSUNYiTQJR3MNHiM=
p2:0PNEJ+YqgufBQr6daC+dkerzB5RAK0vDRIWdL/YdnVmhRtax6HpHhN1KkOdZI/Q=
p2:nb6+khC+6072DszQnMhDUgKVSEKmtTdMcT0SIkg/CvUtj5F0tQ7h6HlfdNx1yuc=
p2:1xSoSeXvzynAUbK0FPh0nmcJyIwK/1WYhn9KvyzaFHEtxh5KYw32IReV1PYUUwc=
p2:MSIBp5Z22BGM3uVV2zz2otblBajpGRdpniVU/QJC0Bs+ToXd9Q+XzSgLD1rGYug=
p2:B07urhccxGEqykfetb7ChVWbw2CaljAzpDAKGh37U3LhBsnrN92nbu82hGAuOYRJ
p2:Vm6m+FCuuCcJAe7DLkzzBJ+meKOd1oY8wwyP3oPUu7pXRNgnp1qjuxF/PKxvlp14
p2:jvt+BjW+pMENccrHbhXVjP5qBpJ6EZKHTMUOHp8uK4EPSw9kZceE/BXkEBS5Bt6b
Saving downloaded HIT data to ../results_2stage/all_hit_data.pkl


In [5]:
len(all_hits)

1010

### (b) Collect first round responses

In [6]:
stage1_hits = [h for h in all_hits if "instantly unlock" in h['Title']]
stage1_hits

[{'HITId': '3HEADTGN2QTGCI0U3QD4DK6JSDPRV5',
  'HITTypeId': '3IYH2PD9CAK8WJ2510OXIQAXEIJITO',
  'HITGroupId': '3TR9CMHN4O2GSXKBKIAOMCVRLG7BBQ',
  'CreationTime': datetime.datetime(2022, 10, 16, 16, 33, 9, tzinfo=tzlocal()),
  'Title': 'Quick 3-question survey about work [<15 seconds], instantly unlock 2nd-stage HIT with higher reward',
  'Description': 'survey,workplace,work',
  'Question': '<HTMLQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2011-11-11/HTMLQuestion.xsd">\n  <HTMLContent><![CDATA[\n<!DOCTYPE html>\n<html>\n <head>\n  <meta http-equiv=\'Content-Type\' content=\'text/html; charset=UTF-8\'/>\n  <script>\nfunction turkGetParam( name, defaultValue ) { \n   var regexS = "[\\?&]"+name+"=([^&#]*)"; \n   var regex = new RegExp( regexS ); \n   var tmpURL = window.location.href; \n   var results = regex.exec( tmpURL ); \n   if( results == null ) { \n     return defaultValue; \n   } else { \n     return results[1];    \n   } \n}\nfunction decode(s

In [7]:
all_stage1_data = {}
for cur_stage1_hit in stage1_hits:
    # Get the responses
    submissions = client.get_hit_submissions(cur_stage1_hit['HITId'])
    for cur_submission in submissions:
        cur_worker_id = cur_submission['WorkerId']
        cur_answer_xml = cur_submission['Answer']
        cur_answer = mtglobals.parse_stage1_answer(cur_answer_xml)
        cur_accept_time = cur_submission['AcceptTime']
        cur_submit_time = cur_submission['SubmitTime']
        all_stage1_data[cur_worker_id] = {'answer': cur_answer, 'accept_time': cur_accept_time,
                                          'submit_time': cur_submit_time}

In [8]:
len(all_stage1_data)

1000

### (c) Specify the participation quals for each run you want to include in the compiled results

In [9]:
qual_names = ['Workplace_Survey_00']
#qual_names = ['Workplace_Survey_Custom06','Workplace_Survey_Custom07',
#              'Workplace_Survey_Custom08','Workplace_Survey_Custom09',
#              'Workplace_Survey_Custom10','Workplace_Survey_Custom11',
#              'Workplace_Survey_Custom12','Workplace_Survey_Custom13',
#              'Workplace_Survey_Custom14']
#qual_names = ['Workplace_Survey_Custom12']

In [10]:
all_worker_ids = []
for cur_qual_name in qual_names:
    worker_ids = client.get_workers_with_qual(cur_qual_name)
    all_worker_ids.extend(worker_ids)

In [11]:
len(all_worker_ids)

1000

### (d) Get the HITs for each worker across the quals from the previous step

In [12]:
# (worker_id, hit) tuples
worker_data = [{'id': wid, 'hit': mtglobals.get_hit_for_worker(all_hits, wid)} for wid in all_worker_ids]

In [13]:
len(worker_data)

1000

In [14]:
worker_data[0]

{'id': 'A101J0P2UYUKA1',
 'hit': {'HITId': '3NBFJK3IOIJUX8IJM3XEJOY4UAOOG9',
  'HITTypeId': '3XLRCUUVLW6TOPIFGGC3TLMGSFQU4F',
  'HITGroupId': '3AJ89545NQ0EZ0WVU9CENK1COFHBER',
  'CreationTime': datetime.datetime(2022, 10, 13, 15, 32, 15, tzinfo=tzlocal()),
  'Title': 'Custom workplace survey HIT for worker id A101J0P2UYUKA1',
  'Description': 'Custom workplace survey HIT for worker id A101J0P2UYUKA1, 30 questions, ~15mins to complete',
  'Question': '<HTMLQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2011-11-11/HTMLQuestion.xsd">\n  <HTMLContent><![CDATA[\n<!DOCTYPE html>\n<html>\n <head>\n  <meta http-equiv=\'Content-Type\' content=\'text/html; charset=UTF-8\'/>\n  <script>\nfunction turkGetParam( name, defaultValue ) { \n   var regexS = "[\\?&]"+name+"=([^&#]*)"; \n   var regex = new RegExp( regexS ); \n   var tmpURL = window.location.href; \n   var results = regex.exec( tmpURL ); \n   if( results == null ) { \n     return defaultValue; \n   } else 

In [15]:
joblib.dump(worker_data, "workerdata_temp.pkl")

['workerdata_temp.pkl']

In [16]:
all_submit_info = []
for cur_data in tqdm.tqdm(worker_data):
    worker_id = cur_data['id']
    cur_hit = cur_data['hit']
    hit_id = cur_hit['HITId']
    reward = cur_hit['Reward']
    num_avail = cur_hit['NumberOfAssignmentsAvailable']
    creation = cur_hit['CreationTime']
    # Now see if they accepted, and if so, record accept+submit times
    accept_time = None
    submit_time = None
    submissions = client.get_hit_submissions(hit_id)
    if len(submissions) > 0:
        submission = submissions[0]
        accept_time = submission['AcceptTime']
        submit_time = submission['SubmitTime']
    # Convert to non-localized
    creation_naive = creation.replace(tzinfo=None)
    # 1 - this is a 0/1 accepted vs. not accepted
    accepted = 1 - num_avail
    #print(f"Processing {hit_id}, {reward}")
    submit_info = {'worker_id':worker_id, 'wage':reward, 'accepted':accepted,
                   'stage2_creation':creation_naive, 'stage2_creation_str':str(creation),
                   'stage2_accept_time': accept_time, 'stage2_submit_time': submit_time}
    all_submit_info.append(submit_info)

100%|██████████| 1000/1000 [03:06<00:00,  5.36it/s]


In [17]:
#cur_hit

In [18]:
all_submit_info[0]

{'worker_id': 'A101J0P2UYUKA1',
 'wage': '1.05',
 'accepted': 1,
 'stage2_creation': datetime.datetime(2022, 10, 13, 15, 32, 15),
 'stage2_creation_str': '2022-10-13 15:32:15-07:00',
 'stage2_accept_time': datetime.datetime(2022, 10, 13, 15, 41, 19, tzinfo=tzlocal()),
 'stage2_submit_time': datetime.datetime(2022, 10, 13, 15, 48, 31, tzinfo=tzlocal())}

### (e) Transform the downloaded data into .dta form for regression

In [102]:
# Transform this into a .dta for regression
accepted_df = pd.DataFrame(all_submit_info)

In [103]:
len(accepted_df)

1000

In [104]:
accepted_df['lwage'] = accepted_df['wage'].apply(lambda x: np.log(float(x)))

In [105]:
reg_result = smf.ols(formula='accepted ~ lwage', data=accepted_df).fit(cov_type='HC1')
print(reg_result.summary())

                            OLS Regression Results                            
Dep. Variable:               accepted   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.522
Date:                Mon, 17 Oct 2022   Prob (F-statistic):              0.218
Time:                        18:11:31   Log-Likelihood:                -631.00
No. Observations:                1000   AIC:                             1266.
Df Residuals:                     998   BIC:                             1276.
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.7084      0.014     49.193      0.0

In [106]:
# Also get the sample mean of the dependent var
np.mean(accepted_df['accepted'])

0.707

### (f) Merge in worker stage1 answers

In [107]:
def get_answer_data(worker_id):
    return all_stage1_data[worker_id]['answer']

In [108]:
accepted_df[['age','onlinehrs_str','reason']] = accepted_df['worker_id'].apply(lambda x: pd.Series(get_answer_data(x)))

In [109]:
test_id = list(all_stage1_data.keys())[0]
all_stage1_data[test_id]

{'answer': {'age': '38', 'onlinehrs': '8', 'reason': 'money'},
 'accept_time': datetime.datetime(2022, 10, 16, 16, 33, 21, tzinfo=tzlocal()),
 'submit_time': datetime.datetime(2022, 10, 16, 16, 34, 4, tzinfo=tzlocal())}

In [110]:
accepted_df['stage1_accept_time'] = accepted_df['worker_id'].apply(lambda x: all_stage1_data[x]['accept_time'])
accepted_df['stage1_submit_time'] = accepted_df['worker_id'].apply(lambda x: all_stage1_data[x]['submit_time'])

In [111]:
### (f) Clean non-numeric responses

In [112]:
# Handle non-numeric entries
invalid_onlinehrs = ["yes",None,"Amazonmturk","YES","GOOD","yes.","NO","HOURS","$500","$2,000-$5,000"]
invalid_onlinehrs.append("Most freelancing websites list these jobs, and you can sign up on any of them to start working. Potential earnings: Rs 300 to Rs 1,500 per hour")
#accepted_df = accepted_df[~accepted_df['onlinehrs'].isin(invalid_onlinehrs)].copy()
accepted_df.loc[accepted_df['onlinehrs_str'].isin(invalid_onlinehrs), 'onlinehrs_str'] = ""

In [113]:
hrs_replace_map = {
    '10-15': 13,
    '45 hours': 45,
    '40+ hours per week': 40,
    '24 hours': 24,
    '45 to 50': 47,
    'A minimum of three hours per week per credit, plus an additional hour per class each week to review materials.': 4,
    '40 Hours': 40,
    '35+': 35,
    '$80': 80,
    '8 HOURS /30,000 TO 49,000 MONEY': 40,
    '30Hrs': 30,
    '12 Hours': 12,
    '8 HOURS': 40,
    '40+': 40,
    '50$': 50,
    'You should plan to devote a minimum of three hours per week per credit, plus an additional hour per class each week to review materials. For instance, for a three-credit online course, you will need nine hours of study time and one hour of review time each week.': 4,
    '6 HOURS': 6,
    '12 to 14bhours': 13,
    '5$': 54,
    '56-70': 63,
    '48 HOURS': 48,
    '7 HOURS': 7,
    '$10': 10,
    '40 hours per week i am earn money ,i will doing any task .': 40,
    '12hrs': 12,
    '20+': 20,
    '8 hours 500': 8,
    '40 hours': 40,
    '2 hours 500': 2,
    '56 hours': 56,
    '10-12 Hours': 11,
    '41 hours': 41,
    '5hours 700': 5,
}
accepted_df['onlinehrs_str'] = accepted_df['onlinehrs_str'].apply(lambda x: hrs_replace_map[x] if x in hrs_replace_map else x)

In [114]:
# And make sure it can be converted to numeric
accepted_df['onlinehrs'] = pd.to_numeric(accepted_df['onlinehrs_str'])
accepted_df

Unnamed: 0,worker_id,wage,accepted,stage2_creation,stage2_creation_str,stage2_accept_time,stage2_submit_time,lwage,age,onlinehrs_str,reason,stage1_accept_time,stage1_submit_time,onlinehrs
0,A101J0P2UYUKA1,1.05,1,2022-10-13 15:32:15,2022-10-13 15:32:15-07:00,2022-10-13 15:41:19-07:00,2022-10-13 15:48:31-07:00,0.048790,23,40,skills,2022-10-13 15:01:17-07:00,2022-10-13 15:03:12-07:00,40.0
1,A10AKR84P1WXHL,0.98,1,2022-10-03 22:26:00,2022-10-03 22:26:00-07:00,2022-10-03 22:28:30-07:00,2022-10-03 23:05:39-07:00,-0.020203,27,36,money,2022-10-03 22:12:21-07:00,2022-10-03 22:21:54-07:00,36.0
2,A10MSB9X1UFLJ5,0.99,0,2022-10-16 16:48:26,2022-10-16 16:48:26-07:00,NaT,NaT,-0.010050,48,90,money,2022-10-16 16:35:58-07:00,2022-10-16 16:36:56-07:00,90.0
3,A10NMPJSO8TYIT,1.50,1,2022-10-16 16:48:44,2022-10-16 16:48:44-07:00,2022-10-16 16:51:44-07:00,2022-10-16 17:41:26-07:00,0.405465,35,40,money,2022-10-16 16:43:22-07:00,2022-10-16 16:45:06-07:00,40.0
4,A10W3EILX06DFO,0.50,1,2022-10-13 17:29:28,2022-10-13 17:29:28-07:00,2022-10-13 18:07:12-07:00,2022-10-13 19:03:30-07:00,-0.693147,35,40,skills,2022-10-13 17:24:21-07:00,2022-10-13 17:27:05-07:00,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,AZFXFW2ZDX54B,1.00,1,2022-10-14 18:56:27,2022-10-14 18:56:27-07:00,2022-10-15 10:20:48-07:00,2022-10-15 10:39:45-07:00,0.000000,25,120,money,2022-10-14 16:28:45-07:00,2022-10-14 16:38:36-07:00,120.0
996,AZH32KEBRPRES,0.98,1,2022-10-03 22:20:25,2022-10-03 22:20:25-07:00,2022-10-03 22:22:15-07:00,2022-10-03 22:47:16-07:00,-0.020203,32,25,money,2022-10-03 22:15:50-07:00,2022-10-03 22:19:51-07:00,25.0
997,AZNWVHY3AUSNT,0.95,1,2022-10-10 22:04:06,2022-10-10 22:04:06-07:00,2022-10-10 22:07:03-07:00,2022-10-10 22:32:03-07:00,-0.051293,51,48,money,2022-10-10 22:00:11-07:00,2022-10-10 22:02:52-07:00,48.0
998,AZPXDIOTEH989,0.50,1,2022-10-13 15:32:47,2022-10-13 15:32:47-07:00,2022-10-13 15:37:33-07:00,2022-10-13 15:58:15-07:00,-0.693147,25,35,money,2022-10-13 15:01:38-07:00,2022-10-13 15:26:30-07:00,35.0


In [115]:
# And entries with more onlinehrs than hrs in a week
accepted_df.loc[accepted_df['onlinehrs'] > 168, 'onlinehrs'] = np.nan

In [116]:
accepted_df.sort_values(by=['stage2_creation'])

Unnamed: 0,worker_id,wage,accepted,stage2_creation,stage2_creation_str,stage2_accept_time,stage2_submit_time,lwage,age,onlinehrs_str,reason,stage1_accept_time,stage1_submit_time,onlinehrs
967,AWEOVYNS6G6OR,0.98,1,2022-10-03 22:15:47,2022-10-03 22:15:47-07:00,2022-10-03 22:16:22-07:00,2022-10-03 22:21:16-07:00,-0.020203,20,40,money,2022-10-03 22:12:34-07:00,2022-10-03 22:13:01-07:00,40.0
865,AJEAPP3GM7XMJ,0.90,0,2022-10-03 22:15:48,2022-10-03 22:15:48-07:00,NaT,NaT,-0.105361,28,60,skills,2022-10-03 22:12:46-07:00,2022-10-03 22:13:22-07:00,60.0
320,A28Q9L1LRIKSA9,1.10,1,2022-10-03 22:15:49,2022-10-03 22:15:49-07:00,2022-10-03 22:51:43-07:00,2022-10-03 23:09:24-07:00,0.095310,23,13,money,2022-10-03 22:12:45-07:00,2022-10-03 22:13:30-07:00,13.0
834,AG9IZHB76SAJM,1.50,1,2022-10-03 22:15:49,2022-10-03 22:15:49-07:00,2022-10-03 22:16:36-07:00,2022-10-03 22:51:44-07:00,0.405465,24,,fun,2022-10-03 22:12:46-07:00,2022-10-03 22:13:27-07:00,
952,AU8YJU8Y6AHU,0.99,1,2022-10-03 22:15:50,2022-10-03 22:15:50-07:00,2022-10-03 22:39:59-07:00,2022-10-03 22:43:14-07:00,-0.010050,62,2000,money,2022-10-03 22:12:34-07:00,2022-10-03 22:13:39-07:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,A53S7J4JGWG38,0.99,1,2022-10-17 16:58:05,2022-10-17 16:58:05-07:00,2022-10-17 17:08:08-07:00,2022-10-17 17:33:57-07:00,-0.010050,46,20,money,2022-10-16 17:34:06-07:00,2022-10-16 17:51:11-07:00,20.0
341,A2BIB03B8PIC0,0.99,0,2022-10-17 16:58:05,2022-10-17 16:58:05-07:00,NaT,NaT,-0.010050,39,20,money,2022-10-16 17:37:25-07:00,2022-10-16 17:48:16-07:00,20.0
213,A1RWNNYPXUD0T4,0.90,0,2022-10-17 16:58:06,2022-10-17 16:58:06-07:00,NaT,NaT,-0.105361,1988,3,skills,2022-10-16 17:31:25-07:00,2022-10-16 17:52:40-07:00,3.0
908,AP5IV4C2D3XR5,0.98,0,2022-10-17 16:58:07,2022-10-17 16:58:07-07:00,NaT,NaT,-0.020203,46,45,skills,2022-10-16 17:35:12-07:00,2022-10-16 18:08:41-07:00,45.0


In [117]:
drop_same_day = False

In [118]:
# Dropping obs created same day
if drop_same_day:
    accepted_df_full = accepted_df.copy()
    today_ymd = str(datetime.datetime.now()).split()[0]
    accepted_df = accepted_df[~accepted_df['stage2_creation_str'].str.startswith(today_ymd)].copy()
    #2022-10-15 15:10:31

### (g) and extract "sophisticated" workers

In [119]:
# First, pick a subset by stage2_creation date, if needed
#accepted_df = accepted_df[accepted_df['stage2_creation_str'].str.startswith("2022-10-07")].copy()

Now re-do the regression only for experienced workers

In [120]:
reason_money = accepted_df['reason'] == "money"
high_hrs = accepted_df['onlinehrs'] > 10

In [121]:
accepted_df['sophisticated'] = 0
accepted_df.loc[(reason_money & high_hrs), 'sophisticated'] = 1

In [122]:
accepted_df

Unnamed: 0,worker_id,wage,accepted,stage2_creation,stage2_creation_str,stage2_accept_time,stage2_submit_time,lwage,age,onlinehrs_str,reason,stage1_accept_time,stage1_submit_time,onlinehrs,sophisticated
0,A101J0P2UYUKA1,1.05,1,2022-10-13 15:32:15,2022-10-13 15:32:15-07:00,2022-10-13 15:41:19-07:00,2022-10-13 15:48:31-07:00,0.048790,23,40,skills,2022-10-13 15:01:17-07:00,2022-10-13 15:03:12-07:00,40.0,0
1,A10AKR84P1WXHL,0.98,1,2022-10-03 22:26:00,2022-10-03 22:26:00-07:00,2022-10-03 22:28:30-07:00,2022-10-03 23:05:39-07:00,-0.020203,27,36,money,2022-10-03 22:12:21-07:00,2022-10-03 22:21:54-07:00,36.0,1
2,A10MSB9X1UFLJ5,0.99,0,2022-10-16 16:48:26,2022-10-16 16:48:26-07:00,NaT,NaT,-0.010050,48,90,money,2022-10-16 16:35:58-07:00,2022-10-16 16:36:56-07:00,90.0,1
3,A10NMPJSO8TYIT,1.50,1,2022-10-16 16:48:44,2022-10-16 16:48:44-07:00,2022-10-16 16:51:44-07:00,2022-10-16 17:41:26-07:00,0.405465,35,40,money,2022-10-16 16:43:22-07:00,2022-10-16 16:45:06-07:00,40.0,1
4,A10W3EILX06DFO,0.50,1,2022-10-13 17:29:28,2022-10-13 17:29:28-07:00,2022-10-13 18:07:12-07:00,2022-10-13 19:03:30-07:00,-0.693147,35,40,skills,2022-10-13 17:24:21-07:00,2022-10-13 17:27:05-07:00,40.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,AZFXFW2ZDX54B,1.00,1,2022-10-14 18:56:27,2022-10-14 18:56:27-07:00,2022-10-15 10:20:48-07:00,2022-10-15 10:39:45-07:00,0.000000,25,120,money,2022-10-14 16:28:45-07:00,2022-10-14 16:38:36-07:00,120.0,1
996,AZH32KEBRPRES,0.98,1,2022-10-03 22:20:25,2022-10-03 22:20:25-07:00,2022-10-03 22:22:15-07:00,2022-10-03 22:47:16-07:00,-0.020203,32,25,money,2022-10-03 22:15:50-07:00,2022-10-03 22:19:51-07:00,25.0,1
997,AZNWVHY3AUSNT,0.95,1,2022-10-10 22:04:06,2022-10-10 22:04:06-07:00,2022-10-10 22:07:03-07:00,2022-10-10 22:32:03-07:00,-0.051293,51,48,money,2022-10-10 22:00:11-07:00,2022-10-10 22:02:52-07:00,48.0,1
998,AZPXDIOTEH989,0.50,1,2022-10-13 15:32:47,2022-10-13 15:32:47-07:00,2022-10-13 15:37:33-07:00,2022-10-13 15:58:15-07:00,-0.693147,25,35,money,2022-10-13 15:01:38-07:00,2022-10-13 15:26:30-07:00,35.0,1


In [123]:
soph_df = accepted_df[accepted_df['sophisticated'] == 1].copy()
soph_df['accepted'].mean()

0.6893939393939394

In [124]:
reg_result = smf.ols(formula='accepted ~ lwage', data=soph_df).fit(cov_type='HC1')
print(reg_result.summary())

                            OLS Regression Results                            
Dep. Variable:               accepted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                   0.01319
Date:                Mon, 17 Oct 2022   Prob (F-statistic):              0.909
Time:                        18:11:38   Log-Likelihood:                -342.32
No. Observations:                 528   AIC:                             688.6
Df Residuals:                     526   BIC:                             697.2
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.6895      0.020     34.145      0.0

### (h) Check and record any workers who are suspended

(As indicated by the API failing to notify them)

In [125]:
wids = list(accepted_df['worker_id'].values)

In [126]:
with open('./mt_log.txt', 'r', encoding='utf-8') as f:
    mt_log = [l.strip().lower() for l in f.readlines()]

In [127]:
def get_wid_lines(worker_id):
    wid_lines = [line for line in mt_log if worker_id.lower() in line]
    return wid_lines
wid_lines = {wid: get_wid_lines(wid) for wid in wids}

In [128]:
failures = []
for cur_wid, cur_lines in wid_lines.items():
    fail = [l for l in cur_lines if "fail" in l]
    if len(fail) > 0:
        failures.append((cur_wid, fail))

In [129]:
fail_wids = [t[0] for t in failures]

In [130]:
fail_all = {k:v for k, v in wid_lines.items() if k in fail_wids}

In [131]:
len(fail_all)

2

In [132]:
# And merge

In [133]:
accepted_df['suspended'] = accepted_df['worker_id'].apply(lambda x: True if x in fail_wids else False)

In [134]:
accepted_df['suspended'].value_counts()

False    998
True       2
Name: suspended, dtype: int64

### (i) Export the dataset, including suspended user data, to .dta with current timestamp

In [135]:
timestamp = mtglobals.gen_timestamp()

In [136]:
accepted_df['onlinehrs_str']

0       40
1       36
2       90
3       40
4       40
      ... 
995    120
996     25
997     48
998     35
999     10
Name: onlinehrs_str, Length: 1000, dtype: object

In [137]:
accepted_df['onlinehrs_str'].value_counts(dropna=False)

40      126
35       91
20       64
30       57
45       48
       ... 
11        1
140       1
56        1
50        1
5.00      1
Name: onlinehrs_str, Length: 113, dtype: int64

In [138]:
accepted_df_nostr = accepted_df.drop(columns=['onlinehrs_str']).copy()
accepted_df_nostr['stage1_accept_time'] = accepted_df_nostr['stage1_accept_time'].apply(str)
accepted_df_nostr['stage1_submit_time'] = accepted_df_nostr['stage1_submit_time'].apply(str)
accepted_df_nostr['stage2_accept_time'] = accepted_df_nostr['stage2_accept_time'].apply(str)
accepted_df_nostr['stage2_submit_time'] = accepted_df_nostr['stage2_submit_time'].apply(str)

In [139]:
output_fpath = f'../results_2stage/accepted_df_{timestamp}.dta'
accepted_df_nostr.to_stata(output_fpath, version=117)
print(f"Saved to {output_fpath}")

Saved to ../results_2stage/accepted_df_20221017_181143.dta


In [140]:
accepted_df.to_pickle(output_fpath.replace(".dta",".pkl"))

In [141]:
accepted_df

Unnamed: 0,worker_id,wage,accepted,stage2_creation,stage2_creation_str,stage2_accept_time,stage2_submit_time,lwage,age,onlinehrs_str,reason,stage1_accept_time,stage1_submit_time,onlinehrs,sophisticated,suspended
0,A101J0P2UYUKA1,1.05,1,2022-10-13 15:32:15,2022-10-13 15:32:15-07:00,2022-10-13 15:41:19-07:00,2022-10-13 15:48:31-07:00,0.048790,23,40,skills,2022-10-13 15:01:17-07:00,2022-10-13 15:03:12-07:00,40.0,0,False
1,A10AKR84P1WXHL,0.98,1,2022-10-03 22:26:00,2022-10-03 22:26:00-07:00,2022-10-03 22:28:30-07:00,2022-10-03 23:05:39-07:00,-0.020203,27,36,money,2022-10-03 22:12:21-07:00,2022-10-03 22:21:54-07:00,36.0,1,False
2,A10MSB9X1UFLJ5,0.99,0,2022-10-16 16:48:26,2022-10-16 16:48:26-07:00,NaT,NaT,-0.010050,48,90,money,2022-10-16 16:35:58-07:00,2022-10-16 16:36:56-07:00,90.0,1,False
3,A10NMPJSO8TYIT,1.50,1,2022-10-16 16:48:44,2022-10-16 16:48:44-07:00,2022-10-16 16:51:44-07:00,2022-10-16 17:41:26-07:00,0.405465,35,40,money,2022-10-16 16:43:22-07:00,2022-10-16 16:45:06-07:00,40.0,1,False
4,A10W3EILX06DFO,0.50,1,2022-10-13 17:29:28,2022-10-13 17:29:28-07:00,2022-10-13 18:07:12-07:00,2022-10-13 19:03:30-07:00,-0.693147,35,40,skills,2022-10-13 17:24:21-07:00,2022-10-13 17:27:05-07:00,40.0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,AZFXFW2ZDX54B,1.00,1,2022-10-14 18:56:27,2022-10-14 18:56:27-07:00,2022-10-15 10:20:48-07:00,2022-10-15 10:39:45-07:00,0.000000,25,120,money,2022-10-14 16:28:45-07:00,2022-10-14 16:38:36-07:00,120.0,1,False
996,AZH32KEBRPRES,0.98,1,2022-10-03 22:20:25,2022-10-03 22:20:25-07:00,2022-10-03 22:22:15-07:00,2022-10-03 22:47:16-07:00,-0.020203,32,25,money,2022-10-03 22:15:50-07:00,2022-10-03 22:19:51-07:00,25.0,1,False
997,AZNWVHY3AUSNT,0.95,1,2022-10-10 22:04:06,2022-10-10 22:04:06-07:00,2022-10-10 22:07:03-07:00,2022-10-10 22:32:03-07:00,-0.051293,51,48,money,2022-10-10 22:00:11-07:00,2022-10-10 22:02:52-07:00,48.0,1,False
998,AZPXDIOTEH989,0.50,1,2022-10-13 15:32:47,2022-10-13 15:32:47-07:00,2022-10-13 15:37:33-07:00,2022-10-13 15:58:15-07:00,-0.693147,25,35,money,2022-10-13 15:01:38-07:00,2022-10-13 15:26:30-07:00,35.0,1,False
