Merge the MTurk results with the Qualtrics results

In [337]:
import glob

import pandas as pd
import numpy as np

Load MTurk data

In [338]:
# Read the MTurk pilot results into a DataFrame.
# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, which
# suggests the CSV was saved together with its index — confirm whether that
# column is wanted downstream (it is carried through to the exported file).
mturk_csv_path = "../results/mturk_2stage/pilot_results.csv"
mt_df = pd.read_csv(mturk_csv_path)

In [339]:
# Show only the first row as a quick check of the available columns
mt_df.head(n=1)

Unnamed: 0,HITId,HITTypeId,Title,Description,Keywords,reward_stage1,reward_stage2,CreationTime,MaxAssignments,RequesterAnnotation,...,RejectionTime,RequesterFeedback,WorkTimeInSeconds,LifetimeApprovalRate,Last30DaysApprovalRate,Last7DaysApprovalRate,Answer.age,Answer.onlinehrs,Answer.reason,completed_stage2
0,3SA4EMRVJW3F74JZN1LQNWGEP4RP0O,379Y3Z5FS69XWD67TT3RQRTBXFCT19,Quick 3-question survey about work [<15 second...,Quick <15 second survey about workplace condit...,"survey,workplace,work",$0.10,$0.95,Wed Sep 29 13:35:54 PDT 2021,5,BatchId:4566303;OriginalHitTemplateId:920937340;,...,,,59,100% (1/1),100% (1/1),0% (0/0),26,25,money,False


In [340]:
# Short column names for the MTurk answer fields; `WorkerId` becomes `mt_id`,
# which is the key used to join against the Qualtrics export.
mt_rename = {
    'Answer.age': 'age',
    'Answer.onlinehrs': 'onlinehrs',
    'Answer.reason': 'reason',
    'WorkerId': 'mt_id',
    'completed_stage2': 'submitted_hit',
}
# Rebind instead of mutating in place so the renamed frame has explicit lineage
mt_df = mt_df.rename(columns=mt_rename)

### And load Qualtrics data

In [341]:
# List the Excel exports available in the results folder
qualtrics_xlsx_pattern = "../results/mturk_2stage/*.xlsx"
glob.glob(qualtrics_xlsx_pattern)

['../results/mturk_2stage\\Job+Quality+NonWM+10.0_October+6,+2021_18.59.xlsx',
 '../results/mturk_2stage\\Job+Quality+NonWM+11.0_October+11,+2021_20.38.xlsx']

In [342]:
# Load one specific Qualtrics export (the file name is hard-coded, so this
# must be updated by hand when a newer export is downloaded).
qualtrics_xlsx_path = "../results/mturk_2stage/Job+Quality+NonWM+11.0_October+11,+2021_20.38.xlsx"
qual_df = pd.read_excel(qualtrics_xlsx_path)

  warn("Workbook contains no default style, apply openpyxl's default")


In [343]:
# Discard the first row of the export (index label 0).
# NOTE(review): presumably this is Qualtrics' question-text row rather than a
# response — confirm against the raw export.
# The row is removed by label instead of by position so the cell is idempotent:
# a positional `iloc[1:]` would discard one more real response on every re-run,
# whereas this is a no-op once label 0 is gone.
qual_df = qual_df.drop(index=0, errors="ignore")

In [344]:
# Friendlier name for the duration column
qual_rename = {'Duration (in seconds)': 'duration_seconds'}
# Columns removed before merging.
# NOTE(review): `IPAddress` is not in this list; it is still present in the
# column listing after this step and so ends up in the exported CSV — confirm
# that this is intended.
qual_drop = [
    'StartDate', 'EndDate', 'RecordedDate',
    'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
    'ExternalReference', 'LocationLatitude', 'LocationLongitude',
    'Finished', 'what_would_it_take',
]
# Chain the two steps and rebind rather than mutating the frame in place
qual_df = qual_df.rename(columns=qual_rename).drop(columns=qual_drop)

In [345]:
# Inspect the join key on the Qualtrics side
qual_df.loc[:, 'mt_id']

1    A1XJAPEPY9XDOP
2    A23N4V8XW6Y49I
3    A3HNEYFOIJWPH1
4    A23N4V8XW6Y49I
Name: mt_id, dtype: object

In [346]:
# Print the first 50 Qualtrics column names on a single line
print(list(qual_df.columns[:50]), end="")

['Status', 'IPAddress', 'Progress', 'duration_seconds', 'ResponseId', 'DistributionChannel', 'UserLanguage', 'state', 'race', 'race_6_TEXT', 'gender', 'gender_4_TEXT', 'education', 'currently_employed', 'fulltime', 'most_recent_job', 'most_recent_fulltime', 'weeks_unemployment', 'how_easy', 'lowest_wage', 'savings', 'jobtitle', 'wage', 'hrs', 'controlhrs', 'sickleave', 'friends', 'commute', 'physical', 'skills', 'vaccine', 'express', 'coworkers', 'suprespect', 'supfair', 'response1', 'response2', 'response3', 'response4', 'response5', 'response6', 'response7', 'response8', 'response9', 'response10', 'response11', 'response12', 'response13', 'response14', 'response15']

### And merge

In [347]:
# Left join: keep every MTurk row and attach the Qualtrics columns for the
# matching `mt_id` where one exists. `indicator=True` adds a `_merge` column
# that records whether a row matched ('both') or not ('left_only').
# NOTE(review): the Qualtrics preview shows the same `mt_id` on two rows
# (A23N4V8XW6Y49I); a left join emits one output row per matching Qualtrics
# row, so that worker's MTurk row appears more than once in the result —
# confirm this is intended or de-duplicate before merging.
merge_df = mt_df.merge(
    qual_df,
    on='mt_id',
    how='left',
    indicator=True,
)

In [348]:
# Convert the numeric Qualtrics columns back to numeric dtype.
# NOTE(review): the need for this presumably comes from the extra leading row
# of the Qualtrics export (dropped earlier) forcing these columns to object
# dtype on load, rather than from the merge itself — confirm.
obj_vars = ['Progress','duration_seconds']
for cur_var in obj_vars:
    merge_df[cur_var] = pd.to_numeric(merge_df[cur_var])
# Free-text "other" columns: replace missing values with empty strings.
# Only the missing entries are filled, so any text a respondent did type is
# preserved (assigning "" to the whole column would silently erase it).
all_na_vars = ['race_6_TEXT','gender_4_TEXT']
for cur_var in all_na_vars:
    merge_df[cur_var] = merge_df[cur_var].fillna("")

In [349]:
def gen_case(row_data):
    """Classify one merged row by how far the worker got.

    Reads ``row_data['_merge']`` and, unless that value is ``'left_only'``,
    ``row_data['Progress']``.

    Returns:
        "rejected" if ``_merge`` is ``'left_only'`` (no Qualtrics row matched),
        "finished_survey" if ``Progress`` equals 100.0,
        "did_not_finish" otherwise.
    """
    # No matching Qualtrics record for this row
    if row_data['_merge'] == 'left_only':
        return "rejected"
    # A Qualtrics record exists, so the survey was started; Progress says
    # whether it was also completed.
    survey_complete = row_data['Progress'] == 100.0
    return "finished_survey" if survey_complete else "did_not_finish"
# Classify every merged row (gen_case is called once per row) and store the
# label in a new `result` column
row_outcomes = merge_df.apply(gen_case, axis=1)
merge_df['result'] = row_outcomes

In [350]:
# Check the free-text race column after the clean-up above
merge_df.loc[:, 'race_6_TEXT']

0     
1     
2     
3     
4     
5     
6     
7     
8     
9     
10    
Name: race_6_TEXT, dtype: object

In [351]:
#with pd.option_context('max_columns', 100):
#    display(merge_df)

In [352]:
# Because of problems with pandas' to_stata, the merged data is written to .csv instead;
# Label_Vars.ipynb then generates the labelled .dta from that file

In [353]:
# Write the merged frame next to the inputs, without the DataFrame index
merged_csv_path = "../results/mturk_2stage/pilot_results_qualtrics.csv"
merge_df.to_csv(merged_csv_path, index=False)

In [354]:
#merge_df.to_stata("../results/mturk_2stage/pilot_results.dta", variable_labels=labels)