<a href="https://colab.research.google.com/github/ipeirotis/mturk_demographics/blob/master/1_Fetch_Demographics_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Demographics API

Below we have the code that retrieves the data from the  Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time
import numpy as np
import pandas as pd

In [2]:


# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    fmt = "%Y-%m-%dT%H:%M:%S.%f%z"
    
    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")
    
    if hit_creation_str is None: 
        hit_creation_date = None 
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # convert to unix timestamp
        hit_date_ts = time.mktime(hit_creation_date.timetuple())
        answer_date_ts = time.mktime(hit_answer_date.timetuple())
        diff = int(answer_date_ts-hit_date_ts)
    
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(item["answers"]["gender"]).lower(),
        "household_income": str(item["answers"]["householdIncome"]),
        "educational_level": str(item["answers"].get("educationalLevel")),
        "household_size": str(item["answers"]["householdSize"]),
        "marital_status": str(item["answers"].get("maritalStatus")),
        "languages_spoken": str(item["answers"].get("languagesSpoken")),
        "time_spent_on_mturk": str(item["answers"].get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(item["answers"].get("weeklyIncomeFromMturk")),
        "year_of_birth": int(item["answers"]["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [3]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 1000

# The API call that returns the last survey responses
# The API call that returns the last survey responses
baseurl = "https://demographics.mturk-tracker.com" + \
    "/api/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = ""

# We store the results in this list
results = []

while True:

    url = baseurl + "&cursor=" + nextPageToken

    resp = requests.get(url)
    if resp.status_code == 200:
        data = json.loads(resp.text)
        items = data.get("items")
        if items == None:
            break
        print("Retrieved ", len(items), " responses")
        responses = [flatten(item) for item in items]
        results.extend(responses)
        print("Total of ", len(results), " responses in our data")
    else:
        print("Something went wrong with the network call")

    nextPageToken = data.get("nextPageToken")
    if nextPageToken == None:
        break

Retrieved  1000  responses
Total of  1000  responses in our data
Retrieved  1000  responses
Total of  2000  responses in our data
Retrieved  1000  responses
Total of  3000  responses in our data
Retrieved  1000  responses
Total of  4000  responses in our data
Retrieved  1000  responses
Total of  5000  responses in our data
Retrieved  1000  responses
Total of  6000  responses in our data
Retrieved  1000  responses
Total of  7000  responses in our data
Retrieved  1000  responses
Total of  8000  responses in our data
Retrieved  1000  responses
Total of  9000  responses in our data
Retrieved  1000  responses
Total of  10000  responses in our data
Retrieved  1000  responses
Total of  11000  responses in our data
Retrieved  1000  responses
Total of  12000  responses in our data
Retrieved  1000  responses
Total of  13000  responses in our data
Retrieved  1000  responses
Total of  14000  responses in our data
Retrieved  1000  responses
Total of  15000  responses in our data
Retrieved  1000  re

In [4]:
# Fetch the old data as well (Mar 2015 - Oct 2020)
URL = 'https://github.com/ipeirotis/mturk_demographics/raw/master/mturk_surveys_extended_mar15_oct20.zip'
df_old = pd.read_csv(URL)
df_old = df_old.drop('Unnamed: 0', axis='columns')
df_old

Unnamed: 0,educational_level,gender,hit_answered_date,hit_creation_date,household_income,household_size,languages_spoken,location_city,location_country,location_region,marital_status,post_to_completion_secs,time_spent_on_mturk,weekly_income_from_mturk,worker_id,year_of_birth
0,Bachelors degree,male,2020-10-31 09:20:06.303,2020-10-31 09:18:00.000,"$75,000-$99,999",5+,English,nagercoil,IN,tn,single,126.0,20-40 hours per week,$100-$200 per week,a8dc59ede670806d50921448f353daf8,1996
1,Bachelors degree,female,2020-10-31 09:05:52.654,2020-10-31 09:03:01.000,"$25,000-$39,999",2,English,st. louis,US,mo,cohabitating,171.0,2-4 hours per week,$1-$5 per week,7073f0b49d9081064fbbd4921a54df0b,1972
2,Associates degree,female,2020-10-31 08:59:14.727,2020-10-31 07:48:00.000,"Less than $10,000",1,Portuguese,porto alegre,BR,rs,single,4274.0,Less than 1 hour per week,Less than $1 per week,0ac1d80c558377ee41e854290e87b10e,1999
3,Bachelors degree,female,2020-10-31 08:55:05.752,2020-10-31 08:48:01.000,"$40,000-$59,999",3,English,chennai,IN,tn,single,424.0,4-8 hours per week,$20-$50 per week,8f8ac6136e51afe4cf0c28fc70268e57,1995
4,Bachelors degree,female,2020-10-31 08:38:41.705,2020-10-31 08:33:07.000,"$75,000-$99,999",4,English,pueblo,US,co,married,334.0,20-40 hours per week,$50-$100 per week,a1d3418861d39b491ff8defb76b416ed,1982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190159,,male,2015-03-26 12:35:10.796,2015-03-26 12:34:19.000,"$25,000-$39,999",1,,port richey,US,fl,single,51.0,,,5b044cf509da1d8444b6f60c465240ef,1964
190160,,female,2015-03-26 12:20:09.611,2015-03-26 12:19:10.000,"Less than $10,000",3,,chandigarh,IN,ch,single,59.0,,,2f8a2cd8573908a6b5898f13f9b734a6,1990
190161,,male,2015-03-26 12:05:22.182,2015-03-26 12:04:01.000,"$75,000-$99,999",4,,trumbull,US,ct,single,81.0,,,99ddbb8d2492a375929ec043718dabaa,1980
190162,,female,2015-03-26 11:50:08.004,2015-03-26 11:48:52.000,"$75,000-$99,999",2,,san antonio,US,tx,single,76.0,,,6274f9e49910440fc1d73c191901067f,1963


In [5]:
df_old['hit_answered_date'] = pd.to_datetime(df_old['hit_answered_date'], utc=True)
df_old['hit_creation_date'] = pd.to_datetime(df_old['hit_creation_date'], utc=True)

df_old.dtypes

educational_level                        object
gender                                   object
hit_answered_date           datetime64[ns, UTC]
hit_creation_date           datetime64[ns, UTC]
household_income                         object
household_size                           object
languages_spoken                         object
location_city                            object
location_country                         object
location_region                          object
marital_status                           object
post_to_completion_secs                 float64
time_spent_on_mturk                      object
weekly_income_from_mturk                 object
worker_id                                object
year_of_birth                             int64
dtype: object

In [6]:

df = pd.DataFrame(results)
df = pd.concat([df, df_old])

In [7]:
df.dtypes

worker_id                                object
gender                                   object
household_income                         object
educational_level                        object
household_size                           object
marital_status                           object
languages_spoken                         object
time_spent_on_mturk                      object
weekly_income_from_mturk                 object
year_of_birth                             int64
location_city                            object
location_region                          object
location_country                         object
hit_answered_date           datetime64[ns, UTC]
hit_creation_date           datetime64[ns, UTC]
post_to_completion_secs                 float64
dtype: object

In [8]:
df.shape

(284338, 16)

In [9]:
# Let's save the file as a CSV
df.to_csv("mturk_surveys_extended.csv")

In [10]:
# Let's print the total number of retrieved responses
print("Total number of responses:", df.shape[0])

Total number of responses: 284338


In [11]:
print("Unique workers:", len(set(df.worker_id.values)))

Unique workers: 122293


In [12]:

min_date = df.hit_answered_date.min()
max_date = df.hit_answered_date.max()
print("First date:", min_date)
print("Last date:", max_date)
print("Duration:", max_date - min_date)

First date: 2015-03-26 11:47:28.877000+00:00
Last date: 2023-05-29 14:58:05.595000+00:00
Duration: 2986 days 03:10:36.718000


In [13]:
df

Unnamed: 0,worker_id,gender,household_income,educational_level,household_size,marital_status,languages_spoken,time_spent_on_mturk,weekly_income_from_mturk,year_of_birth,location_city,location_region,location_country,hit_answered_date,hit_creation_date,post_to_completion_secs
0,d190976af10a9a62a4013eddbe44cb4c,female,"$40,000-$59,999",Bachelors degree,4,married,English,More than 40 hours per week,$100-$200 per week,1993,?,?,US,2023-05-29 14:58:05.595000+00:00,NaT,
1,e02fabf23946eb8cb0ce173fcc03f47e,female,"$60,000-$74,999",Bachelors degree,4,married,English,More than 40 hours per week,$100-$200 per week,1964,lafayette,la,US,2023-05-29 14:40:51.732000+00:00,NaT,
2,569d1015103c0bd8ff12ee5ed328d3dd,female,"$40,000-$59,999",Bachelors degree,3,married,English,8-20 hours per week,$100-$200 per week,1997,washington,dc,US,2023-05-29 14:07:43.717000+00:00,NaT,
3,797f504ed6db84b3f8498ed6ac8d8b02,female,"$40,000-$59,999","Graduate degree, Masters",3,married,English,8-20 hours per week,$10-$20 per week,1986,?,?,US,2023-05-29 13:45:41.587000+00:00,NaT,
4,51123533e618a45204f0e2a06d0f9f95,female,"$75,000-$99,999",Bachelors degree,4,married,English,8-20 hours per week,$50-$100 per week,1997,?,?,US,2023-05-29 13:37:51.787000+00:00,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190159,5b044cf509da1d8444b6f60c465240ef,male,"$25,000-$39,999",,1,single,,,,1964,port richey,fl,US,2015-03-26 12:35:10.796000+00:00,2015-03-26 12:34:19+00:00,51.0
190160,2f8a2cd8573908a6b5898f13f9b734a6,female,"Less than $10,000",,3,single,,,,1990,chandigarh,ch,IN,2015-03-26 12:20:09.611000+00:00,2015-03-26 12:19:10+00:00,59.0
190161,99ddbb8d2492a375929ec043718dabaa,male,"$75,000-$99,999",,4,single,,,,1980,trumbull,ct,US,2015-03-26 12:05:22.182000+00:00,2015-03-26 12:04:01+00:00,81.0
190162,6274f9e49910440fc1d73c191901067f,female,"$75,000-$99,999",,2,single,,,,1963,san antonio,tx,US,2015-03-26 11:50:08.004000+00:00,2015-03-26 11:48:52+00:00,76.0
