<a href="https://colab.research.google.com/github/ipeirotis/mturk_demographics/blob/master/1_Fetch_Demographics_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Demographics API

The code below retrieves the data from the Mechanical Turk Tracker Demographics API.

In [None]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    """Transform the API response for a single survey into a flat dictionary.

    Parameters
    ----------
    item : dict
        One survey response. It must contain the keys ``date``, ``workerId``,
        ``locationCountry`` and ``answers``; ``answers`` must contain
        ``gender``, ``householdIncome``, ``householdSize`` and ``yearOfBirth``.
        The keys ``hitCreationDate``, ``locationCity``, ``locationRegion`` and
        ``answers["maritalStatus"]`` are optional.

    Returns
    -------
    dict
        A flat record with the keys ``worker_id``, ``gender`` (lower-cased),
        ``household_income``, ``household_size``, ``marital_status``,
        ``year_of_birth`` (int), ``location_city``, ``location_region``,
        ``location_country``, ``hit_answered_date`` (datetime),
        ``hit_creation_date`` (datetime or None) and
        ``post_to_completion_secs`` (int or None).

    Raises
    ------
    KeyError
        If a required key is missing.
    ValueError
        If a timestamp does not match the expected format or ``yearOfBirth``
        cannot be converted to ``int``.
    """
    fmt = "%Y-%m-%dT%H:%M:%S.%f%z"

    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")

    if hit_creation_str is None:
        hit_creation_date = None
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # Both datetimes are timezone-aware (the format requires %z), so
        # subtracting them yields the true elapsed time. Going through
        # time.mktime(timetuple()) would drop the UTC offsets and reinterpret
        # the wall-clock fields in the machine's local time zone, which gives
        # wrong results across differing offsets or local DST changes.
        diff = int((hit_answer_date - hit_creation_date).total_seconds())

    answers = item["answers"]

    # Note: optional fields read with .get() are wrapped in str(), so a
    # missing value is stored as the literal string "None".
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(answers["gender"]).lower(),
        "household_income": str(answers["householdIncome"]),
        "household_size": str(answers["householdSize"]),
        "marital_status": str(answers.get("maritalStatus")),
        "year_of_birth": int(answers["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [None]:
# The code below retrieves all the responses from the Demographics API.
# Since we cannot get all the responses at once, we fetch up to `limit`
# records per request and follow the cursor until no further page is offered.

import requests
import json

limit = 1000

# The API call that returns the last survey responses
baseurl = "https://demographics.mturk-tracker.com" + \
    "/api/survey/demographics/answers?limit=" + str(limit)

print(baseurl)
# This is the cursor variable, used to retrieve more pages of results
nextPageToken = ""

# We store the results in this list
results = []

while True:
    url = baseurl + "&cursor=" + nextPageToken
    resp = requests.get(url)
    # print(resp.text)
    if resp.status_code != 200:
        # Stop here: there is no fresh payload to read the next cursor from.
        # Falling through would raise NameError on the first page, or reuse
        # the previous page's `data` and request the same cursor forever.
        # Whatever was fetched so far stays in `results`.
        print("Something went wrong with the network call")
        break

    data = json.loads(resp.text)
    items = data.get("items")
    if items is None:
        break
    print("Retrieved ", len(items), " responses")
    responses = [flatten(item) for item in items]
    results.extend(responses)
    print("Total of ", len(results), " responses in our data")

    # A missing token means there are no more pages to fetch
    nextPageToken = data.get("nextPageToken")
    if nextPageToken is None:
        break



In [None]:
# Fetch the old data as well (Mar 2015 - Oct 2020)
# pandas is imported in this cell because it is the first one to use `pd`;
# without the import, running the notebook top to bottom on a fresh kernel
# fails here with a NameError.
import pandas as pd

URL = 'https://github.com/ipeirotis/mturk_demographics/raw/master/mturk_surveys_mar15_oct20.zip'
df_old = pd.read_csv(URL)
# Drop the 'Unnamed: 0' column (presumably the row index written when the
# CSV was saved -- confirm against the archive).
df_old = df_old.drop('Unnamed: 0', axis='columns')
df_old

In [None]:
# Parse both timestamp columns of the old data as UTC datetimes.
for date_col in ("hit_answered_date", "hit_creation_date"):
    df_old[date_col] = pd.to_datetime(df_old[date_col], utc=True)

# Show the resulting column dtypes.
df_old.dtypes

In [None]:
import pandas as pd
# Build a DataFrame from the flattened survey responses collected in `results`
# (a list of dicts; each dict becomes one row).
df = pd.DataFrame(results)

In [None]:
# Inspect the dtype pandas inferred for each column of `df`.
df.dtypes

In [None]:
# Rebuild the frame from `results` first, so that re-running this cell does
# not append the old data a second time.
df = pd.DataFrame(results)
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs keep their own row labels, so the result (and the index column
# written to the CSV below) contains duplicated labels.
df = pd.concat([df, df_old], ignore_index=True)

# Let's save the file as a CSV
df.to_csv("mturk_surveys.csv")

In [None]:
# Report how many survey responses (rows) the DataFrame holds
print("Total number of responses:", len(df))

In [None]:
# Count the distinct worker ids appearing in the data
worker_ids = df["worker_id"].values
print("Unique workers:", len(set(worker_ids)))

In [None]:
import numpy as np

# Earliest and latest answer timestamps, and the span between them
answered_dates = df["hit_answered_date"]
min_date, max_date = answered_dates.min(), answered_dates.max()
print("First date:", min_date)
print("Last date:", max_date)
print("Duration:", max_date - min_date)

In [None]:
# Display the DataFrame as the cell output.
df