### Demographics API

Below we have the code that retrieves the data from the  Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
    
    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")
    
    if hit_creation_str is None: 
        hit_creation_date = None 
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # convert to unix timestamp
        hit_date_ts = time.mktime(hit_creation_date.timetuple())
        answer_date_ts = time.mktime(hit_answer_date.timetuple())
        diff = int(answer_date_ts-hit_date_ts)
    
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(item["answers"]["gender"]).lower(),
        "household_income": str(item["answers"]["householdIncome"]),
        "educational_level": str(item["answers"].get("educationalLevel")),
        "household_size": str(item["answers"]["householdSize"]),
        "marital_status": str(item["answers"].get("maritalStatus")),
        "languages_spoken": str(item["answers"].get("languagesSpoken")),
        "time_spent_on_mturk": str(item["answers"].get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(item["answers"].get("weeklyIncomeFromMturk")),
        "year_of_birth": int(item["answers"]["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [2]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 5000

# The API call that returns the last survey responses
baseurl = "https://mturk-surveys.appspot.com/" + \
    "_ah/api/survey/v1/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = None

# We store the results in this list
results = []

while True:
    if nextPageToken == None:
        url = baseurl
    else:
        url = baseurl + "&cursor=" + nextPageToken

    resp = requests.get(url)
    if resp.status_code == 200:
        data = json.loads(resp.text)
        items = data.get("items")
        if items == None:
            break
        print("Retrieved ", len(items), " responses")
        responses = [flatten(item) for item in items]
        results.extend(responses)
        print("Total of ", len(results), " responses in our data")
    else:
        print("Something went wrong with the network call")

    nextPageToken = data.get("nextPageToken")
    if nextPageToken == None:
        break


Retrieved  5000  responses
Total of  5000  responses in our data
Retrieved  5000  responses
Total of  10000  responses in our data
Retrieved  5000  responses
Total of  15000  responses in our data
Retrieved  5000  responses
Total of  20000  responses in our data
Retrieved  5000  responses
Total of  25000  responses in our data
Retrieved  5000  responses
Total of  30000  responses in our data
Retrieved  5000  responses
Total of  35000  responses in our data
Retrieved  5000  responses
Total of  40000  responses in our data
Retrieved  5000  responses
Total of  45000  responses in our data
Retrieved  5000  responses
Total of  50000  responses in our data
Retrieved  5000  responses
Total of  55000  responses in our data
Retrieved  5000  responses
Total of  60000  responses in our data
Retrieved  5000  responses
Total of  65000  responses in our data
Retrieved  5000  responses
Total of  70000  responses in our data
Retrieved  5000  responses
Total of  75000  responses in our data
Retrieved  

In [3]:
# Let's print the total number of retrieved responses
print(len(results))

172075


In [4]:
import pandas as pd
df = pd.DataFrame(results)
# Let's save the file as a CSV
df.to_csv("mturk_surveys_extended.csv")

In [5]:
df.languages_spoken.values

array(['English', 'English,French', 'English', ..., 'None', 'None',
       'None'], dtype=object)

In [6]:
lol = [entries.split(',') for entries in df.languages_spoken.values if entries!='None']
s = set()
for l in lol:
    for m in l:
        s.add(m)
s

{'',
 'Abkhazian',
 'Afar',
 'Afrikaans',
 'Albanian',
 'Amharic',
 'Arabic',
 'Armenian',
 'Assamese',
 'Azerbaijani',
 'Bashkir',
 'Basque',
 'Bengali',
 'Bihari',
 'Bulgarian',
 'Burmese',
 'Byelorussian',
 'Cambodian',
 'Catalan',
 'Chinese',
 'Croatian',
 'Czech',
 'Danish',
 'Dutch',
 'English',
 'Esperanto',
 'Estonian',
 'Fiji',
 'Finnish',
 'French',
 'Frisian',
 'Gaelic',
 'Galician',
 'Georgian',
 'German',
 'Greek',
 'Guarani',
 'Gujarati',
 'Hausa',
 'Hebrew',
 'Hindi',
 'Hungarian',
 'Icelandic',
 'Indonesian',
 'Interlingua',
 'Irish',
 'Italian',
 'Japanese',
 'Javanese',
 'Kannada',
 'Kashmiri',
 'Korean',
 'Kurdish',
 'Laothian',
 'Latin',
 'Latvian',
 'Lithuanian',
 'Macedonian',
 'Malagasy',
 'Malay',
 'Malayalam',
 'Maltese',
 'Marathi',
 'Mongolian',
 'Nepali',
 'Norwegian',
 'Oriya',
 'Pashto',
 'Persian',
 'Polish',
 'Portuguese',
 'Punjabi',
 'Romanian',
 'Russian',
 'Samoan',
 'Sanskrit',
 'Serbian',
 'Serbo-Croatian',
 'Shona',
 'Sindhi',
 'Singhalese',
 'Slo

In [7]:
df.people_with_language

AttributeError: 'DataFrame' object has no attribute 'people_with_language'

In [None]:
people_with_language = len([v for v in df.languages_spoken.values if v!='None' and v!=''])
people_with_language

In [None]:
df.location_country.value_counts()

In [None]:
df [  df.languages_spoken.str.contains(',') ].location_country.value_counts()

In [None]:
# bilingual and above
len(df [ df.languages_spoken.str.contains(',') ].worker_id.value_counts())

In [None]:
df['bilingual'] = df.languages_spoken.str.contains(',')

In [None]:
pv_bilingual = df.pivot_table(
    index = 'location_country',
    columns='bilingual',
    values = 'worker_id',
    aggfunc='count'
).fillna(0)

pv_bilingual['perc_bilingual'] = pv_bilingual[True] / (pv_bilingual[True] + pv_bilingual[False])
pv_bilingual.sort_values('perc_bilingual', ascending=False)

In [None]:
# unique ids of workers that answered the language question
len(df [ df.languages_spoken !='None' ].worker_id.value_counts())

In [None]:
df [  df.languages_spoken.str.contains('English') ].location_country.value_counts()

In [None]:
df [  df.languages_spoken.str.contains('Tamil') ].location_country.value_counts()

In [None]:
df [  df.languages_spoken.str.contains('Spanish') ].location_country.value_counts()

In [None]:
df [  df.languages_spoken.str.contains('Hindi') ].location_country.value_counts()

In [None]:
df [  df.languages_spoken.str.contains('French') ].location_country.value_counts()

In [None]:
df [  df.languages_spoken.str.contains('Malayalam') ].location_country.value_counts()

In [None]:
df [  df.languages_spoken.str.contains('Tegulu') ].location_country.value_counts()

In [None]:
df [  df.languages_spoken.str.contains('German') ].location_country.value_counts()

In [None]:
(df [  df.languages_spoken.str.contains('Chinese') ].location_country.value_counts() / df.location_country.value_counts()).sort_values(ascending=False)

In [None]:
df [  df.languages_spoken.str.contains('Macedonian') ].location_country.value_counts()

In [None]:
languages = set([l for v in df.languages_spoken.values if v!='None' 
                 for l in v.split(',') if l!=''])

In [None]:

result = []
for language in languages:
    people = len(set(df[ df.languages_spoken.str.contains(language) ].worker_id.values))
    result.append({"lang":language, "unique_workers": people})
    

df_cnt = pd.DataFrame(result).sort_values('unique_workers', ascending=False)    
print (df_cnt)    
# more than 10 people for the language

#df2 = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
#enough = df2 [df2.unique_workers > 9]
#len(enough)

In [None]:
df_cnt.unique_workers.value_counts().sort_index()

In [24]:
len([v for v in df.languages_spoken.values if 'Spanish' in v])

5044

In [25]:
1189 / 19268 * 170000

10490.450487855513

In [26]:
def intersect(a, b):
    """ return the intersection of two lists """
    return list(set(a) & set(b))

my_langauge = ['English', 'Tamil', 'Spanish', 'Hindi', 'Malayalam', 'French', 'Telugu', 'Chinese', 'German', 'Kannada', 'Italian', 'Portuguese', 'Marathi', 'Arabic', 'Russian', 'Japanese', 'Gujarati', 'Urdu', 'Bengali', 'Punjabi', 'Korean', 'Tagalog', 'Romanian', 'Vietnamese', 'Greek', 'Polish', 'Dutch', 'Turkish', 'Hebrew', 'Swedish', 'Serbian', 'Nepali', 'Bulgarian', 'Macedonian', 'Oriya']
pavlick_langauge = ['English', 'Tamil', 'Malayalam', 'Hindi', 'Spanish', 'Telugu', 'Chinese', 'Romanian', 'Portuguese', 'Arabic', 'Kannada', 'German', 'French', 'Polish', 'Urdu', 'Tagalog', 'Marathi', 'Russian', 'Italian', 'Bengali', 'Gujarati', 'Hebrew', 'Dutch', 'Turkish', 'Vietnamese', 'Macedonian', 'Cebuano', 'Swedish', 'Bulgarian', 'Swahili', 'Hungarian', 'Catalan', 'Thai', 'Lithuanian', 'Punjabi']

intersect_language = intersect(my_langauge, pavlick_langauge)
len(intersect_language)


29

In [27]:
from scipy.stats import spearmanr

In [28]:
spearmanr(my_langauge,pavlick_langauge)



SpearmanrResult(correlation=0.057703081232493, pvalue=0.7419647128381073)