### Demographics API

Below we have the code that retrieves the data from the Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    """Flatten the raw response for a single survey into a flat dictionary.

    Parameters
    ----------
    item : dict
        One survey response. It must contain "date", "workerId",
        "locationCountry" and an "answers" dict holding "gender",
        "householdIncome", "householdSize" and "yearOfBirth". All other
        fields read below are optional.

    Returns
    -------
    dict
        Flat record with snake_case keys. Optional fields that are missing
        are stored as the string "None" (the result of str(None)).
        "hit_creation_date" and "post_to_completion_secs" are None when the
        response carries no "hitCreationDate".

    Raises
    ------
    KeyError
        If one of the required fields is missing.
    ValueError
        If a date does not match the expected format, or "yearOfBirth"
        cannot be converted to an int.
    """
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ"

    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")

    if hit_creation_str is None:
        hit_creation_date = None
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # Subtract the two datetimes directly. Converting them with
        # time.mktime() would reinterpret the parsed values as *local* time,
        # which makes the difference wrong by an hour whenever the two
        # timestamps fall on different sides of a local DST change.
        # The sub-second part is dropped first so that the result is the
        # difference between whole seconds, as it was before.
        elapsed = (hit_answer_date.replace(microsecond=0)
                   - hit_creation_date.replace(microsecond=0))
        diff = int(elapsed.total_seconds())

    answers = item["answers"]
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(answers["gender"]).lower(),
        "household_income": str(answers["householdIncome"]),
        "educational_level": str(answers.get("educationalLevel")),
        "household_size": str(answers["householdSize"]),
        "marital_status": str(answers.get("maritalStatus")),
        "languages_spoken": str(answers.get("languagesSpoken")),
        "time_spent_on_mturk": str(answers.get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(answers.get("weeklyIncomeFromMturk")),
        "year_of_birth": int(answers["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [2]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 5000

# Failure handling for the paging loop below
max_attempts = 5           # tries per page before giving up
retry_wait_secs = 2        # pause between tries, multiplied by the attempt number
request_timeout_secs = 60  # do not let a stalled connection hang the notebook

# The API call that returns the last survey responses
baseurl = "https://mturk-surveys.appspot.com/" + \
    "_ah/api/survey/v1/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = None

# We store the results in this list
results = []

while True:
    if nextPageToken is None:
        url = baseurl
    else:
        url = baseurl + "&cursor=" + nextPageToken

    # Fetch the current page, retrying a bounded number of times.
    # `data` is reset for every page so that a failed request can never be
    # mistaken for a successful one (previously a failure either raised a
    # NameError on the first page or silently reused the previous page's
    # payload and retried forever without pausing).
    data = None
    for attempt in range(1, max_attempts + 1):
        try:
            resp = requests.get(url, timeout=request_timeout_secs)
        except requests.exceptions.RequestException as exc:
            print("Something went wrong with the network call", "(" + str(exc) + ")")
        else:
            if resp.status_code == 200:
                data = resp.json()
                break
            print("Something went wrong with the network call",
                  "(HTTP " + str(resp.status_code) + ")")
        if attempt < max_attempts:
            time.sleep(retry_wait_secs * attempt)

    if data is None:
        # Everything fetched so far is still available in `results`.
        raise RuntimeError(
            "Giving up after " + str(max_attempts) + " failed attempts; "
            + str(len(results)) + " responses were retrieved before the failure"
        )

    items = data.get("items")
    if items is None:
        break
    print("Retrieved ", len(items), " responses")
    responses = [flatten(item) for item in items]
    results.extend(responses)
    print("Total of ", len(results), " responses in our data")

    # No cursor in the payload means this was the last page
    nextPageToken = data.get("nextPageToken")
    if nextPageToken is None:
        break


Retrieved  5000  responses
Total of  5000  responses in our data
Retrieved  5000  responses
Total of  10000  responses in our data
Retrieved  5000  responses
Total of  15000  responses in our data
Retrieved  5000  responses
Total of  20000  responses in our data
Retrieved  5000  responses
Total of  25000  responses in our data
Retrieved  5000  responses
Total of  30000  responses in our data
Retrieved  5000  responses
Total of  35000  responses in our data
Retrieved  5000  responses
Total of  40000  responses in our data
Retrieved  5000  responses
Total of  45000  responses in our data
Retrieved  5000  responses
Total of  50000  responses in our data
Retrieved  5000  responses
Total of  55000  responses in our data
Retrieved  5000  responses
Total of  60000  responses in our data
Retrieved  5000  responses
Total of  65000  responses in our data
Retrieved  5000  responses
Total of  70000  responses in our data
Retrieved  5000  responses
Total of  75000  responses in our data
Something w

In [3]:
# Let's print the total number of retrieved responses
# (`results` is the list that the download loop extends page by page)
print(len(results))

188039


In [4]:
import pandas as pd
# One row per flattened response; keep a copy on disk as a CSV file
df = pd.DataFrame(data=results)
df.to_csv("mturk_surveys_extended.csv")

In [5]:
# Raw contents of the languages column (comma-separated names, or the string 'None')
df["languages_spoken"].values

array(['English', 'English,Spanish', 'English', ..., 'None', 'None',
       'None'], dtype=object)

In [6]:
# Split every answered response on ',' and collect the distinct tokens
lol = [entry.split(',') for entry in df["languages_spoken"].values if entry != 'None']
s = {token for tokens in lol for token in tokens}
s

{'',
 'Abkhazian',
 'Afar',
 'Afrikaans',
 'Albanian',
 'Amharic',
 'Arabic',
 'Armenian',
 'Assamese',
 'Azerbaijani',
 'Bashkir',
 'Basque',
 'Bengali',
 'Bihari',
 'Bulgarian',
 'Burmese',
 'Byelorussian',
 'Cambodian',
 'Catalan',
 'Chinese',
 'Croatian',
 'Czech',
 'Danish',
 'Dutch',
 'English',
 'Esperanto',
 'Estonian',
 'Fiji',
 'Finnish',
 'French',
 'Frisian',
 'Gaelic',
 'Galician',
 'Georgian',
 'German',
 'Greek',
 'Guarani',
 'Gujarati',
 'Hausa',
 'Hebrew',
 'Hindi',
 'Hungarian',
 'Icelandic',
 'Indonesian',
 'Interlingua',
 'Irish',
 'Italian',
 'Japanese',
 'Javanese',
 'Kannada',
 'Kashmiri',
 'Kazakh',
 'Korean',
 'Kurdish',
 'Laothian',
 'Latin',
 'Latvian',
 'Lithuanian',
 'Macedonian',
 'Malagasy',
 'Malay',
 'Malayalam',
 'Maltese',
 'Marathi',
 'Mongolian',
 'Nauru',
 'Nepali',
 'Norwegian',
 'Oriya',
 'Pashto',
 'Persian',
 'Polish',
 'Portuguese',
 'Punjabi',
 'Quechua',
 'Romanian',
 'Russian',
 'Samoan',
 'Sanskrit',
 'Serbian',
 'Serbo-Croatian',
 'Setswa

In [8]:
# Number of responses whose language field is neither the 'None' placeholder nor empty
people_with_language = sum(
    1 for v in df["languages_spoken"].values if v not in ('None', '')
)
people_with_language

106972

In [9]:
# Responses per country code
df["location_country"].value_counts()

US    136310
IN     34084
CA      2724
GB      1991
BR      1498
IT      1054
DE       716
PH       554
FR       524
ES       504
VE       493
ZZ       380
MX       328
AU       291
NL       251
AE       235
KE       220
PK       201
RO       196
JP       179
TR       176
NG       174
RU       161
ID       160
PT       159
TH       159
IE       156
GR       153
MK       140
UA       133
       ...  
ME         2
AM         2
SR         2
DJ         2
LI         2
PY         2
HT         2
MV         2
SX         2
LA         2
MO         2
AZ         2
RW         2
TG         1
CN         1
GM         1
BM         1
FJ         1
BU         1
AI         1
IM         1
NE         1
TZ         1
CD         1
AG         1
PS         1
PF         1
UZ         1
GN         1
FM         1
Name: location_country, Length: 157, dtype: int64

In [10]:
# Per-country count of responses that list more than one language
# (a ',' in the field means at least two entries)
df.loc[df["languages_spoken"].str.contains(','), "location_country"].value_counts()

IN    8844
US    8341
BR    1003
IT     654
CA     644
DE     378
ES     315
FR     264
GB     258
VE     165
MX     161
NL      99
RO      77
PH      71
PT      64
KE      54
CO      52
RU      52
GR      48
AU      47
AE      45
JP      44
EG      43
IE      40
MK      40
BD      39
BG      36
BE      36
AR      35
ID      34
      ... 
ZA       5
LK       5
SV       5
NZ       5
CY       4
MT       4
CH       4
SK       4
JO       3
IQ       3
LV       3
MD       3
NI       3
NA       2
AL       2
MN       2
OM       2
GE       2
CW       2
JM       2
UG       1
BY       1
MU       1
SR       1
MV       1
LA       1
FM       1
UY       1
RW       1
ME       1
Name: location_country, Length: 108, dtype: int64

In [11]:
# bilingual and above: distinct workers with more than one language listed
df.loc[df["languages_spoken"].str.contains(','), "worker_id"].nunique()

11471

In [12]:
# Flag responses that list more than one language (comma-separated entries)
df['bilingual'] = df["languages_spoken"].str.contains(',')

In [13]:
# Responses per country, split by the `bilingual` flag; a missing
# country/flag combination is shown as 0 instead of NaN.
pv_bilingual = pd.pivot_table(
    df,
    index='location_country',
    columns='bilingual',
    values='worker_id',
    aggfunc='count',
).fillna(0)

# Share of each country's responses that are flagged bilingual, highest first.
# NOTE(review): responses whose languages_spoken is 'None' contain no ',' and
# therefore land in the False column, which lowers this share -- confirm that
# this is intended.
pv_bilingual['perc_bilingual'] = pv_bilingual[True].div(
    pv_bilingual[True].add(pv_bilingual[False])
)
pv_bilingual.sort_values(by='perc_bilingual', ascending=False)

bilingual,False,True,perc_bilingual
location_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FM,0.0,1.0,1.000000
LB,5.0,18.0,0.782609
BR,495.0,1003.0,0.669559
BH,14.0,24.0,0.631579
ES,189.0,315.0,0.625000
IT,400.0,654.0,0.620493
DZ,17.0,24.0,0.585366
EG,34.0,43.0,0.558442
MA,19.0,23.0,0.547619
TN,8.0,9.0,0.529412


In [14]:
# Distinct worker ids among responses whose language field is not 'None'
df.loc[df["languages_spoken"] != 'None', "worker_id"].nunique()

51463

In [15]:
# Per-country count of responses that mention 'English'
df.loc[df["languages_spoken"].str.contains('English'), "location_country"].value_counts()

US    75045
IN    18652
CA     1786
GB     1397
BR     1117
IT      708
DE      457
ES      346
FR      276
PH      260
VE      241
ZZ      213
MX      190
NL      174
AU      172
KE      144
PK      141
AE      107
IE       95
NG       95
JP       92
RO       92
UA       85
TH       82
PT       77
ID       76
CO       65
RU       62
BD       62
TR       60
      ...  
GD        2
ET        2
KG        2
RW        2
CM        2
LU        2
JE        2
AL        2
ME        2
BS        2
TJ        2
GY        2
SX        2
UY        2
ZM        2
MO        1
SR        1
AM        1
KN        1
MU        1
BN        1
TG        1
LA        1
HT        1
UZ        1
MV        1
FM        1
DJ        1
GN        1
NE        1
Name: location_country, Length: 139, dtype: int64

In [None]:
# Per-country count of responses that mention 'Tamil'
df.loc[df["languages_spoken"].str.contains('Tamil'), "location_country"].value_counts()

In [None]:
# Per-country count of responses that mention 'Spanish'
df.loc[df["languages_spoken"].str.contains('Spanish'), "location_country"].value_counts()

In [None]:
# Per-country count of responses that mention 'Hindi'
df.loc[df["languages_spoken"].str.contains('Hindi'), "location_country"].value_counts()

In [None]:
# Per-country count of responses that mention 'French'
df.loc[df["languages_spoken"].str.contains('French'), "location_country"].value_counts()

In [None]:
# Per-country count of responses that mention 'Malayalam'
df.loc[df["languages_spoken"].str.contains('Malayalam'), "location_country"].value_counts()

In [None]:
# Per-country count of responses that mention 'Telugu'.
# The search term used to be misspelled as 'Tegulu', which matches nothing.
df.loc[df["languages_spoken"].str.contains('Telugu'), "location_country"].value_counts()

In [None]:
# Per-country count of responses that mention 'German'
df.loc[df["languages_spoken"].str.contains('German'), "location_country"].value_counts()

In [None]:
# Per country: responses mentioning 'Chinese' divided by all responses
# from that country, largest share first
(
    df.loc[df["languages_spoken"].str.contains('Chinese'), "location_country"]
    .value_counts()
    .div(df["location_country"].value_counts())
    .sort_values(ascending=False)
)

In [None]:
# Per-country count of responses that mention 'Macedonian'
df.loc[df["languages_spoken"].str.contains('Macedonian'), "location_country"].value_counts()

In [None]:
# Distinct, non-empty language names found across all answered responses
languages = {
    name
    for answer in df["languages_spoken"].values
    if answer != 'None'
    for name in answer.split(',')
    if name != ''
}

In [None]:

# Count the distinct workers that reported each language.
# Every response is split on ',' and compared against whole language names.
# A substring search (str.contains) would also count, for example, every
# 'Malayalam' speaker under 'Malay'. A single pass over the responses replaces
# one full scan of the frame per language.
workers_by_language = {language: set() for language in languages}
for worker_id, spoken in zip(df.worker_id.values, df.languages_spoken.values):
    if spoken == 'None':
        # placeholder for "question not answered"
        continue
    for language in spoken.split(','):
        if language in workers_by_language:
            workers_by_language[language].add(worker_id)

result = [
    {"lang": language, "unique_workers": len(workers)}
    for language, workers in workers_by_language.items()
]

df_cnt = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
print (df_cnt)
# more than 10 people for the language

#df2 = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
#enough = df2 [df2.unique_workers > 9]
#len(enough)

In [None]:
# How many languages share each unique-worker count, ordered by that count
df_cnt["unique_workers"].value_counts().sort_index()

In [None]:
# Number of responses whose language field contains 'Spanish'
sum('Spanish' in v for v in df["languages_spoken"].values)

In [None]:
# NOTE(review): the notebook does not say where 1189, 19268 and 170000 come
# from -- presumably a sample ratio scaled up to a population size. Name these
# constants once their meaning is confirmed.
1189 / 19268 * 170000

In [None]:
def intersect(a, b):
    """Return the elements common to both lists, as a list.

    Duplicates are collapsed and the order of the result is not defined,
    because the computation goes through sets.
    """
    left, right = set(a), set(b)
    return list(left & right)

# Two lists of language names to compare.
# NOTE(review): presumably `my_langauge` comes from this notebook's counts and
# `pavlick_langauge` from an external study -- confirm and record the source.
my_langauge = [
    'English', 'Tamil', 'Spanish', 'Hindi', 'Malayalam',
    'French', 'Telugu', 'Chinese', 'German', 'Kannada',
    'Italian', 'Portuguese', 'Marathi', 'Arabic', 'Russian',
    'Japanese', 'Gujarati', 'Urdu', 'Bengali', 'Punjabi',
    'Korean', 'Tagalog', 'Romanian', 'Vietnamese', 'Greek',
    'Polish', 'Dutch', 'Turkish', 'Hebrew', 'Swedish',
    'Serbian', 'Nepali', 'Bulgarian', 'Macedonian', 'Oriya',
]
pavlick_langauge = [
    'English', 'Tamil', 'Malayalam', 'Hindi', 'Spanish',
    'Telugu', 'Chinese', 'Romanian', 'Portuguese', 'Arabic',
    'Kannada', 'German', 'French', 'Polish', 'Urdu',
    'Tagalog', 'Marathi', 'Russian', 'Italian', 'Bengali',
    'Gujarati', 'Hebrew', 'Dutch', 'Turkish', 'Vietnamese',
    'Macedonian', 'Cebuano', 'Swedish', 'Bulgarian', 'Swahili',
    'Hungarian', 'Catalan', 'Thai', 'Lithuanian', 'Punjabi',
]

# Languages that appear in both lists, and how many there are
intersect_language = intersect(my_langauge, pavlick_langauge)
len(intersect_language)


In [None]:
from scipy.stats import spearmanr

In [None]:
# Spearman's rho compares two rankings of the *same* items. Passing the two
# lists of names directly would correlate the alphabetical ranks of whatever
# names happen to share a position, which says nothing about how similarly the
# lists order the languages. Instead, correlate the position of each shared
# language in the two lists.
# NOTE(review): assumes both lists are ordered by rank -- confirm.
shared_languages = [lang for lang in my_langauge if lang in pavlick_langauge]
my_ranks = [my_langauge.index(lang) for lang in shared_languages]
pavlick_ranks = [pavlick_langauge.index(lang) for lang in shared_languages]
spearmanr(my_ranks, pavlick_ranks)