### Demographics API

Below we have the code that retrieves the data from the  Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
    
    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")
    
    if hit_creation_str is None: 
        hit_creation_date = None 
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # convert to unix timestamp
        hit_date_ts = time.mktime(hit_creation_date.timetuple())
        answer_date_ts = time.mktime(hit_answer_date.timetuple())
        diff = int(answer_date_ts-hit_date_ts)
    
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(item["answers"]["gender"]).lower(),
        "household_income": str(item["answers"]["householdIncome"]),
        "educational_level": str(item["answers"].get("educationalLevel")),
        "household_size": str(item["answers"]["householdSize"]),
        "marital_status": str(item["answers"].get("maritalStatus")),
        "languages_spoken": str(item["answers"].get("languagesSpoken")),
        "time_spent_on_mturk": str(item["answers"].get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(item["answers"].get("weeklyIncomeFromMturk")),
        "year_of_birth": int(item["answers"]["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [2]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 5000

# The API call that returns the last survey responses
baseurl = "https://mturk-surveys.appspot.com/" + \
    "_ah/api/survey/v1/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = None

# We store the results in this list
results = []

while True:
    if nextPageToken == None:
        url = baseurl
    else:
        url = baseurl + "&cursor=" + nextPageToken

    resp = requests.get(url)
    if resp.status_code == 200:
        data = json.loads(resp.text)
        items = data.get("items")
        if items == None:
            break
        print("Retrieved ", len(items), " responses")
        responses = [flatten(item) for item in items]
        results.extend(responses)
        print("Total of ", len(results), " responses in our data")
    else:
        print("Something went wrong with the network call")

    nextPageToken = data.get("nextPageToken")
    if nextPageToken == None:
        break


Retrieved  5000  responses
Total of  5000  responses in our data
Retrieved  5000  responses
Total of  10000  responses in our data
Retrieved  5000  responses
Total of  15000  responses in our data
Retrieved  5000  responses
Total of  20000  responses in our data
Retrieved  5000  responses
Total of  25000  responses in our data
Retrieved  5000  responses
Total of  30000  responses in our data
Retrieved  5000  responses
Total of  35000  responses in our data
Retrieved  5000  responses
Total of  40000  responses in our data
Retrieved  5000  responses
Total of  45000  responses in our data
Retrieved  5000  responses
Total of  50000  responses in our data
Retrieved  5000  responses
Total of  55000  responses in our data
Retrieved  5000  responses
Total of  60000  responses in our data
Retrieved  5000  responses
Total of  65000  responses in our data
Retrieved  5000  responses
Total of  70000  responses in our data
Retrieved  5000  responses
Total of  75000  responses in our data
Retrieved  

In [3]:
# Let's print the total number of retrieved responses
print(len(results))

190164


In [4]:
import pandas as pd
df = pd.DataFrame(results)
# Let's save the file as a CSV
df.to_csv("mturk_surveys_extended.csv")

In [5]:
df.languages_spoken.values

array(['English', 'English', 'Portuguese', ..., 'None', 'None', 'None'],
      dtype=object)

In [6]:
lol = [entries.split(',') for entries in df.languages_spoken.values if entries!='None']
s = set()
for l in lol:
    for m in l:
        s.add(m)
s

{'',
 'Abkhazian',
 'Afar',
 'Afrikaans',
 'Albanian',
 'Amharic',
 'Arabic',
 'Armenian',
 'Assamese',
 'Aymara',
 'Azerbaijani',
 'Bashkir',
 'Basque',
 'Bengali',
 'Bihari',
 'Bulgarian',
 'Burmese',
 'Byelorussian',
 'Cambodian',
 'Catalan',
 'Chinese',
 'Croatian',
 'Czech',
 'Danish',
 'Dutch',
 'English',
 'Esperanto',
 'Estonian',
 'Fiji',
 'Finnish',
 'French',
 'Frisian',
 'Gaelic',
 'Galician',
 'Georgian',
 'German',
 'Greek',
 'Guarani',
 'Gujarati',
 'Hausa',
 'Hebrew',
 'Hindi',
 'Hungarian',
 'Icelandic',
 'Indonesian',
 'Interlingua',
 'Irish',
 'Italian',
 'Japanese',
 'Javanese',
 'Kannada',
 'Kashmiri',
 'Kazakh',
 'Korean',
 'Kurdish',
 'Laothian',
 'Latin',
 'Latvian',
 'Lithuanian',
 'Macedonian',
 'Malagasy',
 'Malay',
 'Malayalam',
 'Maltese',
 'Marathi',
 'Mongolian',
 'Nauru',
 'Nepali',
 'Norwegian',
 'Oriya',
 'Pashto',
 'Persian',
 'Polish',
 'Portuguese',
 'Punjabi',
 'Quechua',
 'Romanian',
 'Russian',
 'Samoan',
 'Sanskrit',
 'Serbian',
 'Serbo-Croatian

In [7]:
people_with_language = len([v for v in df.languages_spoken.values if v!='None' and v!=''])
people_with_language

109097

In [8]:
df.location_country.value_counts()

US    137798
IN     34394
CA      2771
GB      2021
BR      1592
IT      1093
DE       733
PH       557
FR       534
ES       522
VE       495
ZZ       393
MX       329
AU       292
NL       259
AE       240
KE       221
PK       205
RO       196
JP       179
TR       176
NG       174
PT       162
ID       162
RU       161
IE       159
TH       159
GR       155
MK       140
UA       136
       ...  
AM         2
SX         2
AZ         2
DJ         2
RW         2
HT         2
LA         2
PY         2
MO         2
LI         2
ME         2
MV         2
SR         2
FM         1
AG         1
FJ         1
PS         1
NE         1
BU         1
IM         1
GM         1
UZ         1
BM         1
TG         1
AI         1
CD         1
GN         1
TZ         1
CN         1
PF         1
Name: location_country, Length: 157, dtype: int64

In [9]:
df [  df.languages_spoken.str.contains(',') ].location_country.value_counts()

IN    8980
US    8456
BR    1077
IT     683
CA     665
DE     388
ES     330
FR     271
GB     266
VE     166
MX     162
NL     101
RO      77
PH      72
PT      65
KE      55
RU      52
CO      52
GR      50
AU      47
AE      46
EG      44
JP      44
IE      41
MK      40
BD      39
AR      36
BE      36
BG      36
ID      35
      ... 
NO       5
BO       5
SV       5
ZA       5
NZ       5
CH       4
CY       4
MT       4
LV       3
MD       3
IQ       3
JO       3
NI       3
MN       2
OM       2
AL       2
CW       2
GE       2
NA       2
JM       2
ME       1
MV       1
RW       1
LA       1
SR       1
UG       1
MU       1
FM       1
UY       1
BY       1
Name: location_country, Length: 108, dtype: int64

In [10]:
# bilingual and above
len(df [ df.languages_spoken.str.contains(',') ].worker_id.value_counts())

11646

In [11]:
df['bilingual'] = df.languages_spoken.str.contains(',')

In [12]:
pv_bilingual = df.pivot_table(
    index = 'location_country',
    columns='bilingual',
    values = 'worker_id',
    aggfunc='count'
).fillna(0)

pv_bilingual['perc_bilingual'] = pv_bilingual[True] / (pv_bilingual[True] + pv_bilingual[False])
pv_bilingual.sort_values('perc_bilingual', ascending=False)

bilingual,False,True,perc_bilingual
location_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FM,0.0,1.0,1.000000
LB,5.0,18.0,0.782609
BR,515.0,1077.0,0.676508
BH,14.0,25.0,0.641026
ES,192.0,330.0,0.632184
IT,410.0,683.0,0.624886
DZ,17.0,24.0,0.585366
EG,34.0,44.0,0.564103
MA,19.0,23.0,0.547619
TN,8.0,9.0,0.529412


In [13]:
# unique ids of workers that answered the language question
len(df [ df.languages_spoken !='None' ].worker_id.value_counts())

52475

In [14]:
df [  df.languages_spoken.str.contains('English') ].location_country.value_counts()

US    76520
IN    18948
CA     1832
GB     1427
BR     1193
IT      738
DE      473
ES      363
FR      284
PH      263
VE      242
ZZ      226
MX      191
NL      181
AU      173
KE      145
PK      145
AE      112
IE       97
NG       95
RO       92
JP       92
UA       88
TH       82
PT       79
ID       78
CO       65
BD       62
RU       62
TR       60
      ...  
ME        2
TJ        2
RW        2
BS        2
ET        2
ZM        2
UY        2
GY        2
JE        2
LU        2
CM        2
KG        2
GD        2
SX        2
AL        2
KN        1
BN        1
TG        1
AM        1
MU        1
SR        1
DJ        1
MV        1
NE        1
HT        1
UZ        1
LA        1
GN        1
FM        1
MO        1
Name: location_country, Length: 139, dtype: int64

In [15]:
df [  df.languages_spoken.str.contains('Tamil') ].location_country.value_counts()

IN    7765
US     179
CA      24
AE      21
GB      20
ZZ      12
SA       8
AU       4
QA       4
NO       4
FR       4
DE       4
IT       3
BH       3
KW       3
MY       2
PL       2
SG       2
LK       2
PK       2
UA       2
NG       1
JP       1
CZ       1
CO       1
MV       1
SE       1
HK       1
IQ       1
OM       1
NL       1
RU       1
CY       1
Name: location_country, dtype: int64

In [16]:
df [  df.languages_spoken.str.contains('Spanish') ].location_country.value_counts()

US    4507
BR     507
ES     356
IN     226
VE     197
MX     163
IT     163
CA      72
GB      64
CO      53
FR      51
AR      35
DE      34
PT      32
PR      29
DO      25
NL      22
CL      21
PE      19
BE      17
EC      17
JP      13
ZZ      12
AU      12
GR      11
SV       8
CR       8
KE       8
IE       7
PA       7
      ... 
NG       3
NI       3
UA       3
MK       2
JO       2
CW       2
SA       2
PL       2
SE       2
EE       2
CH       2
JM       2
PK       2
LB       2
UY       2
BG       2
RU       2
AL       1
MY       1
BY       1
MD       1
SG       1
BA       1
RS       1
FI       1
AT       1
CZ       1
BD       1
BH       1
NP       1
Name: location_country, Length: 77, dtype: int64

In [17]:
df [  df.languages_spoken.str.contains('Hindi') ].location_country.value_counts()

IN    3588
US     291
CA      64
AE      32
BD      26
GB      25
NP      13
SA      12
DE      10
BR      10
IT      10
PH       9
SE       9
PK       5
ZZ       5
KW       4
BH       4
UA       4
FR       4
AR       3
NZ       3
NO       3
AU       3
SG       3
QA       3
ES       2
OM       2
JP       2
HK       2
KR       2
DK       1
IQ       1
CY       1
ID       1
NL       1
GH       1
MY       1
PL       1
RU       1
Name: location_country, dtype: int64

In [18]:
df [  df.languages_spoken.str.contains('French') ].location_country.value_counts()

US    1138
CA     404
FR     298
IN     230
IT     123
GB      85
DE      67
BR      59
ES      50
BE      33
EG      28
DZ      23
MA      23
KE      23
NG      22
NL      21
BH      20
IE      14
PT      11
TN       9
RO       8
PH       8
MX       6
VE       6
TR       5
PA       5
JP       4
CO       4
IL       4
GH       4
      ... 
AR       2
SA       2
CH       2
BG       2
ZZ       2
PK       2
BD       2
QA       2
AU       2
PL       1
MT       1
AT       1
CR       1
RW       1
MK       1
UG       1
DO       1
HK       1
JO       1
MD       1
UA       1
FM       1
IQ       1
MU       1
CZ       1
ID       1
RU       1
AL       1
BA       1
NI       1
Name: location_country, Length: 69, dtype: int64

In [19]:
df [  df.languages_spoken.str.contains('Malayalam') ].location_country.value_counts()

IN    1848
US      35
AE      30
SA      11
CA       9
GB       5
BH       4
IT       4
KW       4
QA       4
OM       2
DE       2
ZZ       2
CY       1
MV       1
IQ       1
SG       1
CO       1
GT       1
BN       1
Name: location_country, dtype: int64

In [20]:
df [  df.languages_spoken.str.contains('Tegulu') ].location_country.value_counts()

IN    850
US     38
GB      9
CA      5
SE      3
ZZ      3
FR      2
PL      2
IT      1
JP      1
ID      1
AU      1
NL      1
BH      1
Name: location_country, dtype: int64

In [21]:
df [  df.languages_spoken.str.contains('German') ].location_country.value_counts()

US    515
DE    397
IN     56
IT     40
GB     38
NL     24
ES     22
FR     22
BE     21
BR     17
IE     17
CA     16
LB     13
FI     10
AT      9
AU      6
CO      6
MX      6
TR      5
DK      5
HR      4
GR      4
MK      4
UA      4
VE      3
TH      3
RO      2
RS      2
SI      2
AR      2
BA      2
PL      1
EG      1
ZZ      1
ID      1
GT      1
KE      1
SE      1
BG      1
PT      1
CY      1
AL      1
VN      1
PH      1
NG      1
MA      1
BO      1
SV      1
TN      1
HU      1
LT      1
JP      1
NO      1
ZA      1
CH      1
Name: location_country, dtype: int64

In [22]:
(df [  df.languages_spoken.str.contains('Chinese') ].location_country.value_counts() / df.location_country.value_counts()).sort_values(ascending=False)

MN    0.250000
HK    0.235294
MY    0.205882
DK    0.085714
TW    0.081081
KR    0.052632
SG    0.045045
CA    0.043667
LV    0.035714
GH    0.031250
AU    0.020548
BE    0.019231
JP    0.016760
NL    0.015444
PK    0.014634
ZA    0.014493
IE    0.012579
ID    0.012346
AR    0.011765
IT    0.009149
MX    0.009119
NZ    0.008197
PH    0.007181
GB    0.006927
DE    0.006821
US    0.006582
TH    0.006289
ES    0.005747
FR    0.005618
ZZ    0.005089
        ...   
PY         NaN
QA         NaN
RO         NaN
RS         NaN
RU         NaN
RW         NaN
SA         NaN
SE         NaN
SI         NaN
SK         NaN
SR         NaN
SV         NaN
SX         NaN
TG         NaN
TJ         NaN
TM         NaN
TN         NaN
TR         NaN
TT         NaN
TZ         NaN
UA         NaN
UG         NaN
UY         NaN
UZ         NaN
VC         NaN
VE         NaN
VI         NaN
VN         NaN
ZM         NaN
ZW         NaN
Name: location_country, Length: 157, dtype: float64

In [23]:
df [  df.languages_spoken.str.contains('Macedonian') ].location_country.value_counts()

MK    44
RS     4
US     3
BA     2
BG     1
IT     1
Name: location_country, dtype: int64

In [24]:
languages = set([l for v in df.languages_spoken.values if v!='None' 
                 for l in v.split(',') if l!=''])

In [25]:

result = []
for language in languages:
    people = len(set(df[ df.languages_spoken.str.contains(language) ].worker_id.values))
    result.append({"lang":language, "unique_workers": people})
    

df_cnt = pd.DataFrame(result).sort_values('unique_workers', ascending=False)    
print (df_cnt)    
# more than 10 people for the language

#df2 = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
#enough = df2 [df2.unique_workers > 9]
#len(enough)

             lang  unique_workers
95        English           51605
68        Spanish            4116
23          Tamil            2910
79          Hindi            1808
47         French            1713
45     Portuguese            1130
69         German             840
37        Italian             815
61          Malay             812
36      Malayalam             790
38        Chinese             729
19         Tegulu             374
10       Japanese             315
1          Arabic             293
11        Russian             286
29         Korean             204
93        Kannada             199
90        Tagalog             198
58           Urdu             171
100    Vietnamese             145
77          Dutch             131
87        Marathi             129
94        Bengali             129
71       Gujarati             120
35        Punjabi             113
70         Polish             104
88        Turkish              90
82       Romanian              84
67          Gr

In [26]:
df_cnt.unique_workers.value_counts().sort_index()

1        15
2         7
3         6
4         1
5         1
6         5
7         6
8         3
10        1
11        1
12        1
13        1
15        1
16        2
17        2
18        1
19        2
20        2
21        1
23        2
25        2
26        1
29        1
30        1
31        2
32        2
34        1
35        1
39        1
44        1
         ..
51        1
52        1
71        1
84        1
90        1
104       1
113       1
120       1
129       2
131       1
145       1
171       1
198       1
199       1
204       1
286       1
293       1
315       1
374       1
729       1
790       1
812       1
815       1
840       1
1130      1
1713      1
1808      1
2910      1
4116      1
51605     1
Name: unique_workers, Length: 62, dtype: int64

In [27]:
len([v for v in df.languages_spoken.values if 'Spanish' in v])

6822

In [28]:
1189 / 19268 * 170000

10490.450487855513

In [29]:
def intersect(a, b):
    """ return the intersection of two lists """
    return list(set(a) & set(b))

my_langauge = ['English', 'Tamil', 'Spanish', 'Hindi', 'Malayalam', 'French', 'Telugu', 'Chinese', 'German', 'Kannada', 'Italian', 'Portuguese', 'Marathi', 'Arabic', 'Russian', 'Japanese', 'Gujarati', 'Urdu', 'Bengali', 'Punjabi', 'Korean', 'Tagalog', 'Romanian', 'Vietnamese', 'Greek', 'Polish', 'Dutch', 'Turkish', 'Hebrew', 'Swedish', 'Serbian', 'Nepali', 'Bulgarian', 'Macedonian', 'Oriya']
pavlick_langauge = ['English', 'Tamil', 'Malayalam', 'Hindi', 'Spanish', 'Telugu', 'Chinese', 'Romanian', 'Portuguese', 'Arabic', 'Kannada', 'German', 'French', 'Polish', 'Urdu', 'Tagalog', 'Marathi', 'Russian', 'Italian', 'Bengali', 'Gujarati', 'Hebrew', 'Dutch', 'Turkish', 'Vietnamese', 'Macedonian', 'Cebuano', 'Swedish', 'Bulgarian', 'Swahili', 'Hungarian', 'Catalan', 'Thai', 'Lithuanian', 'Punjabi']

intersect_language = intersect(my_langauge, pavlick_langauge)
len(intersect_language)


29

In [30]:
from scipy.stats import spearmanr

In [31]:
spearmanr(my_langauge,pavlick_langauge)



SpearmanrResult(correlation=0.057703081232493, pvalue=0.7419647128381073)