### Demographics API

Below we have the code that retrieves the data from the  Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
    
    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")
    
    if hit_creation_str is None: 
        hit_creation_date = None 
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # convert to unix timestamp
        hit_date_ts = time.mktime(hit_creation_date.timetuple())
        answer_date_ts = time.mktime(hit_answer_date.timetuple())
        diff = int(answer_date_ts-hit_date_ts)
    
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(item["answers"]["gender"]).lower(),
        "household_income": str(item["answers"]["householdIncome"]),
        "educational_level": str(item["answers"].get("educationalLevel")),
        "household_size": str(item["answers"]["householdSize"]),
        "marital_status": str(item["answers"].get("maritalStatus")),
        "languages_spoken": str(item["answers"].get("languagesSpoken")),
        "time_spent_on_mturk": str(item["answers"].get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(item["answers"].get("weeklyIncomeFromMturk")),
        "year_of_birth": int(item["answers"]["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [2]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 5000

# The API call that returns the last survey responses
baseurl = "https://mturk-surveys.appspot.com/" + \
    "_ah/api/survey/v1/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = None

# We store the results in this list
results = []

while True:
    if nextPageToken == None:
        url = baseurl
    else:
        url = baseurl + "&cursor=" + nextPageToken

    resp = requests.get(url)
    if resp.status_code == 200:
        data = json.loads(resp.text)
        items = data.get("items")
        if items == None:
            break
        print("Retrieved ", len(items), " responses")
        responses = [flatten(item) for item in items]
        results.extend(responses)
        print("Total of ", len(results), " responses in our data")
    else:
        print("Something went wrong with the network call")

    nextPageToken = data.get("nextPageToken")
    if nextPageToken == None:
        break


Retrieved  5000  responses
Total of  5000  responses in our data
Retrieved  5000  responses
Total of  10000  responses in our data
Retrieved  5000  responses
Total of  15000  responses in our data
Retrieved  5000  responses
Total of  20000  responses in our data
Retrieved  5000  responses
Total of  25000  responses in our data
Retrieved  5000  responses
Total of  30000  responses in our data
Retrieved  5000  responses
Total of  35000  responses in our data
Retrieved  5000  responses
Total of  40000  responses in our data
Retrieved  5000  responses
Total of  45000  responses in our data
Retrieved  5000  responses
Total of  50000  responses in our data
Retrieved  5000  responses
Total of  55000  responses in our data
Retrieved  5000  responses
Total of  60000  responses in our data
Retrieved  5000  responses
Total of  65000  responses in our data
Retrieved  5000  responses
Total of  70000  responses in our data
Retrieved  5000  responses
Total of  75000  responses in our data
Retrieved  

In [3]:
# Let's print the total number of retrieved responses
print(len(results))

162685


In [4]:
import pandas as pd
df = pd.DataFrame(results)
# Let's save the file as a CSV
df.to_csv("mturk_surveys_extended.csv")

In [44]:
df.languages_spoken.values

array(['English', 'English', 'Malayalam', ..., 'None', 'None', 'None'],
      dtype=object)

In [65]:
lol = [entries.split(',') for entries in df.languages_spoken.values if entries!='None']
s = set()
for l in lol:
    for m in l:
        s.add(m)
s

{'',
 'Abkhazian',
 'Afar',
 'Afrikaans',
 'Albanian',
 'Amharic',
 'Arabic',
 'Armenian',
 'Assamese',
 'Azerbaijani',
 'Bashkir',
 'Basque',
 'Bengali',
 'Bihari',
 'Bulgarian',
 'Burmese',
 'Byelorussian',
 'Cambodian',
 'Catalan',
 'Chinese',
 'Croatian',
 'Czech',
 'Danish',
 'Dutch',
 'English',
 'Esperanto',
 'Estonian',
 'Fiji',
 'Finnish',
 'French',
 'Frisian',
 'Gaelic',
 'Galician',
 'Georgian',
 'German',
 'Greek',
 'Guarani',
 'Gujarati',
 'Hausa',
 'Hebrew',
 'Hindi',
 'Hungarian',
 'Icelandic',
 'Indonesian',
 'Interlingua',
 'Irish',
 'Italian',
 'Japanese',
 'Javanese',
 'Kannada',
 'Kashmiri',
 'Korean',
 'Kurdish',
 'Laothian',
 'Latin',
 'Latvian',
 'Lithuanian',
 'Macedonian',
 'Malagasy',
 'Malay',
 'Malayalam',
 'Maltese',
 'Marathi',
 'Mongolian',
 'Nepali',
 'Norwegian',
 'Oriya',
 'Pashto',
 'Persian',
 'Polish',
 'Portuguese',
 'Punjabi',
 'Romanian',
 'Russian',
 'Samoan',
 'Sanskrit',
 'Serbian',
 'Serbo-Croatian',
 'Shona',
 'Sindhi',
 'Singhalese',
 'Slo

In [None]:
df.people_with_language

In [66]:
people_with_language = len([v for v in df.languages_spoken.values if v!='None' and v!=''])
people_with_language

81618

In [7]:
df.location_country.value_counts()

US    118127
IN     30253
CA      2236
GB      1564
IT       699
BR       610
DE       609
PH       514
VE       465
FR       438
ES       344
ZZ       322
MX       301
AU       249
KE       208
AE       193
RO       178
NL       174
TR       171
JP       163
NG       162
TH       156
ID       152
RU       150
PT       147
GR       142
IE       138
MK       134
UA       119
NZ       116
       ...  
AM         2
TM         2
BS         2
SR         2
RW         2
LI         2
AZ         2
DJ         2
MV         2
PY         2
MO         2
SX         2
NE         1
IM         1
AI         1
FJ         1
BM         1
GN         1
BU         1
CN         1
PF         1
HT         1
FM         1
LA         1
TZ         1
PS         1
AG         1
GM         1
CD         1
UZ         1
Name: location_country, Length: 156, dtype: int64

In [8]:
df [  df.languages_spoken.str.contains(',') ].location_country.value_counts()

IN    7259
US    6703
CA     441
IT     376
BR     346
DE     297
GB     190
FR     188
ES     185
VE     147
MX     138
RO      66
NL      65
PT      56
KE      51
PH      48
CO      48
RU      44
JP      39
AE      38
GR      37
EG      34
MK      34
AU      33
TR      32
BD      31
BE      31
NG      30
BG      30
LT      29
      ... 
LK       5
SV       5
HU       4
ZA       4
KW       4
MT       4
CH       4
NO       4
LV       3
JO       3
SK       3
NI       3
IQ       3
CY       3
MD       2
CW       2
GE       2
MN       2
JM       2
OM       2
AL       2
RW       1
UG       1
MU       1
ME       1
SR       1
MV       1
NA       1
FM       1
UY       1
Name: location_country, Length: 106, dtype: int64

In [9]:
# bilingual and above
len(df [ df.languages_spoken.str.contains(',') ].worker_id.value_counts())

9115

In [29]:
df['bilingual'] = df.languages_spoken.str.contains(',')

In [38]:
pv_bilingual = df.pivot_table(
    index = 'location_country',
    columns='bilingual',
    values = 'worker_id',
    aggfunc='count'
).fillna(0)

pv_bilingual['perc_bilingual'] = pv_bilingual[True] / (pv_bilingual[True] + pv_bilingual[False])
pv_bilingual.sort_values('perc_bilingual', ascending=False)

bilingual,False,True,perc_bilingual
location_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FM,0.0,1.0,1.000000
LB,5.0,17.0,0.772727
BH,14.0,21.0,0.600000
TN,6.0,8.0,0.571429
BR,264.0,346.0,0.567213
DZ,16.0,20.0,0.555556
IT,323.0,376.0,0.537911
ES,159.0,185.0,0.537791
EG,33.0,34.0,0.507463
BO,5.0,5.0,0.500000


In [10]:
# unique ids of workers that answered the language question
len(df [ df.languages_spoken !='None' ].worker_id.value_counts())

38559

In [11]:
df [  df.languages_spoken.str.contains('English') ].location_country.value_counts()

US    56937
IN    15221
CA     1302
GB      977
IT      416
BR      385
DE      356
VE      221
PH      220
ES      208
FR      198
MX      163
ZZ      156
KE      132
AU      130
NL      101
NG       83
TH       80
JP       78
IE       77
RO       74
UA       71
ID       69
PT       66
AE       65
CO       60
TR       59
RU       51
BD       51
PK       47
      ...  
ZM        2
CM        2
RW        2
AL        2
NA        2
SX        2
TJ        2
GD        2
GY        2
MD        2
KG        2
ME        2
JE        2
TM        2
GN        1
AM        1
MV        1
KN        1
BN        1
UZ        1
MU        1
MO        1
DJ        1
UY        1
NE        1
FM        1
LU        1
BY        1
ET        1
SR        1
Name: location_country, Length: 134, dtype: int64

In [12]:
df [  df.languages_spoken.str.contains('Tamil') ].location_country.value_counts()

IN    6202
US      96
AE      21
CA      12
GB      11
ZZ       9
SA       8
FR       4
AU       4
QA       4
NO       3
DE       3
BH       3
SG       2
KW       2
LK       2
UA       2
PL       2
IQ       1
CZ       1
HK       1
NG       1
JP       1
MV       1
SE       1
OM       1
MY       1
IT       1
PK       1
CO       1
Name: location_country, dtype: int64

In [13]:
df [  df.languages_spoken.str.contains('Spanish') ].location_country.value_counts()

US    3565
ES     193
IN     192
VE     169
BR     160
MX     139
IT      75
CO      49
CA      48
GB      44
FR      36
DE      30
PR      29
PT      27
AR      26
DO      18
CL      18
PE      17
EC      15
BE      14
NL      11
JP      11
ZZ       9
GR       8
CR       8
PA       7
SV       7
MA       6
TT       6
TH       6
      ... 
KR       3
TR       3
MT       3
QA       2
SE       2
EE       2
BG       2
JM       2
TW       2
JO       2
UA       2
CW       2
CH       2
RU       2
LB       2
UY       2
PH       2
MD       1
BD       1
MY       1
SA       1
PK       1
CZ       1
AE       1
AL       1
BH       1
MK       1
FI       1
RS       1
NP       1
Name: location_country, Length: 72, dtype: int64

In [14]:
df [  df.languages_spoken.str.contains('Hindi') ].location_country.value_counts()

IN    2930
US     185
CA      31
AE      29
BD      21
GB      19
SA      11
SE       9
NP       9
BR       7
DE       7
PH       6
PK       5
IT       5
UA       4
BH       4
NO       3
KW       3
QA       3
NZ       3
JP       2
SG       2
HK       2
AU       2
KR       2
OM       2
DK       1
ID       1
IQ       1
GH       1
ZZ       1
Name: location_country, dtype: int64

In [15]:
df [  df.languages_spoken.str.contains('French') ].location_country.value_counts()

US    878
CA    289
FR    209
IN    186
IT     64
DE     63
GB     63
BE     30
ES     26
KE     22
NG     20
EG     20
MA     18
DZ     18
BR     18
BH     17
NL     15
IE     12
PT     10
RO      8
TN      8
TR      5
PA      5
VE      5
RS      4
JP      4
MX      4
IL      4
CO      4
GH      4
     ... 
BD      2
AR      2
BG      2
FI      2
SA      2
CH      2
QA      2
BA      1
AL      1
DO      1
UG      1
HK      1
PL      1
MT      1
RU      1
RW      1
AU      1
AE      1
IQ      1
CR      1
FM      1
MU      1
NI      1
MD      1
UA      1
CZ      1
ID      1
JO      1
PK      1
AT      1
Name: location_country, Length: 66, dtype: int64

In [16]:
df [  df.languages_spoken.str.contains('Malayalam') ].location_country.value_counts()

IN    1550
AE      28
US      24
SA      10
CA       6
QA       4
GB       3
BH       3
KW       3
OM       2
ZZ       2
IQ       1
CO       1
GT       1
BN       1
MV       1
Name: location_country, dtype: int64

In [17]:
df [  df.languages_spoken.str.contains('Tegulu') ].location_country.value_counts()

IN    715
US     31
GB      7
SE      3
ZZ      2
PL      2
AU      1
CA      1
FR      1
JP      1
ID      1
BH      1
Name: location_country, dtype: int64

In [18]:
df [  df.languages_spoken.str.contains('German') ].location_country.value_counts()

US    418
DE    304
IN     48
GB     30
IT     28
FR     19
BE     18
NL     17
ES     16
IE     12
LB     12
CA     11
FI     10
AT      9
CO      6
MX      6
DK      5
TR      5
AU      4
BR      4
HR      4
MK      3
TH      3
GR      2
AR      2
RO      2
VE      2
SI      2
ZZ      1
SV      1
CH      1
PL      1
NO      1
BA      1
KE      1
AL      1
LT      1
JP      1
GT      1
BO      1
PH      1
TN      1
VN      1
EG      1
ID      1
ZA      1
SE      1
CY      1
RS      1
Name: location_country, dtype: int64

In [61]:
(df [  df.languages_spoken.str.contains('Chinese') ].location_country.value_counts() / df.location_country.value_counts()).sort_values(ascending=False)

MN    0.250000
HK    0.250000
MY    0.212121
TW    0.146341
DK    0.085714
KR    0.053333
LV    0.043478
SG    0.037383
CA    0.036673
GH    0.033333
AU    0.020080
ZA    0.014706
AR    0.013699
ID    0.013158
JP    0.012270
NL    0.011494
BE    0.010204
MX    0.009967
PK    0.009709
ES    0.008721
NZ    0.008621
PH    0.007782
IT    0.007153
GB    0.007033
TH    0.006410
ZZ    0.006211
US    0.005790
AE    0.005181
DE    0.004926
FR    0.004566
        ...   
PT         NaN
PY         NaN
QA         NaN
RO         NaN
RS         NaN
RU         NaN
RW         NaN
SA         NaN
SE         NaN
SI         NaN
SK         NaN
SR         NaN
SV         NaN
SX         NaN
TJ         NaN
TM         NaN
TN         NaN
TR         NaN
TT         NaN
TZ         NaN
UA         NaN
UG         NaN
UY         NaN
UZ         NaN
VC         NaN
VE         NaN
VI         NaN
VN         NaN
ZM         NaN
ZW         NaN
Name: location_country, Length: 156, dtype: float64

In [20]:
df [  df.languages_spoken.str.contains('Macedonian') ].location_country.value_counts()

MK    39
RS     4
US     2
BA     2
IT     1
Name: location_country, dtype: int64

In [21]:
languages = set([l for v in df.languages_spoken.values if v!='None' 
                 for l in v.split(',') if l!=''])

In [22]:

result = []
for language in languages:
    people = len(set(df[ df.languages_spoken.str.contains(language) ].worker_id.values))
    result.append({"lang":language, "unique_workers": people})
    

df_cnt = pd.DataFrame(result).sort_values('unique_workers', ascending=False)    
print (df_cnt)    
# more than 10 people for the language

#df2 = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
#enough = df2 [df2.unique_workers > 9]
#len(enough)

            lang  unique_workers
78       English           37984
52       Spanish            3063
10         Tamil            2496
61         Hindi            1494
23        French            1265
89         Malay             687
19        German             673
39     Malayalam             667
49       Chinese             568
1        Italian             537
44    Portuguese             479
27        Tegulu             328
81      Japanese             249
68       Russian             235
24        Arabic             227
55       Kannada             174
73        Korean             162
86       Tagalog             159
76          Urdu             131
17    Vietnamese             112
38       Marathi             109
14         Dutch             105
91      Gujarati             102
21       Bengali              96
56       Punjabi              90
74        Polish              84
53       Turkish              75
59      Romanian              65
64         Greek              61
34        

In [23]:
df_cnt.unique_workers.value_counts().sort_index()

1        14
2         7
3         4
4         2
5         4
6         6
7         1
8         1
9         1
10        1
11        1
12        1
13        2
14        2
15        2
16        2
17        1
19        1
20        1
21        2
22        2
23        1
24        3
25        3
27        1
30        1
33        1
36        1
39        1
41        1
44        1
61        1
65        1
75        1
84        1
90        1
96        1
102       1
105       1
109       1
112       1
131       1
159       1
162       1
174       1
227       1
235       1
249       1
328       1
479       1
537       1
568       1
667       1
673       1
687       1
1265      1
1494      1
2496      1
3063      1
37984     1
Name: unique_workers, dtype: int64

In [24]:
len([v for v in df.languages_spoken.values if 'Spanish' in v])

5044

In [25]:
1189 / 19268 * 170000

10490.450487855513

In [26]:
def intersect(a, b):
    """ return the intersection of two lists """
    return list(set(a) & set(b))

my_langauge = ['English', 'Tamil', 'Spanish', 'Hindi', 'Malayalam', 'French', 'Telugu', 'Chinese', 'German', 'Kannada', 'Italian', 'Portuguese', 'Marathi', 'Arabic', 'Russian', 'Japanese', 'Gujarati', 'Urdu', 'Bengali', 'Punjabi', 'Korean', 'Tagalog', 'Romanian', 'Vietnamese', 'Greek', 'Polish', 'Dutch', 'Turkish', 'Hebrew', 'Swedish', 'Serbian', 'Nepali', 'Bulgarian', 'Macedonian', 'Oriya']
pavlick_langauge = ['English', 'Tamil', 'Malayalam', 'Hindi', 'Spanish', 'Telugu', 'Chinese', 'Romanian', 'Portuguese', 'Arabic', 'Kannada', 'German', 'French', 'Polish', 'Urdu', 'Tagalog', 'Marathi', 'Russian', 'Italian', 'Bengali', 'Gujarati', 'Hebrew', 'Dutch', 'Turkish', 'Vietnamese', 'Macedonian', 'Cebuano', 'Swedish', 'Bulgarian', 'Swahili', 'Hungarian', 'Catalan', 'Thai', 'Lithuanian', 'Punjabi']

intersect_language = intersect(my_langauge, pavlick_langauge)
len(intersect_language)


29

In [27]:
from scipy.stats import spearmanr

In [28]:
spearmanr(my_langauge,pavlick_langauge)



SpearmanrResult(correlation=0.057703081232493, pvalue=0.7419647128381073)