### Demographics API

Below we have the code that retrieves the data from the  Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
    
    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")
    
    if hit_creation_str is None: 
        hit_creation_date = None 
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # convert to unix timestamp
        hit_date_ts = time.mktime(hit_creation_date.timetuple())
        answer_date_ts = time.mktime(hit_answer_date.timetuple())
        diff = int(answer_date_ts-hit_date_ts)
    
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(item["answers"]["gender"]),
        "household_income": str(item["answers"]["householdIncome"]),
        "educational_level": str(item["answers"].get("educationalLevel")),
        "household_size": str(item["answers"]["householdSize"]),
        "marital_status": str(item["answers"].get("maritalStatus")),
        "languages_spoken": str(item["answers"].get("languagesSpoken")),
        "time_spent_on_mturk": str(item["answers"].get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(item["answers"].get("weeklyIncomeFromMturk")),
        "year_of_birth": int(item["answers"]["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [2]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 5000

# The API call that returns the last survey responses
baseurl = "https://mturk-surveys.appspot.com/" + \
    "_ah/api/survey/v1/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = None

# We store the results in this list
results = []

while True:
    if nextPageToken == None:
        url = baseurl
    else:
        url = baseurl + "&cursor=" + nextPageToken

    resp = requests.get(url)
    if resp.status_code == 200:
        data = json.loads(resp.text)
        items = data.get("items")
        if items == None:
            break
        print("Retrieved ", len(items), " responses")
        responses = [flatten(item) for item in items]
        results.extend(responses)
        print("Total of ", len(results), " responses in our data")
    else:
        print("Something went wrong with the network call")

    nextPageToken = data.get("nextPageToken")
    if nextPageToken == None:
        break


Retrieved  5000  responses
Total of  5000  responses in our data
Retrieved  5000  responses
Total of  10000  responses in our data
Retrieved  5000  responses
Total of  15000  responses in our data
Retrieved  5000  responses
Total of  20000  responses in our data
Retrieved  5000  responses
Total of  25000  responses in our data
Retrieved  5000  responses
Total of  30000  responses in our data
Retrieved  5000  responses
Total of  35000  responses in our data
Retrieved  5000  responses
Total of  40000  responses in our data
Retrieved  5000  responses
Total of  45000  responses in our data
Retrieved  5000  responses
Total of  50000  responses in our data
Retrieved  5000  responses
Total of  55000  responses in our data
Retrieved  5000  responses
Total of  60000  responses in our data
Retrieved  5000  responses
Total of  65000  responses in our data
Retrieved  5000  responses
Total of  70000  responses in our data
Retrieved  5000  responses
Total of  75000  responses in our data
Retrieved  

In [3]:
# Let's print the total number of retrieved responses
print(len(results))

147636


In [4]:
import pandas as pd
df = pd.DataFrame(results)
# Let's save the file as a CSV
df.to_csv("mturk_surveys_extended.csv")

In [5]:
df

Unnamed: 0,educational_level,gender,hit_answered_date,hit_creation_date,household_income,household_size,languages_spoken,location_city,location_country,location_region,marital_status,post_to_completion_secs,time_spent_on_mturk,weekly_income_from_mturk,worker_id,year_of_birth
0,Bachelors degree,male,2019-06-06 22:55:24.228,2019-06-06 22:54:03,"$60,000-$74,999",3,English,morristown,US,vt,single,81.0,8-20 hours per week,$50-$100 per week,c9f6bb33204d93d513194ba62fe4efd5,1984
1,Bachelors degree,male,2019-06-06 22:55:14.758,NaT,"$60,000-$74,999",3,English,morristown,US,vt,single,,8-20 hours per week,$50-$100 per week,c9f6bb33204d93d513194ba62fe4efd5,1984
2,Bachelors degree,female,2019-06-06 22:41:09.236,2019-06-06 22:39:00,"$100,000 or more",2,English,saint petersburg,US,fl,single,129.0,4-8 hours per week,$10-$20 per week,bbfbb69c463755f15dc29fe4a7dec402,1995
3,Bachelors degree,female,2019-06-06 22:30:18.027,2019-06-06 21:24:00,"$25,000-$39,999",1,English,lancaster,US,oh,single,3978.0,20-40 hours per week,$200-$500 per week,203b0c395ceac95b3ed1f5a4a7251e7a,1952
4,Bachelors degree,female,2019-06-06 22:25:07.582,2019-06-06 22:24:00,"$75,000-$99,999",2,English,poway,US,ca,single,67.0,4-8 hours per week,$20-$50 per week,2cac613afe87215c4bd78dc0191ea776,1985
5,Associates degree,male,2019-06-06 22:09:36.167,2019-06-06 22:09:01,"$60,000-$74,999",3,English,portland,US,or,divorced,35.0,20-40 hours per week,$100-$200 per week,78547b65eea3b526f47a84a6e7146eb1,1972
6,"Graduate degree, Masters",female,2019-06-06 22:00:12.966,2019-06-06 20:54:01,"$100,000 or more",4,English,takoma park,US,md,married,3971.0,8-20 hours per week,$50-$100 per week,81f19b54a956e7ccdcd49e535aee6180,1968
7,Associates degree,male,2019-06-06 21:55:15.924,2019-06-06 21:54:06,"$100,000 or more",2,English,elk grove,US,ca,married,69.0,4-8 hours per week,$20-$50 per week,8af1652a8b38e26e2dc141331a876ed4,1948
8,"Some college, no degree",male,2019-06-06 21:42:33.401,2019-06-06 21:39:00,"$60,000-$74,999",5+,"English,Spanish",dallas,US,tx,cohabitating,213.0,8-20 hours per week,$10-$20 per week,0496f91f525e182dba49ba402657df15,1988
9,"Some college, no degree",male,2019-06-06 21:27:08.419,2019-06-06 21:09:00,"Less than $10,000",4,"English,Spanish",guernica,AR,b,single,1088.0,2-4 hours per week,$1-$5 per week,2aa86ed25e7b688866534c8e261e425c,1997


In [6]:
people_with_language = len([v for v in df.languages_spoken.values if v!='None' and v!=''])
people_with_language

66569

In [7]:
df.location_country.value_counts()

US    108530
IN     26549
CA      1948
GB      1256
IT       551
DE       532
PH       482
VE       442
FR       378
BR       341
ZZ       287
ES       276
MX       275
AU       216
KE       194
AE       187
RO       170
JP       155
TR       155
TH       154
NG       150
NL       146
RU       142
ID       142
PT       139
GR       139
MK       131
IE       119
NZ       109
SG       105
       ...  
AZ         2
TM         2
ME         2
SX         2
BS         2
DJ         2
AM         2
LI         2
MO         2
MV         2
RW         2
SR         2
HT         1
PF         1
AG         1
FJ         1
GM         1
CN         1
IM         1
GN         1
AI         1
BU         1
PS         1
NE         1
LA         1
BM         1
FM         1
UZ         1
TZ         1
CD         1
Name: location_country, Length: 156, dtype: int64

In [8]:
df [  df.languages_spoken.str.contains(',') ].location_country.value_counts()

IN    5926
US    5492
CA     350
IT     262
DE     236
BR     146
FR     142
GB     136
VE     131
ES     127
MX     118
RO      59
PT      48
KE      45
NL      45
PH      41
CO      40
JP      38
RU      37
AE      37
GR      35
EG      33
MK      31
NG      28
BG      25
PK      25
BE      25
BD      24
LT      24
PR      24
      ... 
HU       4
SV       4
NZ       4
TT       4
LV       3
CY       3
NI       3
CH       3
IQ       3
ZW       2
GE       2
ZA       2
SK       2
CW       2
JM       2
MD       2
MN       2
MT       2
JO       2
AL       2
OM       2
FM       1
MV       1
NO       1
UY       1
MU       1
RW       1
NA       1
SR       1
ME       1
Name: location_country, Length: 105, dtype: int64

In [9]:
# bilingual and above
len(df [ df.languages_spoken.str.contains(',') ].worker_id.value_counts())

7580

In [10]:
# unique ids of workers that answered the language question
len(df [ df.languages_spoken !='None' ].worker_id.value_counts())

32551

In [11]:
df [  df.languages_spoken.str.contains('English') ].location_country.value_counts()

US    47364
IN    11941
CA     1021
GB      671
IT      295
DE      285
VE      203
PH      188
BR      167
FR      146
ES      144
MX      138
ZZ      121
KE      118
AU       97
TH       78
NL       73
JP       71
NG       71
RO       66
ID       60
AE       60
IE       58
PT       58
UA       53
CO       52
TR       46
RU       43
GR       41
BD       39
      ...  
MD        2
SX        2
NO        2
ME        2
AL        2
TM        2
TJ        2
NA        2
ZM        2
CM        2
DM        2
RW        2
MT        2
AM        1
UY        1
MO        1
DJ        1
FM        1
UZ        1
SR        1
MU        1
GD        1
LU        1
KG        1
ET        1
MV        1
GN        1
BN        1
NE        1
GY        1
Name: location_country, Length: 132, dtype: int64

In [12]:
df [  df.languages_spoken.str.contains('Tamil') ].location_country.value_counts()

IN    4947
US      85
AE      20
ZZ       9
CA       9
GB       8
SA       7
BH       3
AU       3
FR       3
KW       2
PL       2
LK       2
SG       2
IQ       1
SE       1
OM       1
CZ       1
NG       1
UA       1
CO       1
JP       1
QA       1
IT       1
PK       1
DE       1
MV       1
MY       1
HK       1
Name: location_country, dtype: int64

In [13]:
df [  df.languages_spoken.str.contains('Spanish') ].location_country.value_counts()

US    2929
IN     162
VE     148
ES     131
MX     118
IT      54
BR      51
CO      41
CA      37
FR      30
GB      26
PR      24
DE      24
PT      23
AR      21
DO      16
CL      16
PE      16
EC      12
BE      11
JP      10
CR       8
GR       8
NL       8
ZZ       7
SV       6
TH       6
BO       5
ID       5
MA       5
      ... 
JM       2
BG       2
EE       2
QA       2
SE       2
CH       2
TW       2
LB       2
PH       2
CW       2
NG       2
KR       2
IE       1
RU       1
AE       1
UA       1
FI       1
CZ       1
NP       1
BD       1
PK       1
MT       1
JO       1
MD       1
MK       1
RS       1
AL       1
BH       1
SA       1
MY       1
Name: location_country, Length: 72, dtype: int64

In [14]:
df [  df.languages_spoken.str.contains('Hindi') ].location_country.value_counts()

IN    2499
US     151
AE      29
CA      15
BD      15
GB      15
SE       9
SA       8
NP       8
IT       4
BH       4
DE       4
PK       4
KW       3
QA       3
PH       3
UA       3
SG       2
JP       2
KR       2
OM       2
HK       2
BR       2
NZ       2
AU       2
ZZ       1
IQ       1
ID       1
DK       1
GH       1
Name: location_country, dtype: int64

In [15]:
df [  df.languages_spoken.str.contains('French') ].location_country.value_counts()

US    712
CA    228
IN    158
FR    156
DE     52
GB     47
IT     38
BE     24
NG     20
KE     19
EG     19
DZ     18
ES     18
BH     16
MA     16
NL     13
BR     13
PT      8
IE      7
RO      6
TR      5
TN      5
CO      4
GH      4
JP      4
MX      4
RS      3
PA      3
VE      3
LB      3
     ... 
CH      2
FI      2
HU      2
QA      2
SA      2
BG      2
AR      2
BD      2
MD      1
NI      1
JO      1
MT      1
AE      1
AL      1
MU      1
FM      1
UA      1
PH      1
RU      1
PK      1
AT      1
BA      1
PL      1
IQ      1
RW      1
CR      1
KR      1
DO      1
CZ      1
ID      1
Name: location_country, Length: 63, dtype: int64

In [16]:
df [  df.languages_spoken.str.contains('Malayalam') ].location_country.value_counts()

IN    1252
AE      26
US      22
SA       7
CA       5
QA       4
BH       3
GB       3
KW       3
ZZ       2
OM       2
MV       1
GT       1
IQ       1
BN       1
CO       1
Name: location_country, dtype: int64

In [17]:
df [  df.languages_spoken.str.contains('Tegulu') ].location_country.value_counts()

IN    622
US     28
GB      7
SE      3
PL      2
ZZ      2
JP      1
CA      1
AU      1
ID      1
BH      1
Name: location_country, dtype: int64

In [18]:
df [  df.languages_spoken.str.contains('German') ].location_country.value_counts()

US    342
DE    246
IN     40
GB     27
IT     20
BE     13
NL     13
FR     12
LB     11
ES     10
CA      9
AT      9
IE      7
CO      6
MX      6
FI      6
DK      5
TR      5
HR      4
TH      3
AU      3
BR      3
MK      3
SI      2
AR      2
VE      2
RO      2
GR      2
EG      1
CY      1
RS      1
JP      1
BO      1
CH      1
LT      1
ZZ      1
NO      1
SV      1
PH      1
PL      1
ID      1
KE      1
GT      1
BA      1
VN      1
AL      1
Name: location_country, dtype: int64

In [19]:
df [  df.languages_spoken.str.contains('Chinese') ].location_country.value_counts()

US    565
IN     73
CA     65
HK     15
MY     12
GB      9
TW      5
PH      4
SG      4
IT      4
KR      4
DE      3
DK      3
AU      3
ID      2
ES      2
FR      2
ZZ      2
MX      2
JP      2
LV      1
NZ      1
NL      1
AE      1
BE      1
AR      1
MN      1
ZA      1
PK      1
GH      1
TH      1
Name: location_country, dtype: int64

In [20]:
df [  df.languages_spoken.str.contains('Macedonian') ].location_country.value_counts()

MK    36
RS     4
BA     2
IT     1
Name: location_country, dtype: int64

In [21]:
languages = set([l for v in df.languages_spoken.values if v!='None' 
                 for l in v.split(',') if l!=''])

In [22]:

result = []
for language in languages:
    people = len(set(df[ df.languages_spoken.str.contains(language) ].worker_id.values))
    result.append({"lang":language, "unique_workers": people})
    

df_cnt = pd.DataFrame(result).sort_values('unique_workers', ascending=False)    
print (df_cnt)    
# more than 10 people for the language

#df2 = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
#enough = df2 [df2.unique_workers > 9]
#len(enough)

            lang  unique_workers
46       English           32103
6        Spanish            2498
2          Tamil            2133
22         Hindi            1305
69        French            1032
64         Malay             595
61     Malayalam             578
82        German             558
4        Chinese             479
62       Italian             416
96        Tegulu             296
97    Portuguese             280
24      Japanese             214
66        Arabic             191
47       Russian             186
18       Kannada             161
80        Korean             129
55       Tagalog             123
74          Urdu             110
58    Vietnamese              98
41       Marathi              98
92      Gujarati              90
89       Bengali              85
39         Dutch              79
54       Punjabi              72
27        Polish              69
33       Turkish              64
17      Romanian              57
37         Greek              51
83      Cr

In [23]:
df_cnt.unique_workers.value_counts().sort_index()

1        16
2         3
3         3
4         5
5         4
6         6
7         1
9         2
10        2
11        1
12        1
13        3
14        1
15        3
18        3
19        3
20        1
21        2
22        1
23        1
24        1
26        1
28        1
31        2
36        1
37        1
51        1
57        1
64        1
69        1
72        1
79        1
85        1
90        1
98        2
110       1
123       1
129       1
161       1
186       1
191       1
214       1
280       1
296       1
416       1
479       1
558       1
578       1
595       1
1032      1
1305      1
2133      1
2498      1
32103     1
Name: unique_workers, dtype: int64

In [24]:
len([v for v in df.languages_spoken.values if 'Spanish' in v])

4045

In [25]:
1189 / 19268 * 170000

10490.450487855513

In [26]:
def intersect(a, b):
    """ return the intersection of two lists """
    return list(set(a) & set(b))

my_langauge = ['English', 'Tamil', 'Spanish', 'Hindi', 'Malayalam', 'French', 'Telugu', 'Chinese', 'German', 'Kannada', 'Italian', 'Portuguese', 'Marathi', 'Arabic', 'Russian', 'Japanese', 'Gujarati', 'Urdu', 'Bengali', 'Punjabi', 'Korean', 'Tagalog', 'Romanian', 'Vietnamese', 'Greek', 'Polish', 'Dutch', 'Turkish', 'Hebrew', 'Swedish', 'Serbian', 'Nepali', 'Bulgarian', 'Macedonian', 'Oriya']
pavlick_langauge = ['English', 'Tamil', 'Malayalam', 'Hindi', 'Spanish', 'Telugu', 'Chinese', 'Romanian', 'Portuguese', 'Arabic', 'Kannada', 'German', 'French', 'Polish', 'Urdu', 'Tagalog', 'Marathi', 'Russian', 'Italian', 'Bengali', 'Gujarati', 'Hebrew', 'Dutch', 'Turkish', 'Vietnamese', 'Macedonian', 'Cebuano', 'Swedish', 'Bulgarian', 'Swahili', 'Hungarian', 'Catalan', 'Thai', 'Lithuanian', 'Punjabi']

intersect_language = intersect(my_langauge, pavlick_langauge)
len(intersect_language)


29

In [27]:
from scipy.stats import spearmanr

In [28]:
spearmanr(my_langauge,pavlick_langauge)



SpearmanrResult(correlation=0.057703081232493, pvalue=0.7419647128381073)