### Demographics API

Below we have the code that retrieves the data from the  Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
    
    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")
    
    if hit_creation_str is None: 
        hit_creation_date = None 
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # convert to unix timestamp
        hit_date_ts = time.mktime(hit_creation_date.timetuple())
        answer_date_ts = time.mktime(hit_answer_date.timetuple())
        diff = int(answer_date_ts-hit_date_ts)
    
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(item["answers"]["gender"]),
        "household_income": str(item["answers"]["householdIncome"]),
        "educational_level": str(item["answers"].get("educationalLevel")),
        "household_size": str(item["answers"]["householdSize"]),
        "marital_status": str(item["answers"].get("maritalStatus")),
        "languages_spoken": str(item["answers"].get("languagesSpoken")),
        "time_spent_on_mturk": str(item["answers"].get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(item["answers"].get("weeklyIncomeFromMturk")),
        "year_of_birth": int(item["answers"]["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [2]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 5000

# The API call that returns the last survey responses
baseurl = "https://mturk-surveys.appspot.com/" + \
    "_ah/api/survey/v1/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = None

# We store the results in this list
results = []

while True:
    if nextPageToken == None:
        url = baseurl
    else:
        url = baseurl + "&cursor=" + nextPageToken

    resp = requests.get(url)
    if resp.status_code == 200:
        data = json.loads(resp.text)
        items = data.get("items")
        if items == None:
            break
        print("Retrieved ", len(items), " responses")
        responses = [flatten(item) for item in items]
        results.extend(responses)
        print("Total of ", len(results), " responses in our data")
    else:
        print("Something went wrong with the network call")

    nextPageToken = data.get("nextPageToken")
    if nextPageToken == None:
        break


Retrieved  5000  responses
Total of  5000  responses in our data
Retrieved  5000  responses
Total of  10000  responses in our data
Retrieved  5000  responses
Total of  15000  responses in our data
Retrieved  5000  responses
Total of  20000  responses in our data
Retrieved  5000  responses
Total of  25000  responses in our data
Retrieved  5000  responses
Total of  30000  responses in our data
Retrieved  5000  responses
Total of  35000  responses in our data
Retrieved  5000  responses
Total of  40000  responses in our data
Retrieved  5000  responses
Total of  45000  responses in our data
Retrieved  5000  responses
Total of  50000  responses in our data
Retrieved  5000  responses
Total of  55000  responses in our data
Retrieved  5000  responses
Total of  60000  responses in our data
Retrieved  5000  responses
Total of  65000  responses in our data
Retrieved  5000  responses
Total of  70000  responses in our data
Retrieved  5000  responses
Total of  75000  responses in our data
Retrieved  

In [3]:
# Let's print the total number of retrieved responses
print(len(results))

100519


In [4]:
import pandas as pd
df = pd.DataFrame(results)
# Let's save the file as a CSV
df.to_csv("mturk_surveys_extended.csv")

In [5]:
df

Unnamed: 0,educational_level,gender,hit_answered_date,hit_creation_date,household_income,household_size,languages_spoken,location_city,location_country,location_region,marital_status,post_to_completion_secs,time_spent_on_mturk,weekly_income_from_mturk,worker_id,year_of_birth
0,Bachelors degree,female,2018-01-10 20:28:40.525,2018-01-10 20:27:12,"$40,000-$59,999",1,English,killeen,US,tx,single,88.0,4-8 hours per week,$1-$5 per week,e194c2bac84948e0e9060ff0157dd59d,1989
1,"Some college, no degree",female,2018-01-10 20:20:45.463,2018-01-10 20:12:11,"$40,000-$59,999",4,English,denver,US,co,married,514.0,4-8 hours per week,$10-$20 per week,f406ad4e7a7ecd57023f975f3549def2,1984
2,"Graduate degree, Masters",female,2018-01-10 20:01:29.094,2018-01-10 19:57:12,"$100,000 or more",5+,English,omaha,US,ne,married,257.0,8-20 hours per week,$50-$100 per week,01ecbbbf603a9784f8426811d20abe0c,1975
3,Bachelors degree,female,2018-01-10 19:49:31.752,2018-01-10 19:42:12,"$40,000-$59,999",2,English,powell,US,oh,married,439.0,20-40 hours per week,$10-$20 per week,eaeb7e00d369241f9e700f3c4b18b958,1950
4,Associates degree,female,2018-01-10 19:28:38.262,2018-01-10 19:27:12,"$40,000-$59,999",2,English,high springs,US,fl,married,86.0,4-8 hours per week,$5-$10 per week,7cbaecb10cc3b8e6aa5ea2928661efee,1974
5,"Graduate degree, Masters",male,2018-01-10 19:18:17.144,2018-01-10 19:12:15,"$100,000 or more",4,"English,Spanish,Portuguese",minneapolis,US,mn,married,362.0,1-2 hours per week,$5-$10 per week,0c59d157b4d4bee25ebfc73c2b69be9e,1971
6,Associates degree,male,2018-01-10 19:03:21.846,2018-01-10 18:27:12,"Less than $10,000",1,English,scranton,US,pa,single,2169.0,20-40 hours per week,$50-$100 per week,7022a7dc52f8f23d958b277ad98256cb,1986
7,"Some college, no degree",male,2018-01-10 19:02:06.863,2018-01-10 18:57:11,"Less than $10,000",2,English,tuscumbia,US,al,married,295.0,Less than 1 hour per week,$1-$5 per week,99074b9e0ad71b4540aced8b18cbc0e9,1986
8,Bachelors degree,male,2018-01-10 18:43:22.923,2018-01-10 18:42:13,"Less than $10,000",4,"English,Tamil",madurai,IN,tn,single,69.0,4-8 hours per week,$10-$20 per week,d70ac6a57c693c63b1f2b17dad44fd95,1994
9,"Graduate degree, Masters",female,2018-01-10 18:17:18.917,2018-01-10 18:12:10,"$100,000 or more",2,English,fort wayne,US,in,married,308.0,4-8 hours per week,$5-$10 per week,3f7de1a6e35847518fffcd8d0701f4ea,1976


In [6]:
people_with_language = len([v for v in df.languages_spoken.values if v!='None' and v!=''])
people_with_language

19452

In [7]:
df.location_country.value_counts()

US    75428
IN    16862
CA     1148
GB      709
PH      343
VE      280
DE      275
FR      255
IT      250
MX      186
BR      179
ZZ      177
ES      156
AE      147
AU      138
RO      133
TR      117
GR      117
NG      112
PT      111
RU      110
MK      105
KE      103
ID      101
JP      101
TH       98
SG       93
NZ       92
NL       88
SA       79
      ...  
LI        2
ET        2
PY        2
SR        2
KG        2
NA        2
AZ        2
CN        1
AM        1
TZ        1
AG        1
PS        1
CD        1
LA        1
PF        1
BM        1
MN        1
ME        1
CW        1
DJ        1
GM        1
HT        1
AI        1
FM        1
MO        1
SX        1
BU        1
TJ        1
IM        1
FJ        1
Name: location_country, Length: 150, dtype: int64

In [38]:
df [  df.languages_spoken.str.contains(',') ].location_country.value_counts()

IN    1910
US    1585
CA      80
VE      47
IT      46
MX      43
DE      42
FR      35
GB      34
RO      27
ES      24
PT      22
BR      20
GR      17
AE      17
CO      16
PH      16
DZ      15
NG      15
NL      13
KE      13
JP      13
EG      13
LT      11
RU      11
HK      11
PK       9
BE       9
ID       9
RS       9
      ... 
FI       3
LK       3
MD       2
DK       2
OM       2
SK       2
ZA       2
LV       2
CZ       2
GT       2
JO       1
NO       1
SI       1
SG       1
UY       1
TN       1
BA       1
KW       1
EE       1
MV       1
JM       1
MT       1
SR       1
LB       1
PA       1
FM       1
GH       1
IQ       1
HU       1
NA       1
Name: location_country, Length: 92, dtype: int64

In [50]:
# bilingual and above
len(df [ df.languages_spoken.str.contains(',') ].worker_id.value_counts())

2889

In [51]:
# unique ids of workers that answered the language question
len(df [ df.languages_spoken !='None' ].worker_id.value_counts())

12123

In [72]:
df [  df.languages_spoken.str.contains('English') ].location_country.value_counts()

US    14356
IN     3164
CA      235
GB      125
IT       55
PH       52
MX       49
VE       49
DE       48
FR       38
NG       33
PT       31
ES       30
RO       30
KE       27
TH       24
BR       23
AE       22
ID       21
GR       20
NL       19
AU       19
JP       18
IE       18
TR       17
CO       17
DZ       16
BD       15
TT       14
HK       13
      ...  
LC        2
LB        2
NO        2
DK        2
JE        2
AF        2
HU        2
MV        1
KW        1
BN        1
UY        1
CW        1
SR        1
JO        1
IQ        1
GH        1
BA        1
FM        1
SI        1
ZW        1
NA        1
SX        1
MT        1
ME        1
PA        1
EE        1
DM        1
GE        1
TN        1
LU        1
Name: location_country, Length: 109, dtype: int64

In [73]:
df [  df.languages_spoken.str.contains('Tamil') ].location_country.value_counts()

IN    1419
US      18
AE      10
SA       5
GB       3
CA       3
LK       2
NG       1
PK       1
ZZ       1
IT       1
OM       1
IQ       1
MV       1
SE       1
AU       1
SG       1
Name: location_country, dtype: int64

In [74]:
df [  df.languages_spoken.str.contains('Spanish') ].location_country.value_counts()

US    881
VE     56
MX     44
IN     32
ES     24
CO     18
PT     13
FR     10
CL      8
DE      7
IT      7
GB      7
CA      6
DO      6
BR      5
PR      5
BO      5
JP      5
CR      4
NL      4
AR      4
PE      4
RO      3
BE      3
EC      3
KE      2
GR      2
BG      2
CH      2
GT      2
QA      2
ZZ      2
TH      2
MK      1
PA      1
MA      1
ID      1
TR      1
JM      1
AU      1
GH      1
MD      1
JO      1
EE      1
TW      1
AE      1
VN      1
KR      1
UA      1
UY      1
BH      1
CZ      1
MY      1
DK      1
RS      1
Name: location_country, dtype: int64

In [75]:
df [  df.languages_spoken.str.contains('Hindi') ].location_country.value_counts()

IN    1047
US      49
AE      15
SA       5
NP       4
BD       4
GB       4
CA       3
SE       3
OM       2
NZ       2
DE       2
KR       2
JP       1
IQ       1
AU       1
PK       1
ID       1
HK       1
QA       1
Name: location_country, dtype: int64

In [85]:
df [  df.languages_spoken.str.contains('French') ].location_country.value_counts()

US    190
CA     52
IN     44
FR     36
GB     18
DZ     14
DE     10
BE      9
NG      7
EG      7
BH      6
BR      5
PT      5
IT      4
MA      4
TR      4
RS      3
NL      3
RO      3
QA      2
CH      2
DK      1
TN      1
UA      1
AT      1
PL      1
BD      1
MX      1
CZ      1
FM      1
IE      1
BG      1
MD      1
JO      1
CO      1
MT      1
IL      1
AE      1
JP      1
Name: location_country, dtype: int64

In [82]:
df [  df.languages_spoken.str.contains('Malayalam') ].location_country.value_counts()

IN    431
AE      8
US      7
SA      4
OM      2
QA      2
MV      1
BN      1
IQ      1
Name: location_country, dtype: int64

In [83]:
df [  df.languages_spoken.str.contains('Tegulu') ].location_country.value_counts()

IN    251
US      8
GB      3
SE      1
ID      1
CA      1
JP      1
Name: location_country, dtype: int64

In [79]:
df [  df.languages_spoken.str.contains('German') ].location_country.value_counts()

US    110
DE     43
IN     14
GB      7
CO      5
NL      5
AT      4
BE      4
FR      4
IT      4
TR      3
MX      3
CA      2
IE      2
LT      1
LB      1
BO      1
CH      1
RS      1
KE      1
DK      1
SI      1
RO      1
MK      1
NO      1
Name: location_country, dtype: int64

In [80]:
df [  df.languages_spoken.str.contains('Chinese') ].location_country.value_counts()

US    151
IN     24
CA     21
HK     11
MY      6
PH      2
KR      2
IT      2
ID      2
TW      2
DE      2
NZ      1
DK      1
ZA      1
ES      1
GB      1
JP      1
LV      1
Name: location_country, dtype: int64

In [81]:
df [  df.languages_spoken.str.contains('Macedonian') ].location_country.value_counts()

MK    13
RS     2
IT     1
Name: location_country, dtype: int64

In [57]:
languages = set([l for v in df.languages_spoken.values if v!='None' 
                 for l in v.split(',') if l!=''])

In [84]:

result = []
for language in languages:
    people = len(set(df[ df.languages_spoken.str.contains(language) ].worker_id.values))
    result.append({"lang":language, "unique_workers": people})
    

df_cnt = pd.DataFrame(result).sort_values('unique_workers', ascending=False)    
print (df_cnt)    
# more than 10 people for the language

#df2 = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
#enough = df2 [df2.unique_workers > 9]
#len(enough)

           lang  unique_workers
76      English           11921
0       Spanish             865
58        Tamil             865
55        Hindi             735
35       French             324
67        Malay             272
39    Malayalam             265
30       Tegulu             168
8        German             161
75      Chinese             159
15      Kannada             101
79      Italian              99
29   Portuguese              79
6      Japanese              71
52       Arabic              69
62      Marathi              64
20      Russian              61
38     Gujarati              51
5          Urdu              51
12      Bengali              49
57      Punjabi              41
40       Korean              39
51      Tagalog              37
36   Vietnamese              33
27       Polish              26
4         Greek              25
18     Romanian              24
71        Dutch              19
47      Serbian              18
46      Turkish              17
..      

In [71]:
df_cnt.unique_workers.value_counts().sort_index()

1        16
2         6
3         2
4         3
5         2
6         2
7         5
8         5
9         2
10        1
12        3
13        1
14        2
16        2
17        1
18        1
19        1
24        1
25        1
26        1
33        1
37        1
39        1
41        1
49        1
51        2
61        1
64        1
69        1
71        1
79        1
99        1
101       1
159       1
161       1
168       1
265       1
272       1
324       1
735       1
865       2
11921     1
Name: unique_workers, dtype: int64

In [13]:
len([v for v in df.languages_spoken.values if 'Spanish' in v])

1202

In [36]:
1189 / 19268 * 170000

10490.450487855513

In [14]:
def intersect(a, b):
    """ return the intersection of two lists """
    return list(set(a) & set(b))

my_langauge = ['English', 'Tamil', 'Spanish', 'Hindi', 'Malayalam', 'French', 'Telugu', 'Chinese', 'German', 'Kannada', 'Italian', 'Portuguese', 'Marathi', 'Arabic', 'Russian', 'Japanese', 'Gujarati', 'Urdu', 'Bengali', 'Punjabi', 'Korean', 'Tagalog', 'Romanian', 'Vietnamese', 'Greek', 'Polish', 'Dutch', 'Turkish', 'Hebrew', 'Swedish', 'Serbian', 'Nepali', 'Bulgarian', 'Macedonian', 'Oriya']
pavlick_langauge = ['English', 'Tamil', 'Malayalam', 'Hindi', 'Spanish', 'Telugu', 'Chinese', 'Romanian', 'Portuguese', 'Arabic', 'Kannada', 'German', 'French', 'Polish', 'Urdu', 'Tagalog', 'Marathi', 'Russian', 'Italian', 'Bengali', 'Gujarati', 'Hebrew', 'Dutch', 'Turkish', 'Vietnamese', 'Macedonian', 'Cebuano', 'Swedish', 'Bulgarian', 'Swahili', 'Hungarian', 'Catalan', 'Thai', 'Lithuanian', 'Punjabi']

intersect_language = intersect(my_langauge, pavlick_langauge)
len(intersect_language)


29

In [65]:
from scipy.stats import spearmanr

In [66]:
spearmanr(my_langauge,pavlick_langauge)



SpearmanrResult(correlation=0.057703081232493, pvalue=0.74196471283810728)