### Demographics API

Below we have the code that retrieves the data from the  Mechanical Turk Tracker Demographics API.

In [1]:
from datetime import datetime
import time

# This function takes as input the response for a single survey, and transforms it into a flat dictionary
def flatten(item):
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
    
    hit_answer_date = datetime.strptime(item["date"], fmt)
    hit_creation_str = item.get("hitCreationDate")
    
    if hit_creation_str is None: 
        hit_creation_date = None 
        diff = None
    else:
        hit_creation_date = datetime.strptime(hit_creation_str, fmt)
        # convert to unix timestamp
        hit_date_ts = time.mktime(hit_creation_date.timetuple())
        answer_date_ts = time.mktime(hit_answer_date.timetuple())
        diff = int(answer_date_ts-hit_date_ts)
    
    result = {
        "worker_id": str(item["workerId"]),
        "gender": str(item["answers"]["gender"]),
        "household_income": str(item["answers"]["householdIncome"]),
        "educational_level": str(item["answers"].get("educationalLevel")),
        "household_size": str(item["answers"]["householdSize"]),
        "marital_status": str(item["answers"].get("maritalStatus")),
        "languages_spoken": str(item["answers"].get("languagesSpoken")),
        "time_spent_on_mturk": str(item["answers"].get("timeSpentOnMturk")),
        "weekly_income_from_mturk": str(item["answers"].get("weeklyIncomeFromMturk")),
        "year_of_birth": int(item["answers"]["yearOfBirth"]),
        "location_city": str(item.get("locationCity")),
        "location_region": str(item.get("locationRegion")),
        "location_country": str(item["locationCountry"]),
        "hit_answered_date": hit_answer_date,
        "hit_creation_date": hit_creation_date,
        "post_to_completion_secs": diff
    }
    return result

In [2]:
# The code below retrieves all the responses from the Demographics API
# Since we cannot get all the responses at once, we fetch a few thousand
# records at a time, until fetching them all

import requests
import json

limit = 5000

# The API call that returns the last survey responses
baseurl = "https://mturk-surveys.appspot.com/" + \
    "_ah/api/survey/v1/survey/demographics/answers?limit=" + str(limit)

# This is the cursor variable, used to retrieve more pages of results
nextPageToken = None

# We store the results in this list
results = []

while True:
    if nextPageToken == None:
        url = baseurl
    else:
        url = baseurl + "&cursor=" + nextPageToken

    resp = requests.get(url)
    if resp.status_code == 200:
        data = json.loads(resp.text)
        items = data.get("items")
        if items == None:
            break
        print("Retrieved ", len(items), " responses")
        responses = [flatten(item) for item in items]
        results.extend(responses)
        print("Total of ", len(results), " responses in our data")
    else:
        print("Something went wrong with the network call")

    nextPageToken = data.get("nextPageToken")
    if nextPageToken == None:
        break


Retrieved  5000  responses
Total of  5000  responses in our data
Retrieved  5000  responses
Total of  10000  responses in our data
Retrieved  5000  responses
Total of  15000  responses in our data
Retrieved  5000  responses
Total of  20000  responses in our data
Retrieved  5000  responses
Total of  25000  responses in our data
Retrieved  5000  responses
Total of  30000  responses in our data
Retrieved  5000  responses
Total of  35000  responses in our data
Retrieved  5000  responses
Total of  40000  responses in our data
Retrieved  5000  responses
Total of  45000  responses in our data
Retrieved  5000  responses
Total of  50000  responses in our data
Retrieved  5000  responses
Total of  55000  responses in our data
Retrieved  5000  responses
Total of  60000  responses in our data
Retrieved  5000  responses
Total of  65000  responses in our data
Retrieved  5000  responses
Total of  70000  responses in our data
Retrieved  5000  responses
Total of  75000  responses in our data
Retrieved  

In [3]:
# Let's print the total number of retrieved responses
print(len(results))

111019


In [4]:
import pandas as pd
df = pd.DataFrame(results)
# Let's save the file as a CSV
df.to_csv("mturk_surveys_extended.csv")

In [5]:
df

Unnamed: 0,educational_level,gender,hit_answered_date,hit_creation_date,household_income,household_size,languages_spoken,location_city,location_country,location_region,marital_status,post_to_completion_secs,time_spent_on_mturk,weekly_income_from_mturk,worker_id,year_of_birth
0,"Graduate degree, Masters",female,2018-04-28 22:56:56.176,2018-04-28 22:49:11,"$40,000-$59,999",2,English,cleveland,US,oh,married,465.0,4-8 hours per week,$50-$100 per week,c7168b965fb301c91054581eda965492,1982
1,Bachelors degree,female,2018-04-28 22:23:17.321,2018-04-28 22:19:11,"$75,000-$99,999",4,English,pleasanton,US,ca,cohabitating,246.0,4-8 hours per week,$20-$50 per week,dba5d6c507ecd73ce64de0d659024efa,1993
2,Associates degree,female,2018-04-28 22:06:09.421,2018-04-28 22:04:12,"$40,000-$59,999",4,English,chambersburg,US,pa,married,117.0,2-4 hours per week,$5-$10 per week,54af7eaca56517a970497deb57907a39,1983
3,"Some college, no degree",female,2018-04-28 21:50:43.160,2018-04-28 21:49:12,"$10,000-$14,999",1,English,welcome,US,nc,divorced,91.0,Less than 1 hour per week,$5-$10 per week,a0ed9875bfc4dc4950d646751eecc8c3,1974
4,"Some college, no degree",male,2018-04-28 21:36:26.790,2018-04-28 21:34:09,"$40,000-$59,999",5+,English,new york,US,ny,single,137.0,20-40 hours per week,$50-$100 per week,d16313537a6c9c658a5202b74c5bfb87,1991
5,Bachelors degree,male,2018-04-28 21:24:42.080,2018-04-28 21:19:10,"$15,000-$24,999",4,"English,Tamil",coimbatore,IN,tn,married,332.0,20-40 hours per week,$100-$200 per week,bdc2b18278fbd1ba38075b828a145d26,1976
6,Bachelors degree,female,2018-04-28 21:18:10.319,2018-04-28 20:33:10,"$40,000-$59,999",2,English,omaha,US,ne,single,2700.0,4-8 hours per week,$10-$20 per week,20ae00766a0466370e09c21184babb4d,1994
7,"Graduate degree, Doctorate",male,2018-04-28 21:06:12.715,2018-04-28 21:04:11,"$75,000-$99,999",1,English,farmingville,US,ny,single,121.0,2-4 hours per week,$1-$5 per week,40f2b11dffeb92215447eb6bdf38e254,1984
8,High School Graduate,female,2018-04-28 20:50:27.023,2018-04-28 20:49:07,"$40,000-$59,999",2,English,minneapolis,US,mn,single,80.0,4-8 hours per week,$10-$20 per week,df783a6170b227fe6047ed1cd8a8e17a,1966
9,Bachelors degree,male,2018-04-28 20:19:54.469,2018-04-28 20:18:11,"$75,000-$99,999",3,English,warren,US,oh,engaged,103.0,1-2 hours per week,$5-$10 per week,dffe4a18679265387ab7aad9ba3e3084,1988


In [6]:
people_with_language = len([v for v in df.languages_spoken.values if v!='None' and v!=''])
people_with_language

29952

In [7]:
df.location_country.value_counts()

US    83207
IN    18792
CA     1285
GB      780
PH      377
VE      317
DE      315
IT      292
FR      276
MX      208
ZZ      193
BR      190
ES      164
AE      154
RO      146
AU      146
TR      128
GR      125
TH      122
NG      121
KE      119
RU      116
PT      116
MK      111
JP      111
ID      109
NZ       99
SG       96
NL       94
IE       89
      ...  
BS        2
TJ        2
ET        2
SR        2
DJ        2
AZ        2
LI        2
MV        2
KG        2
NA        2
ME        2
PS        1
IM        1
BM        1
GM        1
PF        1
AG        1
CD        1
SX        1
MN        1
FM        1
CN        1
AM        1
BU        1
TZ        1
AI        1
HT        1
FJ        1
MO        1
LA        1
Name: location_country, Length: 150, dtype: int64

In [8]:
df [  df.languages_spoken.str.contains(',') ].location_country.value_counts()

IN    2895
US    2547
CA     123
IT      76
DE      75
VE      73
MX      61
FR      54
GB      54
RO      37
ES      30
BR      29
PT      27
GR      24
PH      23
CO      21
AE      21
NG      20
KE      18
EG      18
DZ      18
JP      17
NL      17
RU      16
LT      16
BE      15
HK      13
MK      13
TR      12
RS      12
      ... 
IQ       2
NI       2
SK       2
ZA       2
LV       2
CZ       2
SI       2
OM       2
SG       2
HR       2
MD       2
AL       2
GH       2
NO       1
ME       1
SV       1
SR       1
CY       1
UY       1
JM       1
MT       1
JO       1
TN       1
ZW       1
EE       1
MV       1
PA       1
NA       1
TT       1
FM       1
Name: location_country, Length: 100, dtype: int64

In [9]:
# bilingual and above
len(df [ df.languages_spoken.str.contains(',') ].worker_id.value_counts())

4019

In [10]:
# unique ids of workers that answered the language question
len(df [ df.languages_spoken !='None' ].worker_id.value_counts())

16755

In [11]:
df [  df.languages_spoken.str.contains('English') ].location_country.value_counts()

US    22113
IN     4888
CA      371
GB      196
IT       92
DE       85
PH       85
VE       84
MX       71
FR       57
TH       47
KE       43
NG       42
RO       42
ES       37
PT       36
BR       33
ZZ       29
GR       28
IE       28
JP       28
ID       28
AE       27
AU       27
TR       26
CO       26
NL       25
JM       21
TT       20
HK       20
      ...  
IQ        2
VC        2
NO        2
CW        2
HR        2
NI        2
AL        2
BA        2
SI        2
GE        1
NA        1
GY        1
SV        1
TN        1
FM        1
BN        1
MV        1
JO        1
LU        1
UY        1
GD        1
SX        1
PA        1
EE        1
DJ        1
TJ        1
SR        1
CM        1
DM        1
MT        1
Name: location_country, Length: 118, dtype: int64

In [12]:
df [  df.languages_spoken.str.contains('Tamil') ].location_country.value_counts()

IN    2272
US      24
AE      12
SA       5
GB       4
CA       3
ZZ       3
LK       2
SG       2
MV       1
OM       1
PK       1
SE       1
AU       1
IT       1
KW       1
BH       1
NG       1
IQ       1
Name: location_country, dtype: int64

In [13]:
df [  df.languages_spoken.str.contains('Spanish') ].location_country.value_counts()

US    1390
VE      84
MX      61
IN      53
ES      31
CO      23
PT      14
FR      13
IT      12
DE      11
GB      10
CL      10
BR       8
CA       8
PR       8
PE       7
AR       7
EC       6
DO       6
JP       5
BE       5
GT       5
BO       5
CR       4
NL       4
ZZ       4
RO       3
DK       3
MA       3
GR       3
      ... 
BG       2
VN       2
TR       2
TW       2
QA       2
ID       2
CZ       1
AL       1
JO       1
UY       1
LB       1
JM       1
KR       1
GH       1
AU       1
SV       1
AE       1
MY       1
TT       1
NG       1
UA       1
BD       1
MK       1
RS       1
EE       1
BH       1
MD       1
PA       1
IE       1
NP       1
Name: location_country, Length: 64, dtype: int64

In [14]:
df [  df.languages_spoken.str.contains('Hindi') ].location_country.value_counts()

IN    1444
US      68
AE      18
GB       8
SE       6
NP       6
SA       5
BD       4
CA       4
KR       2
JP       2
DE       2
OM       2
NZ       2
KW       1
BH       1
PK       1
HK       1
ID       1
QA       1
AU       1
IQ       1
Name: location_country, dtype: int64

In [15]:
df [  df.languages_spoken.str.contains('French') ].location_country.value_counts()

US    312
CA     80
IN     67
FR     56
GB     24
DE     24
DZ     16
BE     16
IT     14
NG     12
BH      9
EG      9
MA      8
PT      6
BR      6
TR      5
RO      4
IE      3
RS      3
NL      3
MX      2
DK      2
IL      2
QA      2
BD      2
CH      2
ES      2
JP      1
IQ      1
AL      1
CO      1
NI      1
PL      1
AE      1
AT      1
JO      1
BG      1
FM      1
MT      1
TH      1
UA      1
TN      1
GH      1
MD      1
HU      1
CZ      1
Name: location_country, dtype: int64

In [16]:
df [  df.languages_spoken.str.contains('Malayalam') ].location_country.value_counts()

IN    603
AE     13
US     12
SA      4
OM      2
QA      2
BH      1
MV      1
BN      1
KW      1
IQ      1
GB      1
Name: location_country, dtype: int64

In [17]:
df [  df.languages_spoken.str.contains('Tegulu') ].location_country.value_counts()

IN    358
US     13
GB      4
ZZ      2
SE      2
CA      1
JP      1
ID      1
Name: location_country, dtype: int64

In [18]:
df [  df.languages_spoken.str.contains('German') ].location_country.value_counts()

US    188
DE     76
IN     20
GB     11
BE      7
IT      6
CO      6
FR      5
AT      5
MX      5
NL      5
LB      4
TR      3
DK      2
CA      2
RO      2
AR      2
SI      2
IE      2
MK      2
GT      1
PL      1
BO      1
RS      1
NO      1
AL      1
FI      1
KE      1
TH      1
LT      1
ES      1
CH      1
GR      1
Name: location_country, dtype: int64

In [19]:
df [  df.languages_spoken.str.contains('Chinese') ].location_country.value_counts()

US    256
IN     36
CA     25
HK     13
MY      9
PH      3
KR      3
IT      2
GB      2
ID      2
FR      2
MX      2
DK      2
DE      2
TW      2
AR      1
NZ      1
LV      1
ES      1
ZZ      1
JP      1
ZA      1
TH      1
BE      1
Name: location_country, dtype: int64

In [20]:
df [  df.languages_spoken.str.contains('Macedonian') ].location_country.value_counts()

MK    18
RS     3
IT     1
Name: location_country, dtype: int64

In [57]:
languages = set([l for v in df.languages_spoken.values if v!='None' 
                 for l in v.split(',') if l!=''])

In [84]:

result = []
for language in languages:
    people = len(set(df[ df.languages_spoken.str.contains(language) ].worker_id.values))
    result.append({"lang":language, "unique_workers": people})
    

df_cnt = pd.DataFrame(result).sort_values('unique_workers', ascending=False)    
print (df_cnt)    
# more than 10 people for the language

#df2 = pd.DataFrame(result).sort_values('unique_workers', ascending=False)
#enough = df2 [df2.unique_workers > 9]
#len(enough)

           lang  unique_workers
76      English           11921
0       Spanish             865
58        Tamil             865
55        Hindi             735
35       French             324
67        Malay             272
39    Malayalam             265
30       Tegulu             168
8        German             161
75      Chinese             159
15      Kannada             101
79      Italian              99
29   Portuguese              79
6      Japanese              71
52       Arabic              69
62      Marathi              64
20      Russian              61
38     Gujarati              51
5          Urdu              51
12      Bengali              49
57      Punjabi              41
40       Korean              39
51      Tagalog              37
36   Vietnamese              33
27       Polish              26
4         Greek              25
18     Romanian              24
71        Dutch              19
47      Serbian              18
46      Turkish              17
..      

In [71]:
df_cnt.unique_workers.value_counts().sort_index()

1        16
2         6
3         2
4         3
5         2
6         2
7         5
8         5
9         2
10        1
12        3
13        1
14        2
16        2
17        1
18        1
19        1
24        1
25        1
26        1
33        1
37        1
39        1
41        1
49        1
51        2
61        1
64        1
69        1
71        1
79        1
99        1
101       1
159       1
161       1
168       1
265       1
272       1
324       1
735       1
865       2
11921     1
Name: unique_workers, dtype: int64

In [13]:
len([v for v in df.languages_spoken.values if 'Spanish' in v])

1202

In [36]:
1189 / 19268 * 170000

10490.450487855513

In [14]:
def intersect(a, b):
    """ return the intersection of two lists """
    return list(set(a) & set(b))

my_langauge = ['English', 'Tamil', 'Spanish', 'Hindi', 'Malayalam', 'French', 'Telugu', 'Chinese', 'German', 'Kannada', 'Italian', 'Portuguese', 'Marathi', 'Arabic', 'Russian', 'Japanese', 'Gujarati', 'Urdu', 'Bengali', 'Punjabi', 'Korean', 'Tagalog', 'Romanian', 'Vietnamese', 'Greek', 'Polish', 'Dutch', 'Turkish', 'Hebrew', 'Swedish', 'Serbian', 'Nepali', 'Bulgarian', 'Macedonian', 'Oriya']
pavlick_langauge = ['English', 'Tamil', 'Malayalam', 'Hindi', 'Spanish', 'Telugu', 'Chinese', 'Romanian', 'Portuguese', 'Arabic', 'Kannada', 'German', 'French', 'Polish', 'Urdu', 'Tagalog', 'Marathi', 'Russian', 'Italian', 'Bengali', 'Gujarati', 'Hebrew', 'Dutch', 'Turkish', 'Vietnamese', 'Macedonian', 'Cebuano', 'Swedish', 'Bulgarian', 'Swahili', 'Hungarian', 'Catalan', 'Thai', 'Lithuanian', 'Punjabi']

intersect_language = intersect(my_langauge, pavlick_langauge)
len(intersect_language)


29

In [65]:
from scipy.stats import spearmanr

In [66]:
spearmanr(my_langauge,pavlick_langauge)



SpearmanrResult(correlation=0.057703081232493, pvalue=0.74196471283810728)