# Language speakers in NYC, 2015
American Community Survey data: https://api.census.gov/data/2015/acs/acs1/groups/B16001.html

In [1]:
import cenpy as cen
import pandas as pd

from getpass import getpass



In [2]:
CENSUS_API_KEY = getpass('Enter your Census API Key: ')

Enter your Census API Key: ········


In [3]:
acs = cen.products.ACS()

In [4]:
# https://api.census.gov/data/2019/acs/acs1/variables.html > 
# https://api.census.gov/data/2019/acs/acs1/groups/B16001.html
con = cen.remote.APIConnection('ACSDT1Y2015',apikey=CENSUS_API_KEY)

# The columns to analyze
columns = [
#         'B16001_001E', # Total  
        'B16001_003E', # Spanish or Spanish Creole
        'B16001_006E', # French (incl. Patois, Cajun)
        'B16001_009E', # French Creole
        'B16001_012E', # Italian
        'B16001_015E', # Portuguese
        'B16001_018E', # German
        'B16001_021E', # Yiddish
        'B16001_024E', # Other West Germanic languages
        'B16001_027E', # Scandinavian languages
        'B16001_030E', # Greek
        'B16001_033E', # Russian 
        'B16001_036E', # Polish 
        'B16001_039E', # Serbo-Croatian 
        'B16001_042E', # Other Slavic languages 
        'B16001_045E', # Armenian 
        'B16001_048E', # Persian 
        'B16001_051E', # Gujarati 
        'B16001_054E', # Hindi 
        'B16001_057E', # Urdu 
        'B16001_060E', # Other Indic languages
        'B16001_063E', # Other Indo-European languages
        'B16001_066E', # Chinese 
        'B16001_069E', # Japanese 
        'B16001_072E', # Korean 
        'B16001_075E', # Mon-Khmer, Cambodian 
        'B16001_078E', # Hmong 
        'B16001_081E', # Thai 
        'B16001_084E', # Laotian 
        'B16001_087E', # Vietnamese
        'B16001_090E', # Other Asian 
        'B16001_093E', # Tagalog 
        'B16001_096E', # Other Pacific Island  
        'B16001_099E', # Navajo 
        'B16001_102E', # Other Native North American 
        'B16001_105E', # Hungarian 
        'B16001_108E', # Arabic 
        'B16001_111E', # Hebrew 
        'B16001_114E', # Other African
#         'B16001_117E', # Other and unspecified languages
          ]

g_unit = 'county'
g_filter = {'state':'36'}

df_lang_raw = con.query(columns, geo_unit=g_unit, geo_filter=g_filter)
df_lang_raw.head()

Unnamed: 0,B16001_003E,B16001_006E,B16001_009E,B16001_012E,B16001_015E,B16001_018E,B16001_021E,B16001_024E,B16001_027E,B16001_030E,...,B16001_093E,B16001_096E,B16001_099E,B16001_102E,B16001_105E,B16001_108E,B16001_111E,B16001_114E,state,county
0,,,,,,,,,,,...,,,,,,,,,36,83
1,524568.0,10735.0,24365.0,21610.0,6006.0,4999.0,2060.0,807.0,248.0,27388.0,...,29458.0,3857.0,0.0,988.0,2026.0,17905.0,7887.0,12656.0,36,81
2,340566.0,36896.0,2771.0,9409.0,6994.0,8446.0,1208.0,947.0,3231.0,3807.0,...,6590.0,1368.0,86.0,73.0,1243.0,8809.0,11571.0,9080.0,36,61
3,,,,,,,,,,,...,,,,,,,,,36,11
4,,,,,,,,,,,...,,,,,,,,,36,87


In [5]:
# rename columns

df_lang_raw = df_lang_raw.rename(columns={
        'B16001_003E': 'Spanish', # incl. Spanish Creole
        'B16001_006E': 'French', # incl. Patois, Cajun
        'B16001_009E': 'French Creole',
        'B16001_012E': 'Italian',
        'B16001_015E': 'Portuguese',
        'B16001_018E': 'German',
        'B16001_021E': 'Yiddish', 
        'B16001_024E': 'Other Germanic',
        'B16001_027E': 'Other Scandinavian',
        'B16001_030E': 'Greek',
        'B16001_033E': 'Russian',
        'B16001_036E': 'Polish', 
        'B16001_039E': 'Serbo-Croatian',
        'B16001_042E': 'Other Slavic', 
        'B16001_045E': 'Armenian',
        'B16001_048E': 'Persian',
        'B16001_051E': 'Gujarati',
        'B16001_054E': 'Hindi',
        'B16001_057E': 'Urdu',
        'B16001_060E': 'Other Indic',
        'B16001_063E': 'Other Indo-European',
        'B16001_066E': 'Chinese',
        'B16001_069E': 'Japanese',
        'B16001_072E': 'Korean',
        'B16001_075E': 'Cambodian', # Mon-Khmer, Cambodian
        'B16001_078E': 'Hmong',
        'B16001_081E': 'Thai',
        'B16001_084E': 'Laotian',
        'B16001_087E': 'Vietnamese',
        'B16001_090E': 'Other Asian',
        'B16001_093E': 'Tagalog', 
        'B16001_096E': 'Other Pacific Island',
        'B16001_099E': 'Navajo', 
        'B16001_102E': 'Other Native North American',
        'B16001_105E': 'Hungarian',
        'B16001_108E': 'Arabic',
        'B16001_111E': 'Hebrew',
        'B16001_114E': 'Other African'
#         'B16001_117E': 'Other'
})

df_lang_raw.columns

Index(['Spanish', 'French', 'French Creole', 'Italian', 'Portuguese', 'German',
       'Yiddish', 'Other Germanic', 'Other Scandinavian', 'Greek', 'Russian',
       'Polish', 'Serbo-Croatian', 'Other Slavic', 'Armenian', 'Persian',
       'Gujarati', 'Hindi', 'Urdu', 'Other Indic', 'Other Indo-European',
       'Chinese', 'Japanese', 'Korean', 'Cambodian', 'Hmong', 'Thai',
       'Laotian', 'Vietnamese', 'Other Asian', 'Tagalog',
       'Other Pacific Island', 'Navajo', 'Other Native North American',
       'Hungarian', 'Arabic', 'Hebrew', 'Other African', 'state', 'county'],
      dtype='object')

In [6]:
# filter to just the counties we want
# Codes: https://www.census.gov/library/reference/code-lists/ansi.html
# NY codes: https://www2.census.gov/geo/docs/reference/codes/files/st36_ny_cou.txt

df_lang = df_lang_raw[df_lang_raw['county'].isin(['061', # Manhattan
                                             '081', # Queens
                                             '047', # Brooklyn (Kings)
                                             '005', # Bronx
                                             '085' # Staten Island (Richmond County)
                                            ])]

df_lang.shape

(5, 40)

In [7]:
# create variable for county ID
                      
df_lang.loc[df_lang['county'].str.contains('061'), 'borough'] = 'Manhattan'
df_lang.loc[df_lang['county'].str.contains('081'), 'borough'] = 'Queens'
df_lang.loc[df_lang['county'].str.contains('047'), 'borough'] = 'Brooklyn'
df_lang.loc[df_lang['county'].str.contains('005'), 'borough'] = 'Bronx'
df_lang.loc[df_lang['county'].str.contains('085'), 'borough'] = 'Staten Island'  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [8]:
# convert all columns from string to numeric
df_lang = df_lang.apply(pd.to_numeric, errors='ignore')

In [9]:
# drop state column
df_lang.drop(columns=['state'])

Unnamed: 0,Spanish,French,French Creole,Italian,Portuguese,German,Yiddish,Other Germanic,Other Scandinavian,Greek,...,Tagalog,Other Pacific Island,Navajo,Other Native North American,Hungarian,Arabic,Hebrew,Other African,county,borough
1,524568,10735,24365,21610,6006,4999,2060,807,248,27388,...,29458,3857,0,988,2026,17905,7887,12656,81,Queens
2,340566,36896,2771,9409,6994,8446,1208,947,3231,3807,...,6590,1368,86,73,1243,8809,11571,9080,61,Manhattan
15,651874,15385,5295,8653,1686,950,384,47,77,2143,...,4034,212,0,62,1257,8653,1174,47372,5,Bronx
21,399775,20007,76337,20010,2813,5671,87859,1725,2203,7376,...,5277,1597,0,144,2416,29893,21219,14812,47,Brooklyn
24,46850,1739,229,11950,175,789,155,0,107,1441,...,4501,239,54,177,65,7406,1934,1834,85,Staten Island


In [10]:
# reshape data
df_melt = pd.melt(df_lang, id_vars=['county', 'borough'], value_vars=['Spanish', 'French', 'French Creole', 'Italian', 'Portuguese', 'German',
       'Yiddish', 'Other Germanic', 'Other Scandinavian', 'Greek', 'Russian',
       'Polish', 'Serbo-Croatian', 'Other Slavic', 'Armenian', 'Persian',
       'Gujarati', 'Hindi', 'Urdu', 'Other Indic', 'Other Indo-European',
       'Chinese', 'Japanese', 'Korean', 'Cambodian', 'Hmong', 'Thai',
       'Laotian', 'Vietnamese', 'Other Asian', 'Tagalog',
       'Other Pacific Island', 'Navajo', 'Other Native North American',
       'Hungarian', 'Arabic', 'Hebrew', 'Other African'], var_name='language', value_name='speaker_num')
df_melt.head()          

Unnamed: 0,county,borough,language,speaker_num
0,81,Queens,Spanish,524568
1,61,Manhattan,Spanish,340566
2,5,Bronx,Spanish,651874
3,47,Brooklyn,Spanish,399775
4,85,Staten Island,Spanish,46850


In [11]:
# define overarching groups for treemap: https://www.census.gov/topics/population/language-use/about.html
indo_eur = ['Spanish', 'French', 'French Creole', 'Italian', 'Portuguese', 'German',
       'Yiddish', 'Other Germanic', 'Other Scandinavian', 'Greek', 'Russian',
       'Polish', 'Serbo-Croatian', 'Other Slavic', 'Armenian', 'Persian',
       'Gujarati', 'Hindi', 'Urdu', 'Other Indic', 'Other Indo-European']

asia_pacific = ['Chinese', 'Japanese', 'Korean', 'Cambodian', 'Hmong', 'Thai',
       'Laotian', 'Vietnamese', 'Other Asian', 'Tagalog',
       'Other Pacific Island']

other = ['Navajo', 'Other Native North American',
       'Hungarian', 'Arabic', 'Hebrew', 'Other African']


df_melt.loc[df_melt.language.isin(indo_eur),'lang_family']='Indo-European'
df_melt.loc[df_melt.language.isin(asia_pacific),'lang_family']='Asian and Pacific Island'
df_melt.loc[df_melt.language.isin(other),'lang_family']='Other'
df_melt.head()

Unnamed: 0,county,borough,language,speaker_num,lang_family
0,81,Queens,Spanish,524568,Indo-European
1,61,Manhattan,Spanish,340566,Indo-European
2,5,Bronx,Spanish,651874,Indo-European
3,47,Brooklyn,Spanish,399775,Indo-European
4,85,Staten Island,Spanish,46850,Indo-European


In [12]:
df_melt.to_csv('data/languages-2015.csv', index=False)