# Language speakers in NYC, 2005
American Community Survey data: https://api.census.gov/data/2005/acs/acs1/groups/B16001.html

In [1]:
import cenpy as cen
import pandas as pd

from getpass import getpass



In [2]:
CENSUS_API_KEY = getpass('Enter your Census API Key: ')

Enter your Census API Key: ········


In [3]:
acs = cen.products.ACS()

In [4]:
con = cen.remote.APIConnection('ACSDT1Y2005',apikey=CENSUS_API_KEY)

# The columns to analyze
columns = [
#            'B16001_001E', # Total  
           'B16001_003E', # Spanish or Spanish Creole
           'B16001_006E', # French (incl. Patois, Cajun)
           'B16001_009E', # French Creole
           'B16001_012E', # Italian
           'B16001_015E', # Portuguese or Portuguese Creole
           'B16001_018E', # German
           'B16001_021E', # Yiddish
           'B16001_024E', # Other West Germanic languages
           'B16001_027E', # Scandinavian languages
           'B16001_030E', # Greek
           'B16001_033E', # Russian
           'B16001_036E', # Polish
           'B16001_039E', # Serbo-Croatian
           'B16001_042E', # Other Slavic languages
           'B16001_045E', # Armenian
           'B16001_048E', # Persian
           'B16001_051E', # Gujarati
           'B16001_054E', # Hindi
           'B16001_057E', # Urdu
           'B16001_060E', # Other Indic languages
           'B16001_063E', # Other Indo-European languages
           'B16001_066E', # Chinese
           'B16001_069E', # Japanese
           'B16001_072E', # Korean
           'B16001_075E', # Mon-Khmer, Cambodian
           'B16001_078E', # Hmong
           'B16001_081E', # Thai
           'B16001_084E', # Laotian
           'B16001_087E', # Vietnamese
           'B16001_090E', # Other Asian languages
           'B16001_093E', # Tagalog
           'B16001_096E', # Other Pacific Island languages
           'B16001_099E', # Navajo
           'B16001_102E', # Other Native North American languages
           'B16001_105E', # Hungarian
           'B16001_108E', # Arabic
           'B16001_111E', # Hebrew
           'B16001_114E', # African languages
#           'B16001_117E', # Other and unspecified languages           
          ]

g_unit = 'county'
g_filter = {'state':'36'}

df_lang_raw = con.query(columns, geo_unit=g_unit, geo_filter=g_filter)
df_lang_raw.head()

Unnamed: 0,B16001_003E,B16001_006E,B16001_009E,B16001_012E,B16001_015E,B16001_018E,B16001_021E,B16001_024E,B16001_027E,B16001_030E,...,B16001_093E,B16001_096E,B16001_099E,B16001_102E,B16001_105E,B16001_108E,B16001_111E,B16001_114E,state,county
0,,,,,,,,,,,...,,,,,,,,,36,1
1,556951.0,12145.0,4119.0,13098.0,482.0,1938.0,425.0,608.0,67.0,2221.0,...,4459.0,131.0,0.0,604.0,1231.0,4398.0,309.0,26786.0,36,5
2,,,,,,,,,,,...,,,,,,,,,36,7
3,,,,,,,,,,,...,,,,,,,,,36,9
4,,,,,,,,,,,...,,,,,,,,,36,11


In [5]:
# rename columns
df_lang_raw = df_lang_raw.rename(columns={
#     'B16001_001E': 'total',
    'B16001_003E': 'Spanish',
    'B16001_006E': 'French',
    'B16001_009E': 'French Creole',
    'B16001_012E': 'Italian',
    'B16001_015E': 'Portuguese',
    'B16001_018E': 'German',
    'B16001_021E': 'Yiddish',
    'B16001_024E': 'Other Germanic',
    'B16001_027E': 'Other Scandinavian',
    'B16001_030E': 'Greek',
    'B16001_033E': 'Russian',
    'B16001_036E': 'Polish',
    'B16001_039E': 'Serbo-Croatian',
    'B16001_042E': 'Other Slavic',
    'B16001_045E': 'Armenian',
    'B16001_048E': 'Persian',
    'B16001_051E': 'Gujarati',
    'B16001_054E': 'Hindi',
    'B16001_057E': 'Urdu',
    'B16001_060E': 'Other Indic',
    'B16001_063E': 'Other Indo-European',
    'B16001_066E': 'Chinese',
    'B16001_069E': 'Japanese',
    'B16001_072E': 'Korean',
    'B16001_075E': 'Cambodian',
    'B16001_078E': 'Hmong',
    'B16001_081E': 'Thai',
    'B16001_084E': 'Laotian',
    'B16001_087E': 'Vietnamese',
    'B16001_090E': 'Other Asian',
    'B16001_093E': 'Tagalog',
    'B16001_096E': 'Other Pacific Island',
    'B16001_099E': 'Navajo',
    'B16001_102E': 'Other Native American',
    'B16001_105E': 'Hungarian',
    'B16001_108E': 'Arabic',
    'B16001_111E': 'Hebrew',
    'B16001_114E': 'Other African'
})

df_lang_raw.columns

Index(['Spanish', 'French', 'French Creole', 'Italian', 'Portuguese', 'German',
       'Yiddish', 'Other Germanic', 'Other Scandinavian', 'Greek', 'Russian',
       'Polish', 'Serbo-Croatian', 'Other Slavic', 'Armenian', 'Persian',
       'Gujarati', 'Hindi', 'Urdu', 'Other Indic', 'Other Indo-European',
       'Chinese', 'Japanese', 'Korean', 'Cambodian', 'Hmong', 'Thai',
       'Laotian', 'Vietnamese', 'Other Asian', 'Tagalog',
       'Other Pacific Island', 'Navajo', 'Other Native American', 'Hungarian',
       'Arabic', 'Hebrew', 'Other African', 'state', 'county'],
      dtype='object')

In [6]:
# filter to just the counties we want
# Codes: https://www.census.gov/library/reference/code-lists/ansi.html
# NY codes: https://www2.census.gov/geo/docs/reference/codes/files/st36_ny_cou.txt

df_lang = df_lang_raw[df_lang_raw['county'].isin(['061', # Manhattan
                                             '081', # Queens
                                             '047', # Brooklyn (Kings)
                                             '005', # Bronx
                                             '085' # Staten Island (Richmond County)
                                            ])]

df_lang.shape

(5, 40)

In [7]:
# create variable for county ID          
df_lang.loc[df_lang['county'].str.contains('061'), 'borough'] = 'Manhattan'
df_lang.loc[df_lang['county'].str.contains('081'), 'borough'] = 'Queens'
df_lang.loc[df_lang['county'].str.contains('047'), 'borough'] = 'Brooklyn'
df_lang.loc[df_lang['county'].str.contains('005'), 'borough'] = 'Bronx'
df_lang.loc[df_lang['county'].str.contains('085'), 'borough'] = 'Staten Island'  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [8]:
# drop state column
df_lang.drop(columns=['state'])

Unnamed: 0,Spanish,French,French Creole,Italian,Portuguese,German,Yiddish,Other Germanic,Other Scandinavian,Greek,...,Tagalog,Other Pacific Island,Navajo,Other Native American,Hungarian,Arabic,Hebrew,Other African,county,borough
1,556951.0,12145.0,4119.0,13098.0,482.0,1938.0,425.0,608.0,67.0,2221.0,...,4459.0,131.0,0.0,604.0,1231.0,4398.0,309.0,26786.0,5,Bronx
11,398837.0,31583.0,61519.0,31917.0,1812.0,4118.0,72562.0,587.0,782.0,4456.0,...,4357.0,348.0,0.0,311.0,3359.0,27351.0,22088.0,9193.0,47,Brooklyn
15,321861.0,30390.0,1353.0,8995.0,4257.0,10484.0,1550.0,2134.0,2383.0,3762.0,...,5363.0,517.0,0.0,135.0,1628.0,6327.0,8165.0,4225.0,61,Manhattan
23,487565.0,17687.0,20904.0,33995.0,7007.0,6125.0,2473.0,1441.0,201.0,35809.0,...,30530.0,2312.0,0.0,1175.0,3222.0,12865.0,10520.0,6620.0,81,Queens
25,,,,,,,,,,,...,,,,,,,,,85,Staten Island


In [9]:
# convert all columns from string to numeric
df_lang = df_lang.apply(pd.to_numeric, errors='ignore')

In [10]:
# reshape data
df_melt = pd.melt(df_lang, id_vars=['county', 'borough'], value_vars=['Spanish', 'French', 'French Creole', 'Italian', 'Portuguese', 'German',
       'Yiddish', 'Other Germanic', 'Other Scandinavian', 'Greek', 'Russian',
       'Polish', 'Serbo-Croatian', 'Other Slavic', 'Armenian', 'Persian',
       'Gujarati', 'Hindi', 'Urdu', 'Other Indic', 'Other Indo-European',
       'Chinese', 'Japanese', 'Korean', 'Cambodian', 'Hmong', 'Thai',
       'Laotian', 'Vietnamese', 'Other Asian', 'Tagalog',
       'Other Pacific Island', 'Navajo', 'Other Native American', 'Hungarian',
       'Arabic', 'Hebrew', 'Other African'], var_name='language', value_name='speaker_num')
df_melt.head() 

Unnamed: 0,county,borough,language,speaker_num
0,5,Bronx,Spanish,556951.0
1,47,Brooklyn,Spanish,398837.0
2,61,Manhattan,Spanish,321861.0
3,81,Queens,Spanish,487565.0
4,85,Staten Island,Spanish,


In [11]:
# define overarching groups for treemap: https://www.census.gov/topics/population/language-use/about.html

indo_eur = ['Spanish', 'French', 'French Creole', 'Italian', 'Portuguese', 'German',
       'Yiddish', 'Other Germanic', 'Other Scandinavian', 'Greek', 'Russian',
       'Polish', 'Serbo-Croatian', 'Other Slavic', 'Armenian', 'Persian',
       'Gujarati', 'Hindi', 'Urdu', 'Other Indic', 'Other Indo-European']

asia_pacific = ['Chinese', 'Japanese', 'Korean', 'Cambodian', 'Hmong', 'Thai',
       'Laotian', 'Vietnamese', 'Other Asian', 'Tagalog',
       'Other Pacific Island']

other = ['Navajo', 'Other Native American', 'Hungarian',
       'Arabic', 'Hebrew', 'Other African']


df_melt.loc[df_melt.language.isin(indo_eur),'lang_family']='Indo-European'
df_melt.loc[df_melt.language.isin(asia_pacific),'lang_family']='Asian and Pacific Island'
df_melt.loc[df_melt.language.isin(other),'lang_family']='Other'
df_melt.head()

Unnamed: 0,county,borough,language,speaker_num,lang_family
0,5,Bronx,Spanish,556951.0,Indo-European
1,47,Brooklyn,Spanish,398837.0,Indo-European
2,61,Manhattan,Spanish,321861.0,Indo-European
3,81,Queens,Spanish,487565.0,Indo-European
4,85,Staten Island,Spanish,,Indo-European


In [12]:
df_melt.to_csv('data/languages-2005.csv', index=False)