# Language speakers in NYC, 2019
American Community Survey data: https://api.census.gov/data/2019/acs/acs1/groups/B16001.html

In [1]:
import cenpy as cen
import pandas as pd

from getpass import getpass



In [2]:
CENSUS_API_KEY = getpass('Enter your Census API Key: ')

Enter your Census API Key: ········


In [3]:
acs = cen.products.ACS()

In [4]:
# https://api.census.gov/data/2019/acs/acs1/variables.html > 
# https://api.census.gov/data/2019/acs/acs1/groups/B16001.html
con = cen.remote.APIConnection('ACSDT1Y2019',apikey=CENSUS_API_KEY)

# The columns to analyze
columns = [
#         'B16001_001E', # Total  
        'B16001_003E', # Spanish or Spanish Creole
        'B16001_006E', # French (incl. Patois, Cajun)
        'B16001_009E', # Haitian
        'B16001_012E', # Italian
        'B16001_015E', # Portuguese
        'B16001_018E', # German
        'B16001_021E', # Yiddish, Pennsylvania Dutch or other West Germanic languages
        'B16001_024E', # Greek
        'B16001_027E', # Russian
        'B16001_030E', # Polish
        'B16001_033E', # Serbo-Croatian
        'B16001_036E', # Ukrainian or other Slavic languages
        'B16001_039E', # Armenian
        'B16001_042E', # Persian (incl. Farsi, Dari)
        'B16001_045E', # Gujarati
        'B16001_048E', # Hindi
        'B16001_051E', # Urdu
        'B16001_054E', # Punjabi
        'B16001_057E', # Bengali
        'B16001_060E', # Nepali, Marathi, or other Indic languages
        'B16001_063E', # Other Indo-European languages
        'B16001_066E', # Telugu
        'B16001_069E', # Tamil
        'B16001_072E', # Malayalam, Kannada, or other Dravidian languages
        'B16001_075E', # Chinese (incl. Mandarin, Cantonese)
        'B16001_078E', # Japanese
        'B16001_081E', # Korean
        'B16001_084E', # Hmong
        'B16001_087E', # Vietnamese
        'B16001_090E', # Khmer
        'B16001_093E', # Thai, Lao, or other Tai-Kadai languages
        'B16001_096E', # Other languages of Asia
        'B16001_099E', # Tagalog (incl. Filipino)
        'B16001_102E', # Ilocano, Samoan, Hawaiian, or other Austronesian languages
        'B16001_105E', # Arabic
        'B16001_108E', # Hebrew
        'B16001_111E', # Amharic, Somali, or other Afro-Asiatic languages
        'B16001_114E', # Yoruba, Twi, Igbo, or other languages of Western Africa
        'B16001_117E', # Swahili or other languages of Central, Eastern, and Southern Africa
        'B16001_120E', # Navajo
        'B16001_123E', # Other Native languages of North America
#         'B16001_126E', # Other and unspecified languages
          ]

g_unit = 'county'
g_filter = {'state':'36'}

df_lang_raw = con.query(columns, geo_unit=g_unit, geo_filter=g_filter)
df_lang_raw.head()

Unnamed: 0,B16001_003E,B16001_006E,B16001_009E,B16001_012E,B16001_015E,B16001_018E,B16001_021E,B16001_024E,B16001_027E,B16001_030E,...,B16001_102E,B16001_105E,B16001_108E,B16001_111E,B16001_114E,B16001_117E,B16001_120E,B16001_123E,state,county
0,631573.0,14330.0,6371.0,4581.0,866.0,304.0,889.0,3353.0,3403.0,324.0,...,334.0,8748.0,1133.0,5709.0,42001.0,443.0,0.0,110.0,36,5
1,,,,,,,,,,,...,,,,,,,,,36,67
2,,,,,,,,,,,...,,,,,,,,,36,19
3,,,,,,,,,,,...,,,,,,,,,36,79
4,,,,,,,,,,,...,,,,,,,,,36,69


In [5]:
# rename columns
df_lang_raw = df_lang_raw.rename(columns={
        'B16001_003E': 'Spanish', # incl. Spanish Creole
        'B16001_006E': 'French', # incl. Patois, Cajun
        'B16001_009E': 'Haitian',
        'B16001_012E': 'Italian',
        'B16001_015E': 'Portuguese',
        'B16001_018E': 'German',
        'B16001_021E': 'Other Germanic', # Yiddish, Pennsylvania Dutch or other West Germanic languages -- naming this one as Germanic to match with other dataset
        'B16001_024E': 'Greek',
        'B16001_027E': 'Russian',
        'B16001_030E': 'Polish',
        'B16001_033E': 'Serbo-Croatian',
        'B16001_036E': 'Other Slavic', # incl. Ukrainian
        'B16001_039E': 'Armenian',
        'B16001_042E': 'Persian', # (incl. Farsi, Dari)
        'B16001_045E': 'Gujarati',
        'B16001_048E': 'Hindi',
        'B16001_051E': 'Urdu',
        'B16001_054E': 'Punjabi',
        'B16001_057E': 'Bengali',
        'B16001_060E': 'Other Indic', # incl. Nepali, Marathi
        'B16001_063E': 'Other Indo-European',
        'B16001_066E': 'Telugu',
        'B16001_069E': 'Tamil',
        'B16001_072E': 'Other Dravidian', # Malayalam, Kannada
        'B16001_075E': 'Chinese', # (incl. Mandarin, Cantonese)
        'B16001_078E': 'Japanese',
        'B16001_081E': 'Korean',
        'B16001_084E': 'Hmong',
        'B16001_087E': 'Vietnamese',
        'B16001_090E': 'Khmer',
        'B16001_093E': 'Other Tai-Kadai', # incl. Thai, Lao
        'B16001_096E': 'Other Asian',
        'B16001_099E': 'Tagalog', # (incl. Filipino)
        'B16001_102E': 'Other Austronesian', # incl. Ilocano, Samoan, Hawaiian
        'B16001_105E': 'Arabic',
        'B16001_108E': 'Hebrew',
        'B16001_111E': 'Other Afro-Asiatic', # incl. Amharic, Somali
        'B16001_114E': 'Other West African', # ncl. Yoruba, Twi, Igbo
        'B16001_117E': 'Other African', # incl. Swahili or other languages of Central, Eastern, and Southern Africa
        'B16001_120E': 'Navajo',
        'B16001_123E': 'Other Native American'
#         'B16001_126E': 'Other'
})

df_lang_raw.columns

Index(['Spanish', 'French', 'Haitian', 'Italian', 'Portuguese', 'German',
       'Other Germanic', 'Greek', 'Russian', 'Polish', 'Serbo-Croatian',
       'Other Slavic', 'Armenian', 'Persian', 'Gujarati', 'Hindi', 'Urdu',
       'Punjabi', 'Bengali', 'Other Indic', 'Other Indo-European', 'Telugu',
       'Tamil', 'Other Dravidian', 'Chinese', 'Japanese', 'Korean', 'Hmong',
       'Vietnamese', 'Khmer', 'Other Tai-Kadai', 'Other Asian', 'Tagalog',
       'Other Austronesian', 'Arabic', 'Hebrew', 'Other Afro-Asiatic',
       'Other West African', 'Other African', 'Navajo',
       'Other Native American', 'state', 'county'],
      dtype='object')

In [6]:
# filter to just the counties we want
# Codes: https://www.census.gov/library/reference/code-lists/ansi.html
# NY codes: https://www2.census.gov/geo/docs/reference/codes/files/st36_ny_cou.txt

df_lang = df_lang_raw[df_lang_raw['county'].isin(['061', # Manhattan
                                             '081', # Queens
                                             '047', # Brooklyn (Kings)
                                             '005', # Bronx
                                             '085' # Staten Island (Richmond County)
                                            ])]

df_lang.shape

(5, 43)

In [7]:
# create variable for county ID
                      
df_lang.loc[df_lang['county'].str.contains('061'), 'borough'] = 'Manhattan'
df_lang.loc[df_lang['county'].str.contains('081'), 'borough'] = 'Queens'
df_lang.loc[df_lang['county'].str.contains('047'), 'borough'] = 'Brooklyn'
df_lang.loc[df_lang['county'].str.contains('005'), 'borough'] = 'Bronx'
df_lang.loc[df_lang['county'].str.contains('085'), 'borough'] = 'Staten Island'  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [8]:
# convert all columns from string to numeric
df_lang = df_lang.apply(pd.to_numeric, errors='ignore')

In [9]:
# drop state column
df_lang.drop(columns=['state'])

Unnamed: 0,Spanish,French,Haitian,Italian,Portuguese,German,Other Germanic,Greek,Russian,Polish,...,Other Austronesian,Arabic,Hebrew,Other Afro-Asiatic,Other West African,Other African,Navajo,Other Native American,county,borough
0,631573,14330,6371,4581,866,304,889,3353,3403,324,...,334,8748,1133,5709,42001,443,0,110,5,Bronx
7,363599,20946,61133,16555,2884,4220,96494,6543,123340,13089,...,899,30680,17872,923,11209,1361,0,0,47,Brooklyn
10,45789,2263,381,10211,297,183,619,1944,17677,4250,...,0,12313,1359,0,4413,27,0,245,85,Staten Island
25,500857,10499,31513,17110,4626,5196,3626,20243,33656,24661,...,3558,11544,6667,801,6640,890,0,306,81,Queens
26,321998,23745,2762,7801,10671,7942,4962,5973,15043,2413,...,929,7912,9976,2193,5004,378,0,14,61,Manhattan


In [10]:
# reshape data
df_melt = pd.melt(df_lang, id_vars=['county', 'borough'], value_vars=['Spanish', 'French', 'Haitian', 'Italian', 'Portuguese', 'German',
       'Other Germanic', 'Greek', 'Russian', 'Polish', 'Serbo-Croatian',
       'Other Slavic', 'Armenian', 'Persian', 'Gujarati', 'Hindi', 'Urdu',
       'Punjabi', 'Bengali', 'Other Indic', 'Other Indo-European', 'Telugu',
       'Tamil', 'Other Dravidian', 'Chinese', 'Japanese', 'Korean', 'Hmong',
       'Vietnamese', 'Khmer', 'Other Tai-Kadai', 'Other Asian', 'Tagalog',
       'Other Austronesian', 'Arabic', 'Hebrew', 'Other Afro-Asiatic',
       'Other West African', 'Other African', 'Navajo',
       'Other Native American'], var_name='language', value_name='speaker_num')
df_melt.head()          

Unnamed: 0,county,borough,language,speaker_num
0,5,Bronx,Spanish,631573
1,47,Brooklyn,Spanish,363599
2,85,Staten Island,Spanish,45789
3,81,Queens,Spanish,500857
4,61,Manhattan,Spanish,321998


In [11]:
# define overarching groups for treemap: https://www.census.gov/topics/population/language-use/about.html
indo_eur = ['Spanish', 'French', 'Haitian', 'Italian', 'Portuguese', 'German',
       'Other West Germanic', 'Greek', 'Russian', 'Polish',
       'Serbo-Croatian', 'Other Slavic', 'Armenian', 'Persian',
       'Gujarati', 'Hindi', 'Urdu', 'Punjabi', 'Bengali', 'Other Indic',
       'Other Indo-European', 'Telugu', 'Tamil', 'Other Dravidian']

asia_pacific = ['Chinese', 'Japanese', 'Korean', 'Hmong', 'Vietnamese', 'Khmer',
       'Other Tai-Kadai', 'Other Asian', 'Tagalog', 'Other Austronesian']

other = ['Arabic', 'Hebrew', 'Other Afro-Asiatic', 'Other West African',
       'Other African', 'Navajo', 'Other Native American']


df_melt.loc[df_melt.language.isin(indo_eur),'lang_family']='Indo-European'
df_melt.loc[df_melt.language.isin(asia_pacific),'lang_family']='Asian and Pacific Island'
df_melt.loc[df_melt.language.isin(other),'lang_family']='Other'
df_melt.head()

Unnamed: 0,county,borough,language,speaker_num,lang_family
0,5,Bronx,Spanish,631573,Indo-European
1,47,Brooklyn,Spanish,363599,Indo-European
2,85,Staten Island,Spanish,45789,Indo-European
3,81,Queens,Spanish,500857,Indo-European
4,61,Manhattan,Spanish,321998,Indo-European


In [12]:
df_melt.to_csv('data/languages-2019.csv', index=False)