**6.1: Data Sourcing**
Testing out datasets for NYC healthcare project

In [26]:
import pandas as pd

In [27]:
print("Loading datasets...")

Loading datasets...


In [28]:
# 1. Facilities - load NYC Health Facility General Information dataset and count total rows
facilities = pd.read_csv('../02 data/original data/Health_Facility_General_Information.csv')
print(f"1. Facilities: {len(facilities):,} rows")

1. Facilities: 5,976 rows


In [29]:
# Check column names in Facility General Information dataset
print(facilities.columns.tolist())

['Facility ID', 'Facility Name', 'Short Description', 'Description', 'Facility Open Date', 'Facility Address 1', 'Facility Address 2', 'Facility City', 'Facility State', 'Facility Zip Code', 'Facility Phone Number', 'Facility Fax Number', 'Facility Website', 'Facility County Code', 'Facility County', 'Regional Office ID', 'Regional Office', 'Main Site Name', 'Main Site Facility ID', 'Operating Certificate Number', 'Operator Name', 'Operator Address 1', 'Operator Address 2', 'Operator City', 'Operator State', 'Operator Zip Code', 'Cooperator Name', 'Cooperator Address', 'Cooperator Address 2', 'Cooperator City', 'Cooperator State', 'Cooperator Zip Code', 'Ownership Type', 'Facility Latitude', 'Facility Longitude', 'Facility Location']


In [30]:
# Filter to 5 NYC facilities
nyc_counties = ['New York', 'Kings', 'Queens', 'Bronx', 'Richmond']
nyc_facilities = facilities[facilities['Facility County'].isin(nyc_counties)].copy()
print(f"NYC facilities: {len(nyc_facilities)}")

NYC facilities: 2126


In [31]:
# List files in data folder
import os
print(os.listdir('../02 data/original data'))

['PLACES_2023_census_tract.csv', 'PLACES_2022_census_tract.csv', 'PLACES_2021_census_tract.csv', 'acs_2024_demographics_nyc.csv', 'acs_2024_income_nyc.csv', 'acs_2024_insurance_nyc.csv', 'Health_Facility_General_Information.csv', 'PLACES_2024_census_tract.csv']


In [32]:
# Load PLACES 2021-2024 releases (contains 2018-2022 data)
places_2021 = pd.read_csv('../02 data/original data/PLACES_2021_census_tract.csv')
places_2022 = pd.read_csv('../02 data/original data/PLACES_2022_census_tract.csv')
places_2023 = pd.read_csv('../02 data/original data/PLACES_2023_census_tract.csv')
places_2024 = pd.read_csv('../02 data/original data/PLACES_2024_census_tract.csv')

In [33]:
# Combine all years for PLACES datasets
places = pd.concat([places_2021, places_2022, places_2023, places_2024], ignore_index=True)

In [34]:
# Check PLACES columns
print(places.columns.tolist())

['Year', 'StateAbbr', 'StateDesc', 'CountyName', 'CountyFIPS', 'LocationName', 'DataSource', 'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type', 'Data_Value', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit', 'TotalPopulation', 'Geolocation', 'LocationID', 'CategoryID', 'MeasureId', 'DataValueTypeID', 'Short_Question_Text', 'TotalPop18plus']


In [35]:
# Filter combined PLACES to NYC
nyc_counties = ['New York', 'Kings', 'Queens', 'Bronx', 'Richmond']
nyc_places = places[places['StateAbbr'] == 'NY'].copy()
nyc_places = nyc_places[nyc_places['CountyName'].isin(nyc_counties)].copy()

print(f"NYC PLACES rows: {len(nyc_places):,}")
print(f"Years: {sorted(nyc_places['Year'].unique())}")

NYC PLACES rows: 278,934
Years: [np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022)]


In [36]:
# Load demographics dataset
demographics = pd.read_csv('../02 data/original data/acs_2024_demographics_nyc.csv')
print(f"Demographics rows: {len(demographics)}")

Demographics rows: 113


In [37]:
# Load insurance dataset
insurance = pd.read_csv('../02 data/original data/acs_2024_insurance_nyc.csv')
print(f"Insurance rows: {len(insurance)}")

Insurance rows: 72


In [38]:
# Load income dataset
income = pd.read_csv('../02 data/original data/acs_2024_income_nyc.csv')
print(f"Income rows: {len(income)}")
print("\n✅ All 5 datasets loaded successfully")

Income rows: 6

✅ All 5 datasets loaded successfully


In [39]:
# Inspect facilities structure
print("=== FACILITIES ===")
print(facilities.info())

=== FACILITIES ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5976 entries, 0 to 5975
Data columns (total 36 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Facility ID                   5976 non-null   int64  
 1   Facility Name                 5976 non-null   object 
 2   Short Description             5641 non-null   object 
 3   Description                   5976 non-null   object 
 4   Facility Open Date            5976 non-null   object 
 5   Facility Address 1            5976 non-null   object 
 6   Facility Address 2            621 non-null    object 
 7   Facility City                 5976 non-null   object 
 8   Facility State                5976 non-null   object 
 9   Facility Zip Code             5976 non-null   object 
 10  Facility Phone Number         5073 non-null   float64
 11  Facility Fax Number           2529 non-null   float64
 12  Facility Website              673 non-null 

In [40]:
# Check NYC facilities missing data
print("\n=== NYC FACILITIES ===")
print(f"Total: {len(nyc_facilities)}")
print(f"Missing latitude: {nyc_facilities['Facility Latitude'].isna().sum()}")
print(f"Missing longitude: {nyc_facilities['Facility Longitude'].isna().sum()}")


=== NYC FACILITIES ===
Total: 2126
Missing latitude: 15
Missing longitude: 15


In [41]:
# Inspect PLACES combined dataset structure
print("\n=== PLACES ===")
print(f"Total NYC rows: {len(nyc_places):,}")
print(f"Unique census tracts: {nyc_places['LocationID'].nunique()}")
print(f"Unique measures: {nyc_places['Measure'].nunique()}")
print(f"Years included: {sorted(nyc_places['Year'].unique())}")
print(f"\nSample measures:")
print(nyc_places['Measure'].unique()[:10])


=== PLACES ===
Total NYC rows: 278,934
Unique census tracts: 2368
Unique measures: 67
Years included: [np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022)]

Sample measures:
['Stroke among adults aged >=18 years'
 'Obesity among adults aged >=18 years'
 'Arthritis among adults aged >=18 years'
 'Mammography use among women aged 50-74 years'
 'Depression among adults aged >=18 years'
 'Diagnosed diabetes among adults aged >=18 years'
 'Binge drinking among adults aged >=18 years'
 'All teeth lost among adults aged >=65 years'
 'Visits to dentist or dental clinic among adults aged >=18 years'
 'Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years']


In [42]:
print("Years in combined PLACES before filtering:")
print(sorted(places['Year'].unique()))

Years in combined PLACES before filtering:
[np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022)]


In [43]:
# View demographics sample
print("\n=== DEMOGRAPHICS ===")
print(demographics.head())


=== DEMOGRAPHICS ===
                            Label (Grouping) Bronx County, New York!!Estimate  \
0                                SEX AND AGE                              NaN   
1                           Total population                        1,404,779   
2                                       Male                          663,632   
3                                     Female                          741,147   
4          Sex ratio (males per 100 females)                             89.5   

  Bronx County, New York!!Margin of Error Bronx County, New York!!Percent  \
0                                     NaN                             NaN   
1                                   *****                       1,404,779   
2                                    ±150                           47.2%   
3                                    ±150                           52.8%   
4                                    ±0.1                             (X)   

  Bronx County, New York!!Pe

In [44]:
# View insurance sample
print("\n=== INSURANCE ===")
print(insurance.head())


=== INSURANCE ===
                           Label (Grouping)  \
0  Civilian noninstitutionalized population   
1                                       AGE   
2                             Under 6 years   
3                             6 to 18 years   
4                            19 to 25 years   

  Bronx County, New York!!Total!!Estimate  \
0                               1,369,100   
1                                     NaN   
2                                 104,674   
3                                 246,899   
4                                 127,576   

  Bronx County, New York!!Total!!Margin of Error  \
0                                         ±2,667   
1                                            NaN   
2                                         ±2,874   
3                                         ±3,658   
4                                         ±3,311   

  Bronx County, New York!!Insured!!Estimate  \
0                                 1,271,286   
1                   

In [45]:
# View income sample
print("\n=== INCOME ===")
print(income.head())


=== INCOME ===
           GEO_ID                       NAME               S1901_C01_001E  \
0       Geography       Geographic Area Name  Estimate!!Households!!Total   
1  0500000US36005     Bronx County, New York                       534100   
2  0500000US36047     Kings County, New York                      1009927   
3  0500000US36061  New York County, New York                       778281   
4  0500000US36081    Queens County, New York                       841003   

                       S1901_C01_001M  \
0  Margin of Error!!Households!!Total   
1                                1505   
2                                2338   
3                                4924   
4                                2217   

                                   S1901_C01_002E  \
0  Estimate!!Households!!Total!!Less than $10,000   
1                                            11.9   
2                                             6.9   
3                                             7.5   
4        

In [46]:
# Final summary
print("\n=== FINAL SUMMARY ===")
print(f"Facilities: {len(nyc_facilities):,} NYC facilities (15 missing coordinates)")
print(f"PLACES: {len(nyc_places):,} rows across {nyc_places['LocationID'].nunique():,} census tracts (2018-2022)")
print(f"Demographics: {len(demographics)} rows")
print(f"Insurance: {len(insurance)} rows")
print(f"Income: {len(income)} rows")
print("\n✅ Data loading complete")


=== FINAL SUMMARY ===
Facilities: 2,126 NYC facilities (15 missing coordinates)
PLACES: 278,934 rows across 2,368 census tracts (2018-2022)
Demographics: 113 rows
Insurance: 72 rows
Income: 6 rows

✅ Data loading complete
