In [6]:
pip install rasterio geopandas numpy matplotlib

Note: you may need to restart the kernel to use updated packages.


In [3]:
import rasterio
from rasterio.mask import mask
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import mapping
import requests
import zipfile
import io
import os


noise_data_folder = r"C:\Users\Elias\Final Project\Noise data"
master_data_path = r"C:\Users\Elias\Final Project\Cleaned output data files\master_data.csv"

# Noise file paths
road_path = os.path.join(noise_data_folder, "CA_road_noise_2020.tif")
rail_path = os.path.join(noise_data_folder, "CA_rail_noise_2020.tif")
aviation_path = os.path.join(noise_data_folder, "CA_aviation_noise_2020.tif")


master = pd.read_csv(master_data_path)
print(f"Loaded master data: {master.shape[0]} rows, {master.shape[1]} columns")

# Get unique ZIP codes
unique_zips = master['ZIP_Code_of_Residence'].dropna().unique()
unique_zips = [str(int(z)).zfill(5) for z in unique_zips if pd.notna(z)]
print(f"Found {len(unique_zips)} unique ZIP codes")
print(f"Sample ZIPs: {unique_zips[:5]}")


# 2022 ZIP Code Boundaries from Census Bureau
url = "https://www2.census.gov/geo/tiger/TIGER2022/ZCTA520/tl_2022_us_zcta520.zip"
print("Downloading 2022 ZIP code boundaries from US Census Bureau...")
print("Source: TIGER/Line Shapefiles - ZCTA 2022")

# Download and extract
response = requests.get(url, timeout=180)
zip_extract_path = os.path.join(noise_data_folder, "zip_boundaries_2022")
os.makedirs(zip_extract_path, exist_ok=True)

with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall(zip_extract_path)
print(f"Downloaded and extracted to: {zip_extract_path}")

# Loading shapefile
zip_gdf = gpd.read_file(os.path.join(zip_extract_path, "tl_2022_us_zcta520.shp"))
print(f"Loaded {len(zip_gdf)} ZIP codes nationwide (2022)")

# Trying different possible column names for ZIP code
zip_code_column = None
for col in ['ZCTA5CE20', 'ZCTA5CE22', 'ZCTA5CE', 'ZCTA5']:
    if col in zip_gdf.columns:
        zip_code_column = col
        break

if zip_code_column is None:
    print("ERROR: Could not find ZIP code column in shapefile")
    print(f"Available columns: {list(zip_gdf.columns)}")
    exit()
    
print(f"Using column '{zip_code_column}' for ZIP codes")

# Filter to only ZIP codes in master data
study_zips = zip_gdf[zip_gdf[zip_code_column].isin(unique_zips)].copy()
print(f"Filtered to {len(study_zips)} ZIP codes from study")

# Reproject to match noise data coordinate system
study_zips = study_zips.to_crs("ESRI:102039")
print("Reprojected to ESRI:102039 (matches noise data)")

# Extracting noise data by zip code

def extract_noise_for_zips(noise_path, zip_gdf, noise_type, zip_col):
    """
    Extract simplified noise statistics for each ZIP code
    """
   
    
    results = []
    
    with rasterio.open(noise_path) as src:
        nodata = src.nodata
        
        for idx, row in zip_gdf.iterrows():
            zipcode = row[zip_col]
            geometry = [mapping(row.geometry)]
            
            try:
                # Clip raster to ZIP code boundary
                out_image, out_transform = mask(src, geometry, crop=True, nodata=nodata)
                data = out_image[0]
                
                # Extract valid noise values
                valid_data = data[data != nodata]
                
                if len(valid_data) > 0:
                    mean_noise = float(valid_data.mean())
                    n_pixels = len(valid_data)
                else:
                    mean_noise = np.nan
                    n_pixels = 0
                
                results.append({
                    'ZIP': zipcode,
                    f'{noise_type}_mean_db': mean_noise,
                    f'{noise_type}_n_pixels': n_pixels
                })
                
            except Exception as e:
                results.append({
                    'ZIP': zipcode,
                    f'{noise_type}_mean_db': np.nan,
                    f'{noise_type}_n_pixels': 0
                })
            
            if (idx + 1) % 20 == 0:
                print(f"  Processed {idx + 1}/{len(zip_gdf)} ZIP codes")
    
    return pd.DataFrame(results)

# Extract noise for each transportation source
road_df = extract_noise_for_zips(road_path, study_zips, "road", zip_code_column)
rail_df = extract_noise_for_zips(rail_path, study_zips, "rail", zip_code_column)
aviation_df = extract_noise_for_zips(aviation_path, study_zips, "aviation", zip_code_column)


# Merge all three noise sources
noise_data = road_df.merge(rail_df, on='ZIP', how='outer')
noise_data = noise_data.merge(aviation_df, on='ZIP', how='outer')

# Function to combine decibel values properly
def combine_decibels(road, rail, aviation):
    """
    Combine decibel values from multiple sources
    Converts to energy, sum, converts back to dB
    """
    # Replace NaN with quiet background noise (30 dB)
    road = np.where(np.isnan(road), 30, road)
    rail = np.where(np.isnan(rail), 30, rail)
    aviation = np.where(np.isnan(aviation), 30, aviation)
    
    # Convert dB to energy (linear scale)
    road_energy = 10 ** (road / 10)
    rail_energy = 10 ** (rail / 10)
    aviation_energy = 10 ** (aviation / 10)
    
    # Sum energies and convert back to dB
    total_energy = road_energy + rail_energy + aviation_energy
    combined = 10 * np.log10(total_energy)
    
    return combined

# Calculate combined noise
noise_data['combined_noise_mean_db'] = combine_decibels(
    noise_data['road_mean_db'].values,
    noise_data['rail_mean_db'].values,
    noise_data['aviation_mean_db'].values
)


# Save just the noise data to CSV
noise_output_path = os.path.join(noise_data_folder, "zipcode_noise_data_2022.csv")
noise_data.to_csv(noise_output_path, index=False)
print(f"Saved noise data to CSV: {noise_output_path}")
print(f"Total ZIP codes with noise data: {len(noise_data)}")


# Ensure ZIP formats match
master['ZIP_Code_of_Residence'] = master['ZIP_Code_of_Residence'].astype(str).str.zfill(5)

# Create a simplified noise dataframe with only zip code and combined noise
simplified_noise_data = noise_data[['ZIP', 'combined_noise_mean_db']].copy()

# Merging only the combined noise column with master data
master_with_noise = master.merge(
    simplified_noise_data, 
    left_on='ZIP_Code_of_Residence', 
    right_on='ZIP', 
    how='left'
)

# Drop the extra ZIP columnfrom noise_data since we already have ZIP_Code_of_Residence
master_with_noise = master_with_noise.drop(columns=['ZIP'])

print(f"Master data rows: {len(master)}")
print(f"After merge: {len(master_with_noise)}")
print(f"Rows with noise data: {master_with_noise['combined_noise_mean_db'].notna().sum()}")

# Save merged data
merged_output = r"C:\Users\Elias\Final Project\Cleaned output data files\master_data_with_noise_2022.csv"
master_with_noise.to_csv(merged_output, index=False)
print(f"\nSaved merged data: {merged_output}")

Loaded master data: 243 rows, 23 columns
Found 243 unique ZIP codes
Sample ZIPs: ['90001', '90002', '90003', '90004', '90005']
Downloading 2022 ZIP code boundaries from US Census Bureau...
Source: TIGER/Line Shapefiles - ZCTA 2022
Downloaded and extracted to: C:\Users\Elias\Final Project\Noise data\zip_boundaries_2022
Loaded 33791 ZIP codes nationwide (2022)
Using column 'ZCTA5CE20' for ZIP codes
Filtered to 243 ZIP codes from study
Reprojected to ESRI:102039 (matches noise data)
  Processed 28260/243 ZIP codes
  Processed 28460/243 ZIP codes
  Processed 28640/243 ZIP codes
  Processed 28780/243 ZIP codes
  Processed 28980/243 ZIP codes
  Processed 29060/243 ZIP codes
  Processed 29080/243 ZIP codes
  Processed 29140/243 ZIP codes
  Processed 29420/243 ZIP codes
  Processed 29560/243 ZIP codes
  Processed 28260/243 ZIP codes
  Processed 28460/243 ZIP codes
  Processed 28640/243 ZIP codes
  Processed 28780/243 ZIP codes
  Processed 28980/243 ZIP codes
  Processed 29060/243 ZIP codes
  P

In [5]:
master_with_noise.shape

(243, 24)