# In [None]:
import os
import zipfile

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point

# Input files: the field-results CSV and the California counties shapefile.
# Replace both with the paths that apply to your environment.
water_dataset_path = "/kaggle/input/flu-dataset/field_results.csv"
counties_shapefile_path = "/kaggle/input/flu-dataset/CA_Counties_TIGER2016.shp"

# Where the zip archive of all heatmaps is written, and what it is called.
zip_file_path = "/kaggle/working"
zip_file_name = "heatmaps_final_results.zip"

# Folder that receives the individual heatmap images; created up front so
# later saves cannot fail on a missing directory.
# Use the correct path for your Kaggle notebook.
output_dir = "/kaggle/working/heatmaps_final_results"
os.makedirs(output_dir, exist_ok=True)


# Load the water contamination dataset. low_memory=False makes pandas read
# the file in one pass instead of inferring column dtypes chunk by chunk.
water_data = pd.read_csv(water_dataset_path, low_memory=False)

# Parse the sample dates; values that cannot be parsed become NaT so they
# are removed by the dropna below.
water_data['sample_date'] = pd.to_datetime(water_data['sample_date'], errors='coerce')

# Keep only rows that have a date and both coordinates, sampled in 2001 or
# later. The explicit .copy() makes the result an independent DataFrame, so
# adding columns to it afterwards does not raise SettingWithCopyWarning.
water_data = water_data.dropna(subset=['sample_date', 'latitude', 'longitude'])
water_data = water_data[water_data['sample_date'].dt.year >= 2001].copy()


# Build one point geometry per sample from its longitude/latitude pair.
# points_from_xy creates the whole column in a single vectorized call
# instead of constructing a Point row by row.
water_data['geometry'] = gpd.points_from_xy(water_data['longitude'], water_data['latitude'])

# Convert the DataFrame to a GeoDataFrame and declare its CRS.
# Assuming the coordinates are WGS84 latitude/longitude (EPSG:4326).
gdf_water = gpd.GeoDataFrame(water_data, geometry='geometry', crs='EPSG:4326')

# Load the California counties shapefile and reproject it into the CRS of
# the water samples so both layers share one coordinate system.
ca_counties = gpd.read_file(counties_shapefile_path)
ca_counties = ca_counties.to_crs(gdf_water.crs)

# Spatially join each water sample to the county polygon it intersects.
# `predicate` is the current name of the keyword formerly called `op`;
# newer GeoPandas releases no longer accept `op=`.
merged_data = gpd.sjoin(gdf_water, ca_counties, how='inner', predicate='intersects')

# Cross-check county names: collect the distinct names on each side, then
# list the ones that have no counterpart on the other side.
shapefile_county_names = ca_counties['NAME'].unique()
water_county_names = gdf_water['county_name'].unique()

# Shapefile counties that never occur in the water samples.
mismatches_shapefile = [
    county
    for county in shapefile_county_names
    if not (county in water_county_names)
]

# Water-sample counties that are missing from the shapefile.
mismatches_water_data = [
    county
    for county in water_county_names
    if not (county in shapefile_county_names)
]

# Handle mismatches (if any) - This step will be specific to the mismatches you find

# Heatmap generation: for every parameter and every year from 2001 onwards,
# draw the counties coloured by that year's mean 'fdr_result' and save the
# figure as a PNG in output_dir.

# Paths of all heatmaps written, collected so they can be zipped later.
heatmap_files = []

# Missing parameter values are skipped: NaN never matches the equality
# filter below (which would leave an empty subset) and has no string form
# to build a file name from.
unique_parameters = gdf_water['parameter'].dropna().unique()
for parameter in unique_parameters:
    # Filter data for the parameter. The .copy() gives an independent frame,
    # so the column assignments below do not raise SettingWithCopyWarning.
    parameter_data = gdf_water[gdf_water['parameter'] == parameter].copy()
    # Non-numeric results become NaN and are ignored by the mean.
    parameter_data['fdr_result'] = pd.to_numeric(parameter_data['fdr_result'], errors='coerce')
    parameter_data['year'] = parameter_data['sample_date'].dt.year

    # Mean result per county and year.
    annual_data = parameter_data.groupby(['county_name', 'year'])['fdr_result'].mean().reset_index()

    # groupby drops rows whose county name is missing; if nothing is left
    # there is no last year to iterate up to and nothing to plot.
    if annual_data.empty:
        continue
    last_year = int(annual_data['year'].max())

    # File-system friendly form of the parameter name; it does not depend on
    # the year, so it is computed once per parameter.
    parameter_safe = str(parameter).replace(" ", "_").replace("/", "_").replace("(", "").replace(")", "")

    # Plot a heatmap for each year from 2001 up to the last year with data.
    for year in range(2001, last_year + 1):
        year_data = annual_data[annual_data['year'] == year]

        # Left-merge onto the shapefile so every county is drawn, including
        # those without a value for this year.
        # NOTE: this rebinds the module-level name `merged_data`.
        merged_data = ca_counties.merge(year_data, left_on='NAME', right_on='county_name', how='left')

        # Generate the heatmap on an explicit figure/axes pair.
        fig, ax = plt.subplots(1, figsize=(10, 10))
        merged_data.plot(column='fdr_result', ax=ax, cmap='viridis', legend=True,
                         legend_kwds={'label': f"{parameter} Concentration", 'orientation': "horizontal"})
        ax.set_title(f'Annual Average {parameter} Levels for {year}')
        ax.axis('off')

        # Save the heatmap, then close this figure so open figures do not
        # accumulate across iterations.
        heatmap_filename = f'{parameter_safe}_{year}.png'
        heatmap_path = os.path.join(output_dir, heatmap_filename)
        fig.savefig(heatmap_path, dpi=300)
        plt.close(fig)

        # Remember the file for the zip step.
        heatmap_files.append(heatmap_path)

# Bundle every generated heatmap into one zip archive.
# Requires the standard-library `zipfile` module to be imported at the top
# of the file.
zip_filepath = os.path.join(zip_file_path, zip_file_name)
with zipfile.ZipFile(zip_filepath, 'w') as zipf:
    for file in heatmap_files:
        # Store each image under its bare file name so the archive is flat
        # rather than reproducing the full directory path.
        zipf.write(file, arcname=os.path.basename(file))

# Notebook cell output: the directory holding the heatmaps and the path of
# the zip archive.
output_dir, zip_filepath