<a href="https://colab.research.google.com/github/haydenmclemore-fresnost/math120_final_project/blob/main/MATH120_Final_Project_Roughdraft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# California Water Quality Analysis – Final Project
This notebook analyzes selected water quality measurements from the California Department of Water Resources (DWR).  
The goal is to clean the data, merge it with station information, and answer a few basic questions using tools from class.


## Environment Setup/Loading Data


This notebook loads data directly from the California Open Data Portal so that it can be run by anyone without downloading files manually.


In [20]:
import pandas as pd
# links from California Open Data
stations_url = "https://s3.amazonaws.com/og-production-open-data-cnra-892364687672/resources/24fc759a-ff0b-479a-a72a-c91a9384540f/stations.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJJIENTAPKHZMIPXQ%2F20251209%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251209T223055Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=32328182d1657a3afa4040bf130813f31f8c84d3d09aa5e2e5caabfaab0e619d"
field_url = "https://s3.amazonaws.com/og-production-open-data-cnra-892364687672/resources/1911e554-37ab-44c0-89b0-8d7044dd891d/field_results.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJJIENTAPKHZMIPXQ%2F20251209%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251209T223911Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=664d0486945f8754e1fa8d47112b7f059b531d3c7c6d0436c88f64f98067abf1"
stations = pd.read_csv(stations_url)
# load datasets using chunks so it doesn't crash
chunks = []
for chunk in pd.read_csv(field_url, chunksize=500_000, low_memory=False):
    chunks.append(chunk)
field = pd.concat(chunks, ignore_index=True)
stations.shape, field.shape

((44630, 11), (1221093, 22))

## Import Libraries (The rest of them)


In [21]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
pd.set_option("display.max_columns", 50)

## Clean and Standardize Columns


In [22]:
# Make a numeric result column from fdr_result
field["result"] = field["fdr_result"]
# Ensure sample_date is datetime
field["sample_date"] = pd.to_datetime(field["sample_date"], errors="coerce")
field.head()

Unnamed: 0,station_id,station_name,station_number,full_station_name,station_type,latitude,longitude,status,county_name,sample_code,sample_date,sample_depth,sample_depth_units,anl_data_type,parameter,fdr_result,fdr_text_result,fdr_date_result,fdr_reporting_limit,uns_name,mth_name,fdr_footnote,result
0,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,DissolvedOxygen,9.2,,,0.2,mg/L,EPA 360.2 (Field),,9.2
1,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,ElectricalConductance,515.0,,,1.0,uS/cm,Std Method 2510-B (Field),,515.0
2,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,WaterTemperature,6.7,,,0.1,°C,EPA 170.1 (Field),,6.7
3,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,pH,7.3,,,0.1,pH Units,EPA 150.1 (Field),,7.3
4,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0268A0006,1968-02-01 08:10:00,1.0,Feet,,DissolvedOxygen,9.7,,,0.2,mg/L,EPA 360.2 (Field),,9.7


## Filter to Selected Water Quality Parameters


In [23]:
params = ["pH", "DissolvedOxygen", "ElectricalConductance"]
field_sub = field[field["parameter"].isin(params)]
field_sub.head()

Unnamed: 0,station_id,station_name,station_number,full_station_name,station_type,latitude,longitude,status,county_name,sample_code,sample_date,sample_depth,sample_depth_units,anl_data_type,parameter,fdr_result,fdr_text_result,fdr_date_result,fdr_reporting_limit,uns_name,mth_name,fdr_footnote,result
0,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,DissolvedOxygen,9.2,,,0.2,mg/L,EPA 360.2 (Field),,9.2
1,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,ElectricalConductance,515.0,,,1.0,uS/cm,Std Method 2510-B (Field),,515.0
3,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,pH,7.3,,,0.1,pH Units,EPA 150.1 (Field),,7.3
4,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0268A0006,1968-02-01 08:10:00,1.0,Feet,,DissolvedOxygen,9.7,,,0.2,mg/L,EPA 360.2 (Field),,9.7
5,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0268A0006,1968-02-01 08:10:00,1.0,Feet,,ElectricalConductance,720.0,,,1.0,uS/cm,Std Method 2510-B (Field),,720.0


## Merge Field Data With Station Metadata


In [24]:
merged = field_sub.merge(stations, on="station_id", how="left")
merged.head()

Unnamed: 0,station_id,station_name_x,station_number_x,full_station_name_x,station_type_x,latitude_x,longitude_x,status,county_name_x,sample_code,sample_date,sample_depth,sample_depth_units,anl_data_type,parameter,fdr_result,fdr_text_result,fdr_date_result,fdr_reporting_limit,uns_name,mth_name,fdr_footnote,result,station_name_y,full_station_name_y,station_number_y,station_type_y,latitude_y,longitude_y,county_name_y,sample_count,sample_date_min,sample_date_max
0,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,DissolvedOxygen,9.2,,,0.2,mg/L,EPA 360.2 (Field),,9.2,H.O. Banks Headworks,Delta P.P. Headworks at H.O. Banks PP,KA000331,Surface Water,37.8019,-121.6203,Alameda,1720.0,10/16/1960 07:25,10/21/2025 11:35
1,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,ElectricalConductance,515.0,,,1.0,uS/cm,Std Method 2510-B (Field),,515.0,H.O. Banks Headworks,Delta P.P. Headworks at H.O. Banks PP,KA000331,Surface Water,37.8019,-121.6203,Alameda,1720.0,10/16/1960 07:25,10/21/2025 11:35
2,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0168A0001,1968-01-04 07:45:00,1.0,Feet,,pH,7.3,,,0.1,pH Units,EPA 150.1 (Field),,7.3,H.O. Banks Headworks,Delta P.P. Headworks at H.O. Banks PP,KA000331,Surface Water,37.8019,-121.6203,Alameda,1720.0,10/16/1960 07:25,10/21/2025 11:35
3,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0268A0006,1968-02-01 08:10:00,1.0,Feet,,DissolvedOxygen,9.7,,,0.2,mg/L,EPA 360.2 (Field),,9.7,H.O. Banks Headworks,Delta P.P. Headworks at H.O. Banks PP,KA000331,Surface Water,37.8019,-121.6203,Alameda,1720.0,10/16/1960 07:25,10/21/2025 11:35
4,12,H.O. Banks Headworks,KA000331,Delta P.P. Headworks at H.O. Banks PP,Surface Water,37.8019,-121.6203,"Public, Review Status Unknown",Alameda,OM0268A0006,1968-02-01 08:10:00,1.0,Feet,,ElectricalConductance,720.0,,,1.0,uS/cm,Std Method 2510-B (Field),,720.0,H.O. Banks Headworks,Delta P.P. Headworks at H.O. Banks PP,KA000331,Surface Water,37.8019,-121.6203,Alameda,1720.0,10/16/1960 07:25,10/21/2025 11:35


## Q1: Average Values by Station


In [25]:
station_summary = (
    merged.groupby(["station_id", "parameter"], as_index=False)
    .agg(avg=("result", "mean"), count=("result", "count"))
)
station_summary.head()

Unnamed: 0,station_id,parameter,avg,count
0,1,DissolvedOxygen,10.948733,726
1,1,pH,7.41274,730
2,3,DissolvedOxygen,8.714773,88
3,3,pH,8.096629,89
4,4,DissolvedOxygen,7.916815,135


## Q2: Trend Over Time (Example Station)


In [26]:
merged = merged.dropna(subset=["sample_date"])
merged["year"] = merged["sample_date"].dt.year
do_data = merged[merged["parameter"] == "DissolvedOxygen"]
do_data["station_id"].value_counts().head()

Unnamed: 0_level_0,count
station_id,Unnamed: 1_level_1
45913,1887
45915,1372
45937,1206
45916,1189
45921,1172


In [27]:
top_station = do_data["station_id"].value_counts().idxmax()
top_station
yearly = (
    do_data[do_data["station_id"] == top_station]
    .groupby("year", as_index=False)
    .agg(avg=("result", "mean"))
)
px.line(yearly, x="year", y="avg", title=f"Dissolved Oxygen Over Time (Station {top_station})")

## Q3: Sampling Effort


In [28]:
samples_by_station = (
    merged.groupby("station_id", as_index=False)
    .agg(count=("result", "count"))
    .sort_values("count", ascending=False)
)
samples_by_station.head()

Unnamed: 0,station_id,count
9816,45913,3614
9818,45915,2586
10,12,2249
9819,45916,2248
9839,45937,2045


## Q4: Parameter Correlation


In [29]:
# Make a pivot table so each parameter becomes its own column
pivot = merged.pivot_table(
    index=["station_id", "sample_date"],
    columns="parameter",
    values="result",
    aggfunc="mean"
).reset_index()
# See what parameter columns we actually have after pivot
print("Pivot columns:", pivot.columns.tolist())
# Make sure we only use parameters that exist as columns
params = ["pH", "DissolvedOxygen", "ElectricalConductance"]
existing_params = [p for p in params if p in pivot.columns]
print("Using these parameters for correlation:", existing_params)
corr_matrix = pivot[existing_params].corr()
corr_matrix

Pivot columns: ['station_id', 'sample_date', 'DissolvedOxygen', 'ElectricalConductance', 'pH']
Using these parameters for correlation: ['pH', 'DissolvedOxygen', 'ElectricalConductance']


parameter,pH,DissolvedOxygen,ElectricalConductance
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pH,1.0,-0.000408,0.003701
DissolvedOxygen,-0.000408,1.0,0.231806
ElectricalConductance,0.003701,0.231806,1.0


## Conclusion


This notebook used real California water quality data to explore:

- Average values by station  
- Trends over time  
- Sampling effort  
- Basic correlations between parameters
