# EDS232/CalCOFI Ocean chemistry prediction 2025

Team members: Stephen Carroll, Tom Gibbens-Matsuyama, Ian Morris-Sibaja, Haylee Oyler 

### Description
For this quarter’s final lab, you will apply the machine learning knowledge you’ve gained to train models that predict dissolved inorganic carbon (DIC) levels in water samples collected by the California Cooperative Oceanic Fisheries Investigations program.

In this lab, you'll be working with real-world environmental data in a friendly competition with your classmates to see who can develop the most accurate predictive model.

## Your Task
- **Acquire domain knowledge**: Provided by Dr. Satterthwaite in her presentation
- **Explore the data**: Load the dataset and perform initial exploratory data analysis to inform your modeling choices
- **Preprocessing** (if necessary): Is the data ready to be used in your model?
- **Choose and train a model**: Select an appropriate machine learning algorithm for this task. Train your model on the provided training data
- **Tune relevant parameters**: Use cross-validation to optimize model performance. Experiment with different hyperparameters to reduce error
- **Submit your prediction**: Generate predictions on the provided test dataset

In [22]:
# Load basic libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics as stats
import time

# Linear model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, randint
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Import data
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
sample_sub =  pd.read_csv("data/sample_submission.csv")

In [None]:
# Explore the data
train_df.head()

Unnamed: 0,id,Lat_Dec,Lon_Dec,NO2uM,NO3uM,NH3uM,R_TEMP,R_Depth,R_Sal,R_DYNHT,R_Nuts,R_Oxy_micromol.Kg,Unnamed: 12,PO4uM,SiO3uM,TA1.x,Salinity1,Temperature_degC,DIC
0,1,34.38503,-120.66553,0.03,33.8,0.0,7.79,323,141.2,0.642,0.0,37.40948,,2.77,53.86,2287.45,34.198,7.82,2270.17
1,2,31.418333,-121.998333,0.0,34.7,0.0,7.12,323,140.8,0.767,0.0,64.81441,,2.57,52.5,2279.1,34.074,7.15,2254.1
2,3,34.38503,-120.66553,0.18,14.2,0.0,11.68,50,246.8,0.144,0.0,180.2915,,1.29,13.01,2230.8,33.537,11.68,2111.04
3,4,33.48258,-122.53307,0.013,29.67,0.01,8.33,232,158.5,0.562,0.01,89.62595,,2.27,38.98,2265.85,34.048,8.36,2223.41
4,5,31.41432,-121.99767,0.0,33.1,0.05,7.53,323,143.4,0.74,0.05,60.03062,,2.53,49.28,2278.49,34.117,7.57,2252.62


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1454 entries, 0 to 1453
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1454 non-null   int64  
 1   Lat_Dec            1454 non-null   float64
 2   Lon_Dec            1454 non-null   float64
 3   NO2uM              1454 non-null   float64
 4   NO3uM              1454 non-null   float64
 5   NH3uM              1454 non-null   float64
 6   R_TEMP             1454 non-null   float64
 7   R_Depth            1454 non-null   int64  
 8   R_Sal              1454 non-null   float64
 9   R_DYNHT            1454 non-null   float64
 10  R_Nuts             1454 non-null   float64
 11  R_Oxy_micromol.Kg  1454 non-null   float64
 12  Unnamed: 12        0 non-null      float64
 13  PO4uM              1454 non-null   float64
 14  SiO3uM             1454 non-null   float64
 15  TA1.x              1454 non-null   float64
 16  Salinity1          1454 

In [9]:
test_df.head()

Unnamed: 0,id,Lat_Dec,Lon_Dec,NO2uM,NO3uM,NH3uM,R_TEMP,R_Depth,R_Sal,R_DYNHT,R_Nuts,R_Oxy_micromol.Kg,PO4uM,SiO3uM,TA1,Salinity1,Temperature_degC
0,1455,34.321666,-120.811666,0.02,24.0,0.41,9.51,101,189.9,0.258,0.41,138.8383,1.85,25.5,2244.94,33.83,9.52
1,1456,34.275,-120.033333,0.0,25.1,0.0,9.84,102,185.2,0.264,0.0,102.7092,2.06,28.3,2253.27,33.963,9.85
2,1457,34.275,-120.033333,0.0,31.9,0.0,6.6,514,124.1,0.874,0.0,2.174548,3.4,88.1,2316.95,34.241,6.65
3,1458,33.828333,-118.625,0.0,0.0,0.2,19.21,1,408.1,0.004,0.2,258.6743,0.27,2.5,2240.49,33.465,19.21
4,1459,33.828333,-118.625,0.02,19.7,0.0,10.65,100,215.5,0.274,0.0,145.8399,1.64,19.4,2238.3,33.72,10.66


In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485 entries, 0 to 484
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 485 non-null    int64  
 1   Lat_Dec            485 non-null    float64
 2   Lon_Dec            485 non-null    float64
 3   NO2uM              485 non-null    float64
 4   NO3uM              485 non-null    float64
 5   NH3uM              485 non-null    float64
 6   R_TEMP             485 non-null    float64
 7   R_Depth            485 non-null    int64  
 8   R_Sal              485 non-null    float64
 9   R_DYNHT            485 non-null    float64
 10  R_Nuts             485 non-null    float64
 11  R_Oxy_micromol.Kg  485 non-null    float64
 12  PO4uM              485 non-null    float64
 13  SiO3uM             485 non-null    float64
 14  TA1                485 non-null    float64
 15  Salinity1          485 non-null    float64
 16  Temperature_degC   485 non

### Exploratory Analysis
Our final data frame submission needs to have `id` and `DIC` 

- `DIC`: Our target variable of dissolved oxygen content
- `id`: Unique identifier for each sample

The data contains numeric variables related to dissolved oxygen content. The variable `Unnamed: 12` is an empty row. We'll drop this variable and scale the rest of our numeric variables

In [15]:
# Remove NA column from training data
train_df = train_df.drop(columns='Unnamed: 12')

# Assign features
X = train_df.drop(columns='DIC')
y = train_df['DIC']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=808) 

# Scale the data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

## Model Selection

Can start with linear regrssion (ridge, lasso), decision tree regression, random forest regrssion, XGB, SVM, deep learning?


### Ridge and Lasso Regression

In [19]:
# Create OLS instance and fit it
# ols = LinearRegression()
# ols.fit(X_train_scaled, y_train)

# Define a fixed alpha (lambda)
alpha_fixed = 10 # arbitrariliy choose alpha

# Create Ridge regression instance and fit it
ridge = Ridge(alpha=alpha_fixed)
ridge.fit(X_train_scaled, y_train)

# Check coefficients of the OLS and ridge models
# print("OLS Coefficients:", ols.coef_)
# print("Ridge Coefficients:", ridge.coef_)

# Predictions using ridge model
y_train_pred = ridge.predict(X_train_scaled)
y_test_pred = ridge.predict(X_test_scaled)

# Evaluate MSE
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

print(f"Train MSE (alpha={alpha_fixed}): {mse_train:.4f}")
print(f"Test MSE (alpha={alpha_fixed}): {mse_test:.4f} ")

Train MSE (alpha=10): 29.7278
Test MSE (alpha=10): 48.5233 


In [20]:
# Define the three alpha values
alphas = [0.1, 1.0, 10.0]

# Fit RidgeCV
ridge_cv = RidgeCV(alphas=alphas, cv=10).fit(X_train_scaled, y_train) 
# using ten fold cross validation. 30 models total

# Print best alpha
print(f"Best alpha: {ridge_cv.alpha_}")
print(f"All alphas: {ridge_cv.alphas}")

# Once we know what the best alpha is, we can move on to the test phase and see how well it does
# Evaluate model with the best alpha
y_test_pred_cv = ridge_cv.predict(X_test_scaled)
mse_test_cv = mean_squared_error(y_test, y_test_pred_cv)
print(f"Test MSE with best alpha: {mse_test_cv:.4f}")


Best alpha: 1.0
All alphas: [0.1, 1.0, 10.0]
Test MSE with best alpha: 45.3524


In [23]:
# Fit lasso regression with cross-validation
alphas =  np.logspace(-4, 4, 100) # Alphas from 0.0001 to 10,000
lasso_cv = LassoCV(alphas=alphas, cv = 10).fit(X_train_scaled, y_train)

# Print the optimal alpha and associated coefficients
print(f"Best alpha: {lasso_cv.alpha_:.6f}")

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Best alpha: 0.432876


In [24]:
# Think: what is this code doing?
lasso_best_alpha_1se = lasso_cv.alphas_[np.where(
    lasso_cv.mse_path_.mean(axis=1) <= (lasso_cv.mse_path_.mean(axis=1).min() + lasso_cv.mse_path_.std(axis=1).mean())
)[0][0]]

# Fit a Lasso with this new alpha
lasso_1se_model = Lasso(alpha=lasso_best_alpha_1se)
lasso_1se_model.fit(X_train_scaled, y_train)

# Identify remaining features for 1-SE rule (those with non-zero coefficients)
remaining_features_1se = X_train.columns[lasso_1se_model.coef_ != 0].to_list()
print(remaining_features_1se)

['NO3uM', 'R_Sal', 'PO4uM', 'SiO3uM', 'Salinity1']


## Decision Tree

In [None]:
# Initialize models
knn = KNeighborsRegressor(n_neighbors=5)
dt = DecisionTreeRegressor(random_state=808)

# Train (fit) both models
knn.fit(X_train_scaled, y_train)
dt.fit(X_train_scaled, y_train)

# Predictions on training data
knn_y_train_pred = knn.predict(X_train_scaled)
dt_y_train_pred = dt.predict(X_train_scaled)

# Compute training accuracy
knn_train_accuracy = accuracy_score(y_train, knn_y_train_pred)
dt_train_accuracy = accuracy_score(y_train, dt_y_train_pred)

#Print training accuracy for both models
print(f"K-Nearest Neighbor training accuracy: {knn_train_accuracy:.4f}")
print(f"Decision tree training accuracy: {dt_train_accuracy}")