# EDS232/CalCOFI Ocean chemistry prediction 2025

Team members: Stephen Carroll, Tom Gibbens-Matsuyama, Ian Morris-Sibaja, Haylee Oyler 

### Description
For this quarter’s final lab, you will apply the machine learning knowledge you’ve gained to train models that predict dissolved inorganic carbon (DIC) levels in water samples collected by the California Cooperative Oceanic Fisheries Investigations program.

In this lab, you'll be working with real-world environmental data in a friendly competition with your classmates to see who can develop the most accurate predictive model.

## Your Task
- **Acquire domain knowledge**: Provided by Dr. Satterthwaite in her presentation
- **Explore the data**: Load the dataset and perform initial exploratory data analysis to inform your modeling choices
- **Preprocessing** (if necessary): Is the data ready to be used in your model?
- **Choose and train a model**: Select an appropriate machine learning algorithm for this task. Train your model on the provided training data
- **Tune relevant parameters**: Use cross-validation to optimize model performance. Experiment with different hyperparameters to reduce error
- **Submit your prediction**: Generate predictions on the provided test dataset

In [171]:
# Load basic libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics as stats
import time

# Linear model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, randint
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Import data
# The training file carries an extra 'Unnamed: 12' column, which is dropped,
# and a 'TA1.x' column, which is renamed to 'TA1'.
train_raw = pd.read_csv("data/train.csv")
train_df = train_raw.drop(columns='Unnamed: 12')
train_df = train_df.rename(columns={"TA1.x": "TA1"})

# Test features and the sample submission file
test_df = pd.read_csv("data/test.csv")
sample_sub = pd.read_csv("data/sample_submission.csv")

In [172]:
# Display the test set (the rendered output elides the middle rows)
test_df

Unnamed: 0,id,Lat_Dec,Lon_Dec,NO2uM,NO3uM,NH3uM,R_TEMP,R_Depth,R_Sal,R_DYNHT,R_Nuts,R_Oxy_micromol.Kg,PO4uM,SiO3uM,TA1,Salinity1,Temperature_degC
0,1455,34.321666,-120.811666,0.020,24.00,0.41,9.51,101,189.9,0.258,0.41,138.838300,1.85,25.50,2244.94,33.830,9.52
1,1456,34.275000,-120.033333,0.000,25.10,0.00,9.84,102,185.2,0.264,0.00,102.709200,2.06,28.30,2253.27,33.963,9.85
2,1457,34.275000,-120.033333,0.000,31.90,0.00,6.60,514,124.1,0.874,0.00,2.174548,3.40,88.10,2316.95,34.241,6.65
3,1458,33.828333,-118.625000,0.000,0.00,0.20,19.21,1,408.1,0.004,0.20,258.674300,0.27,2.50,2240.49,33.465,19.21
4,1459,33.828333,-118.625000,0.020,19.70,0.00,10.65,100,215.5,0.274,0.00,145.839900,1.64,19.40,2238.30,33.720,10.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,1935,31.418030,-121.989970,0.000,0.00,0.01,21.20,2,465.3,0.009,0.01,229.490000,0.28,1.89,2235.34,33.380,21.20
481,1936,31.418030,-121.989970,0.000,24.50,0.00,8.54,232,171.3,0.689,0.00,153.330000,1.81,29.40,2250.00,33.934,8.57
482,1937,31.418030,-121.989970,0.000,33.70,0.00,7.15,323,144.0,0.829,0.00,78.890000,2.51,50.75,2270.19,34.049,7.18
483,1938,32.846330,-117.531300,0.037,0.75,0.05,13.95,30,307.5,0.117,0.05,259.410000,0.49,3.52,2225.36,33.285,13.95


In [173]:
# Explore the data: preview the first rows of the training set,
# including the DIC target column
train_df.head()

Unnamed: 0,id,Lat_Dec,Lon_Dec,NO2uM,NO3uM,NH3uM,R_TEMP,R_Depth,R_Sal,R_DYNHT,R_Nuts,R_Oxy_micromol.Kg,PO4uM,SiO3uM,TA1,Salinity1,Temperature_degC,DIC
0,1,34.38503,-120.66553,0.03,33.8,0.0,7.79,323,141.2,0.642,0.0,37.40948,2.77,53.86,2287.45,34.198,7.82,2270.17
1,2,31.418333,-121.998333,0.0,34.7,0.0,7.12,323,140.8,0.767,0.0,64.81441,2.57,52.5,2279.1,34.074,7.15,2254.1
2,3,34.38503,-120.66553,0.18,14.2,0.0,11.68,50,246.8,0.144,0.0,180.2915,1.29,13.01,2230.8,33.537,11.68,2111.04
3,4,33.48258,-122.53307,0.013,29.67,0.01,8.33,232,158.5,0.562,0.01,89.62595,2.27,38.98,2265.85,34.048,8.36,2223.41
4,5,31.41432,-121.99767,0.0,33.1,0.05,7.53,323,143.4,0.74,0.05,60.03062,2.53,49.28,2278.49,34.117,7.57,2252.62


In [174]:
# Preview the first rows of the test set (same features, no DIC column)
test_df.head()

Unnamed: 0,id,Lat_Dec,Lon_Dec,NO2uM,NO3uM,NH3uM,R_TEMP,R_Depth,R_Sal,R_DYNHT,R_Nuts,R_Oxy_micromol.Kg,PO4uM,SiO3uM,TA1,Salinity1,Temperature_degC
0,1455,34.321666,-120.811666,0.02,24.0,0.41,9.51,101,189.9,0.258,0.41,138.8383,1.85,25.5,2244.94,33.83,9.52
1,1456,34.275,-120.033333,0.0,25.1,0.0,9.84,102,185.2,0.264,0.0,102.7092,2.06,28.3,2253.27,33.963,9.85
2,1457,34.275,-120.033333,0.0,31.9,0.0,6.6,514,124.1,0.874,0.0,2.174548,3.4,88.1,2316.95,34.241,6.65
3,1458,33.828333,-118.625,0.0,0.0,0.2,19.21,1,408.1,0.004,0.2,258.6743,0.27,2.5,2240.49,33.465,19.21
4,1459,33.828333,-118.625,0.02,19.7,0.0,10.65,100,215.5,0.274,0.0,145.8399,1.64,19.4,2238.3,33.72,10.66


In [175]:
# Check column dtypes and non-null counts in the training set
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1454 entries, 0 to 1453
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1454 non-null   int64  
 1   Lat_Dec            1454 non-null   float64
 2   Lon_Dec            1454 non-null   float64
 3   NO2uM              1454 non-null   float64
 4   NO3uM              1454 non-null   float64
 5   NH3uM              1454 non-null   float64
 6   R_TEMP             1454 non-null   float64
 7   R_Depth            1454 non-null   int64  
 8   R_Sal              1454 non-null   float64
 9   R_DYNHT            1454 non-null   float64
 10  R_Nuts             1454 non-null   float64
 11  R_Oxy_micromol.Kg  1454 non-null   float64
 12  PO4uM              1454 non-null   float64
 13  SiO3uM             1454 non-null   float64
 14  TA1                1454 non-null   float64
 15  Salinity1          1454 non-null   float64
 16  Temperature_degC   1454 

In [176]:
# Check column dtypes and non-null counts in the test set
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485 entries, 0 to 484
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 485 non-null    int64  
 1   Lat_Dec            485 non-null    float64
 2   Lon_Dec            485 non-null    float64
 3   NO2uM              485 non-null    float64
 4   NO3uM              485 non-null    float64
 5   NH3uM              485 non-null    float64
 6   R_TEMP             485 non-null    float64
 7   R_Depth            485 non-null    int64  
 8   R_Sal              485 non-null    float64
 9   R_DYNHT            485 non-null    float64
 10  R_Nuts             485 non-null    float64
 11  R_Oxy_micromol.Kg  485 non-null    float64
 12  PO4uM              485 non-null    float64
 13  SiO3uM             485 non-null    float64
 14  TA1                485 non-null    float64
 15  Salinity1          485 non-null    float64
 16  Temperature_degC   485 non

In [177]:
# Pairwise plots of all training columns (currently disabled;
# uncomment both lines to render the figure)
#sns.pairplot(train_df)
#plt.show()

### Exploratory Analysis
Our final data frame submission needs to have 'id' and 'DIC' 

- 'DIC': Our target variable, dissolved inorganic carbon
- 'id': Unique identifier for each sample

The data contains numeric variables related to dissolved inorganic carbon content. The variable 'Unnamed: 12' is an empty column. We'll drop this variable and scale the rest of our numeric variables.

In [178]:
# Assign features: every column except the row identifier and the target
X = train_df.drop(columns=['id', 'DIC'])
y = train_df['DIC']

# Split the data: hold out 20% for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=808)

# Scale the data: the scaler is fitted on the training split only and then
# reused to transform the validation split.
scaler = StandardScaler()
# Keep each split's original row index on the scaled frames so they stay
# aligned with y_train / y_val, which retain the shuffled index from the split.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X.columns, index=X_val.index)

## Model Selection

Candidate models: linear regression (ridge, lasso), decision tree regression, random forest regression, XGBoost, SVM, and possibly deep learning.


SVM

In [224]:
# Hyperparameter grid for the support vector regressor. Each option holds a
# single candidate, so the search below evaluates exactly one configuration.
param_grid = dict(
    C=[125],
    gamma=[0.001],
    kernel=['linear'],
    tol=[0.002],
    epsilon=[5],
)

# 5-fold cross-validated grid search over an SVR estimator
grid_search_svm = GridSearchCV(estimator=SVR(), param_grid=param_grid, cv=5)

# Time the search on the scaled training split
start_time = time.time()
grid_search_svm.fit(X_train_scaled, y_train)
end_time = time.time()
svm_time = end_time - start_time
print(svm_time)

# Report the selected hyperparameter combination
print(grid_search_svm.best_params_)

0.7553348541259766
{'C': 125, 'epsilon': 5, 'gamma': 0.001, 'kernel': 'linear', 'tol': 0.002}


In [225]:
# GridSearchCV refits the best configuration on the full training split by
# default (refit=True), so best_estimator_ is already fitted and does not
# need a second call to fit().
best_svm = grid_search_svm.best_estimator_

# Predict on the held-out validation split
best_svm_preds = best_svm.predict(X_val_scaled)

# Calculate R² score
r2 = r2_score(y_val, best_svm_preds)
print(f"R² score: {r2}")

R² score: 0.9953367819153545


In [226]:
# For a regressor, .score() returns the coefficient of determination (R²)
# on the given data, not a classification accuracy.
svm_score = best_svm.score(X_val_scaled, y_val)
print(f"Validation R² score: {svm_score}")

Accuracy score: 0.9953367819153545


In [227]:
# Prepare the test data: select exactly the training feature columns, in the
# same order the scaler was fitted on. This also leaves out 'id', and raises a
# KeyError if a training feature is missing from the test file.
X_test = test_df[X.columns]

# Scale the test data using the scaler fitted on the training split
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Make predictions on the test data using the best SVM model
test_predictions = best_svm.predict(X_test_scaled)

# The test set has no DIC labels, so no test score can be computed here;
# report the validation R² alongside the number of predictions instead.
print(f"Generated {len(test_predictions)} test predictions (validation R² score: {svm_score})")

Test Accuracy score: 0.9953367819153545


In [228]:

# Build the submission frame: the sample ids paired with the predicted DIC
submission = test_df[['id']].assign(DIC=test_predictions)

# Write the submission to CSV without the index column
submission.to_csv('svm_submission.csv', index=False)

# Show the first rows of the submission
submission.head()

Unnamed: 0,id,DIC
0,1455,2173.063217
1,1456,2194.869919
2,1457,2327.048263
3,1458,1993.986994
4,1459,2147.96155
