## Check data for algae blooms identification.

The idea is to train a machine learning (ML) model on the modelled data and then try to apply it to the observational data.
The reason: numerical models can't predict the exact time and location of algae blooms very well, but they do reproduce the underlying physics/biogeochemistry.
Thus, the hypothesis to check is that an ML model trained on modelled data will be able to predict blooms in observational data.

In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay

from blooms_ml.utils import (
    labeling,
)

# Apply seaborn's "whitegrid" style to the figures created below.
sns.set_style("whitegrid")

Open the data prepared from the ROHO800 model output and label it

In [2]:
# Directory (under the user's home) holding the prepared ROHO800 data.
datadir = str(Path.home()) + "/data_ROHO"
# Read the parquet file into the working frame.
parquet_path = os.path.join(datadir, "roho800_weekly_average.parquet")
df = pd.read_parquet(parquet_path)

In [3]:
# Apply `labeling` to every (station, s_rho) group, then flatten the resulting
# index back into columns and discard the leftover per-group row index.
df = (
    df.groupby(["station", "s_rho"])
    .apply(labeling, include_groups=False)
    .reset_index()
    .drop(columns="level_2")
)

In [None]:
# Display the labelled frame for a quick visual check.
df

In [5]:
# Columns that get differenced: four named variables plus columns "1".."25".
VARIABLES = ["N1_p", "N3_n", "N5_s", "P1_c"] + [str(level) for level in range(1, 26)]


def to_differences(df):
    """Return lag-1 and lag-2 differences of ``VARIABLES`` for one group.

    The frame is re-indexed from zero, each column in ``VARIABLES`` is
    differenced with ``periods=1`` and ``periods=2``, and the first two rows
    are dropped from both results. The differenced columns are suffixed with
    ``_diff1`` / ``_diff2`` and placed side by side, after which the matching
    ``ocean_time`` and ``label`` values are attached.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in ``VARIABLES`` plus ``ocean_time`` and
        ``label``.

    Returns
    -------
    pandas.DataFrame
        Rows 2..n-1 of the re-indexed input with columns
        ``<var>_diff1``, ``<var>_diff2``, ``ocean_time`` and ``label``.
    """
    frame = df.reset_index(drop=True)
    values = frame[VARIABLES]
    lagged = [
        values.diff(periods=lag).iloc[2:].add_suffix(f"_diff{lag}")
        for lag in (1, 2)
    ]
    result = pd.concat(lagged, axis=1)
    # Index-aligned assignment: only rows present in `result` are filled.
    result[["ocean_time", "label"]] = frame[["ocean_time", "label"]]
    return result

In [6]:
# Difference every (station, s_rho) group independently, then turn the group
# keys back into columns and drop the leftover per-group row index.
df_diff = (
    df.groupby(["station", "s_rho"])
    .apply(to_differences, include_groups=False)
    .reset_index()
    .drop(columns="level_2")
)

Plot a station

In [None]:
# Select station 0 at s_rho == -0.02 with ONE combined mask. Chaining two
# boolean selections applies a mask built on the full frame to an already
# filtered frame, which makes pandas warn that the key "will be reindexed".
station_mask = (df["station"] == 0) & (df["s_rho"] == -0.02)
# Index by time without `inplace=True`, so the cell builds a fresh frame on
# every run instead of mutating a filtered selection.
df_station = df.loc[station_mask].set_index("ocean_time")

In [13]:
# Year to plot. Kept under its own name because the helper function defined
# in a later cell is called `plot_year`; sharing the name would let the
# function overwrite this string and break a re-run of this cell.
PLOT_YEAR = "2008"
# Rows of the selected station falling in that year
# (assumes `ocean_time` is a datetime index — TODO confirm).
df_year = df_station.loc[PLOT_YEAR]
# Subset of those rows whose label equals 1.
df_label_year = df_year[df_year["label"] == 1]

In [14]:
def plot_year(par):
    """Plot one column over the selected year and highlight labelled rows.

    Reads the notebook-level frames ``df_year`` (all rows of the selected
    year) and ``df_label_year`` (the rows with ``label == 1``). The full
    series is drawn as a line and the labelled rows as red dots on top.

    Parameters
    ----------
    par : str
        Name of the column to plot, e.g. ``"P1_c"``.
    """
    _, ax = plt.subplots(figsize=(20, 3))
    ax.plot(df_year.index, df_year[par], label=par)
    ax.plot(df_label_year.index, df_label_year[par], "r.", label="label == 1")
    # Label the figure so it can be read on its own when skimming the notebook.
    ax.set_title(par)
    ax.set_xlabel("ocean_time")
    ax.set_ylabel(par)
    ax.legend()

In [None]:
# Plot the "P1_c" column for the selected year, labelled rows shown as red dots.
plot_year("P1_c")

In [None]:
# Plot the "N5_s" column for the selected year, labelled rows shown as red dots.
plot_year("N5_s")

Unsupervised

In [14]:
import time

from sklearn.decomposition import PCA

In [15]:
# NOTE(review): `df_norm` is not defined in any cell visible in this notebook;
# presumably a normalised version of `df` — confirm and restore its definition.
# Keep only the rows at s_rho == -0.02.
surface_mask = df_norm["s_rho"] == -0.02
df_surface = df_norm.loc[surface_mask]

In [16]:
# Everything that is NOT part of the profile columns: identifiers, time,
# the four named variables, and the "rho" / "y" columns.
non_profile_columns = ["station", "s_rho", "ocean_time", "N1_p", "N3_n", "N5_s", "P1_c", "rho", "y"]
df_density_profiles = df_surface.drop(non_profile_columns, axis=1)

In [20]:
# Keep the four named variables together with the bloom label.
nutrient_columns = ["N1_p", "N3_n", "N5_s", "P1_c", "label"]
df_nutrients = df_surface[nutrient_columns]

In [24]:
# Split into the feature matrix (everything except the label) and the target.
y = df_nutrients["label"]
X = df_nutrients.drop(columns="label")

In [None]:
# PCA
t0 = time.time()
pca = PCA(n_components=3, random_state=42)
X_reduced_pca = pca.fit_transform(X.values)
t1 = time.time()
print(f"PCA took {t1 - t0:.2} s")

In [None]:
# Share of the total variance captured by each of the fitted components.
print(pca.explained_variance_ratio_)

In [27]:
import matplotlib.patches as mpatches

In [None]:
# Scatter the first two principal components, one colour per class.
f, ax = plt.subplots(1, 1, figsize=(8, 8))
f.suptitle("Clusters using Dimensionality Reduction", fontsize=14)

# One colour per class, shared by the points and the legend patches so the
# legend actually matches what is drawn.
no_bloom_color = "#0A0AFF"
bloom_color = "#AF0000"
blue_patch = mpatches.Patch(color=no_bloom_color, label="No bloom")
red_patch = mpatches.Patch(color=bloom_color, label="bloom")

# Plain boolean arrays so they index the NumPy PCA output positionally.
no_bloom_mask = (y == 0).to_numpy()
bloom_mask = (y == 1).to_numpy()

# PCA scatter plot: draw each class ONCE, restricted to its own rows.
# Passing the whole array twice would let the second call hide the first.
ax.scatter(
    X_reduced_pca[no_bloom_mask, 0],
    X_reduced_pca[no_bloom_mask, 1],
    color=no_bloom_color,
    label="No bloom",
    linewidths=2,
)
ax.scatter(
    X_reduced_pca[bloom_mask, 0],
    X_reduced_pca[bloom_mask, 1],
    color=bloom_color,
    label="bloom",
    linewidths=2,
)
ax.set_title("PCA", fontsize=14)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")

ax.grid(True)

ax.legend(handles=[blue_patch, red_patch])

plt.show()

Classification

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# NOTE(review): `df_input` is not defined in any cell visible in this
# notebook — confirm where it comes from before relying on Run All.
df_input

In [None]:
# Columns excluded from the features: identifiers, time, the label itself,
# and the "P1_c" / "P1_netPI" columns.
excluded_columns = ["station", "ocean_time", "s_rho", "P1_c", "P1_netPI", "label"]
y = df_input["label"]
X = df_input.drop(excluded_columns, axis=1)

In [None]:
# Hold out 20 % of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Swap the pandas objects for their underlying arrays.
X_train, X_test = X_train.values, X_test.values
y_train, y_test = y_train.values, y_test.values

In [None]:
# Candidate models compared below, all with default hyper-parameters.
# Keys are display names only (spelling kept as-is so lookups keep working).
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    # Seeded like the PCA and the train/test split so repeated runs agree.
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
}

In [None]:
# Fit every candidate and report its 5-fold cross-validated accuracy.
for key, classifier in classifiers.items():
    # Keep a fitted copy in the dict; cross_val_score works on its own clones.
    classifier.fit(X_train, y_train)
    # Score on the TRAINING split: the held-out test set must stay untouched
    # until the final evaluation, otherwise this is not a training score.
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print(
        "Classifiers: ",
        classifier.__class__.__name__,
        "Has a training score of",
        # Scale first, then round, to avoid artefacts like 57.00000000000001.
        round(training_score.mean() * 100, 2),
        "% accuracy score",
    )

In [None]:
# Fresh logistic-regression model with default settings.
log_reg_sm = LogisticRegression()

In [None]:
# Train the model on the training split.
log_reg_sm.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred_log_reg = log_reg_sm.predict(X_test)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
# `plot_confusion_matrix` is not defined or imported anywhere in this
# notebook, so scikit-learn's ConfusionMatrixDisplay is used instead.
labels = ["No bloom", "bloom"]
cm_display = ConfusionMatrixDisplay(
    confusion_matrix(y_test, y_pred_log_reg),
    display_labels=labels,
)
cm_display.plot(cmap=plt.cm.Reds)
cm_display.ax_.set_title("Confusion Matrix")
# The seaborn "whitegrid" style would otherwise draw grid lines over the cells.
cm_display.ax_.grid(False)
plt.show()