In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from pathlib import Path

from se_coast_strandings.transformations import make_dt_col

In [None]:
# Raw comma-separated text file that load_copepod_data() parses.
SOURCE_FILE_PATH = "../data/raw/copepod__4000000-compilation.txt"
# Parquet cache that load_copepod_data() writes after parsing and reads on later runs.
OUTPUT_FILE_PATH = "../data/processed/copepod_dataset.parquet"

In [None]:
def load_copepod_data(remake: bool = False) -> pd.DataFrame:
    """Load the copepod compilation as a DataFrame, caching it as parquet.

    If ``OUTPUT_FILE_PATH`` already exists (and ``remake`` is false) the cached
    parquet file is returned as-is. Otherwise the raw text file at
    ``SOURCE_FILE_PATH`` is parsed, written to ``OUTPUT_FILE_PATH`` and returned.

    Args:
        remake: When true, ignore any existing cache and re-parse the raw file.

    Returns:
        The parsed table, keeping only the first 41 columns of the raw file.

    Raises:
        FileNotFoundError: If the raw file is needed but does not exist.
        IndexError: If the header line has fewer fields than the hard-coded
            column positions used below.
    """
    cache_path = Path(OUTPUT_FILE_PATH)
    if cache_path.suffix == ".parquet" and cache_path.exists() and not remake:
        return pd.read_parquet(cache_path)

    # The column names are taken from the 4th line of the raw file.
    with open(SOURCE_FILE_PATH, "r") as file:
        for _ in range(3):
            file.readline()

        header = file.readline().strip().split(",")

    header[0] = header[0].lstrip("#")
    # Drop the final field of the header line.
    # NOTE(review): presumably an empty field left by a trailing comma -- confirm.
    header = header[:-1]

    # Overwrite the names at fixed positions with explicit per-volume /
    # per-area labels.
    # NOTE(review): presumably the raw names at these positions are ambiguous
    # or repeated between the two groups -- confirm against the file.
    header[24] = "VALUE-per_volu-UNITS"
    for num, i in enumerate(range(25, 29), start=1):
        header[i] = f"VALUE-per-volu-F{num}"

    header[30] = "VALUE-per_area-UNITS"
    for num, i in enumerate(range(31, 35), start=1):
        header[i] = f"VALUE-per-area-F{num}"

    # The first five lines are skipped; names are applied by position below.
    df = pd.read_csv(
        SOURCE_FILE_PATH, sep=",", low_memory=False, skiprows=5, header=None
    )

    df = df.rename(columns=dict(zip(df.columns, header)))
    df = df.drop(columns=df.columns[41:])

    # Create the output directory first so a fresh checkout does not fail
    # after the (slow) parse has already completed.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(cache_path, index=False)

    return df

In [None]:
# Use the parquet cache when it exists; pass remake=True to re-parse the raw file.
df = load_copepod_data(remake=False)
df.head(n=5)

In [None]:
# Combine the DAY / MON / YEAR columns into a single "date" column.
day_col, month_col, year_col = df["DAY"], df["MON"], df["YEAR"]
df["date"] = make_dt_col(day_col, month_col, year_col)

# Display the dates in sorted order.
df["date"].sort_values()

In [None]:
# Convert "Water Strained" from text such as "<number> m3" / "<number> ml" /
# "null" into floats (with "null" becoming NaN).
# NOTE(review): the " m3" and " ml" suffixes are both removed without any unit
# conversion, so if both occur the resulting column mixes units -- confirm
# whether " ml" rows need rescaling before this column is used numerically.
water_strained = df["Water Strained"]

# Only parse while the column is still text: a second run of this cell would
# otherwise fail, because the .str accessor is unavailable on a float column.
if not pd.api.types.is_numeric_dtype(water_strained):
    df["Water Strained"] = (
        water_strained.str.strip()
        .replace("null", np.nan)
        .str.removesuffix(" m3")
        .str.removesuffix(" ml")
        .astype(float)
    )

In [None]:
# Count the rows whose name does NOT contain the literal text "[ ]".
# regex=False is required: as a regular expression "[ ]" is a character class
# that matches any single space. na=False treats missing names as
# "does not contain" so the negation below cannot fail on NaN.
has_empty_modifier = df["SCIENTIFIC NAME -[ modifiers ]-"].str.contains(
    "[ ]", regex=False, na=False
)
int((~has_empty_modifier).sum())

In [None]:
# Remove the trailing "-[ ]-" marker from the scientific names.
name_col = "SCIENTIFIC NAME -[ modifiers ]-"
df[name_col] = df[name_col].str.removesuffix("-[ ]-")

In [None]:
states = gpd.read_file("../data/reference/cb_2018_us_state_5m.shp").set_crs("EPSG:5070")
states.shape, states.crs

In [None]:
states = states.to_crs("EPSG:5070")
se_coast_states = states[
    states["NAME"].isin(
        [
            "Virginia",
            "North Carolina",
            "South Carolina",
        ]
    )
]

In [None]:
se_coast_states_b = se_coast_states.copy()
se_coast_states_b["geometry"] = se_coast_states_b.geometry.buffer(6.25)
se_coast_states_b["geometry"]

In [None]:
gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df["LONGITDE"], df["LATITUDE"]), crs="EPSG:5070"
)

In [None]:
# Merge the buffered state polygons into one geometry, then flag the sample
# points that fall inside it.
se_coast_region = se_coast_states_b.geometry.union_all()
mask = gdf.geometry.within(se_coast_region)

# Show how many samples are inside (True) and outside (False) the region.
mask.value_counts()

In [None]:
# Keep only the samples flagged by `mask`; note this rebinds `gdf` to the subset.
gdf = gdf[mask]

In [None]:
out_path = Path("../figures/jh_plankton_dataset_map.png")

# Layers, bottom to top: all state outlines, the highlighted states, the
# translucent buffered region, and the sample points.
ax = states.plot(edgecolor="black", facecolor="none", linewidth=0.5, figsize=(8, 8))
se_coast_states.plot(ax=ax, color="blue", alpha=1, edgecolor="black", linewidth=0.5)
se_coast_states_b.plot(ax=ax, color="blue", alpha=0.1)
gdf.plot(ax=ax, markersize=0.25, color="green", alpha=0.5)

# The geopandas plots above do not create legend entries, so build proxy
# artists that mirror the styling of each layer.
handles = [
    Patch(facecolor="blue", edgecolor="black", alpha=1, label="SE Coast states"),
    Patch(
        facecolor="blue", edgecolor="none", alpha=0.1, label="SE Coast buffer/region"
    ),
    Line2D(
        [0],
        [0],
        marker="o",
        color="none",
        markerfacecolor="green",
        markeredgecolor="none",
        markersize=6,
        alpha=0.5,
        label="Plankton samples",
    ),
]
ax.legend(handles=handles, loc="center right", frameon=True)

ax.set_xlim(-85, -65)
ax.set_ylim(24, 44)
ax.set_title("Plankton Sampling Locations along the SE Coast", pad=20)
# The axis limits above and the LONGITDE / LATITUDE sample coordinates are
# degrees, so label the axes as longitude/latitude rather than projected metres.
ax.set_xlabel("Longitude (degrees)")
ax.set_ylabel("Latitude (degrees)")

# Create the output directory if needed, and save the figure that owns `ax`
# rather than relying on pyplot's notion of the current figure.
out_path.parent.mkdir(parents=True, exist_ok=True)
ax.figure.savefig(out_path, dpi=300)

In [None]:
# Write the filtered samples to parquet without the geometry column.
se_coast_table = gdf.drop(columns=["geometry"])
se_coast_table.to_parquet(
    "../data/processed/copepod_dataset_se_coast.parquet", index=False
)

In [None]:
# Read the saved subset back from disk; note this rebinds `df` to the filtered table.
se_coast_parquet = "../data/processed/copepod_dataset_se_coast.parquet"
df = pd.read_parquet(se_coast_parquet)
df