In [1]:
from pathlib import Path
import random
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from scipy.interpolate import griddata


In [None]:

DATA_PATH      = Path("../data/processed/salinity_00_train_data.parquet")
TARGET         = "salinity_00"
TEST_BBOX      = {                       # Morze Śródziemne
    "lat_min": 30.0, "lat_max": 46.0,
    "lon_min": -6.0, "lon_max": 36.0
}
SEED           = 42
N_JOBS         = -1
SUB_FRAC       = 0.20  # bo RAM sie konczy xD 100% train_df zabiera 25GB RAMu, mozna inne podejscia spróbować


In [3]:
random.seed(SEED)
np.random.seed(SEED)

df = pd.read_parquet(DATA_PATH)

In [4]:

deg2rad = np.pi / 180.0
df["sin_lat"]   = np.sin(df["lat"] * deg2rad)
df["cos_lat"]   = np.cos(df["lat"] * deg2rad)
df["sin_lon"]   = np.sin(df["lon"] * deg2rad)
df["cos_lon"]   = np.cos(df["lon"] * deg2rad)
df["norm_depth"] = df["depth"] / df["depth"].max()


In [5]:

FEATURES = [
    "temperature_00", "oxygen_00", "nitrate_00", "phosphate_00",
    "sin_lat", "cos_lat", "sin_lon", "cos_lon", "norm_depth"
]

test_mask = (
    (df["lat"].between(TEST_BBOX["lat_min"], TEST_BBOX["lat_max"])) &
    (df["lon"].between(TEST_BBOX["lon_min"], TEST_BBOX["lon_max"]))
)

train_df = df.loc[~test_mask].reset_index(drop=True)
test_df  = df.loc[test_mask].reset_index(drop=True)

train_df   = train_df.sample(frac=SUB_FRAC, random_state=SEED).reset_index(drop=True)

print(f"Train samples: {len(train_df):,d}")
print(f"Test  samples: {len(test_df):,d}  (bbox: {TEST_BBOX})")

Train samples: 378,627
Test  samples: 10,623  (bbox: {'lat_min': 30.0, 'lat_max': 46.0, 'lon_min': -6.0, 'lon_max': 36.0})


In [6]:

X_train, y_train = train_df[FEATURES].values, train_df[TARGET].values
X_test,  y_test  = test_df[FEATURES].values,  test_df[TARGET].values

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test  = scaler.transform(X_test)

In [7]:

def evaluate(name, y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)
    print(f"{name:<18} MSE={rmse:.4f}  MAE={mae:.4f}  R2={r2:.4f}")

In [8]:
rf_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    RandomForestRegressor(n_estimators=400, n_jobs=-1, random_state=SEED)
)

rf_pipe.fit(X_train, y_train)
pred_rf = rf_pipe.predict(X_test)
evaluate("RandomForest", y_test, pred_rf)

RandomForest       MSE=3.0660  MAE=2.8356  R2=0.6326


In [12]:

xgb = XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    reg_lambda=1.0,
    device='cuda',
    n_jobs=N_JOBS,
    random_state=SEED,
)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
evaluate("XGBoost", y_test, pred_xgb)

XGBoost            MSE=4.1878  MAE=3.9152  R2=0.3145


In [10]:

train_pts = train_df[["lat", "lon", "depth"]].values
test_pts  = test_df[["lat", "lon", "depth"]].values
val_train = y_train

pred_lin = griddata(train_pts, val_train, test_pts, method="linear")
# fallback na najbliższy sąsiad tam, gdzie linear == NaN
nan_mask = np.isnan(pred_lin)
if nan_mask.any():
    pred_nn = griddata(train_pts, val_train, test_pts[nan_mask], method="nearest")
    pred_lin[nan_mask] = pred_nn
evaluate("Linear interp 3D", y_test, pred_lin)

Linear interp 3D   MSE=10.2967  MAE=9.2784  R2=-3.1440


In [11]:

out = test_df[["lat", "lon", "depth"]].copy()
out["salinity_true"] = y_test
out["rf_pred"]       = pred_rf
out["xgb_pred"]      = pred_xgb
out["interp_pred"]   = pred_lin
out.to_csv("../results/baseline_predictions.csv", index=False)
print(">> Wyniki zapisane do ../results/baseline_predictions.csv")


>> Wyniki zapisane do ../results/baseline_predictions.csv
