In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

In [None]:
# Directory that holds every input CSV. Keeping it in one place means the
# notebook can be pointed at another data location by editing a single line.
DATA_DIR = '/home/oai/share'

# Training inputs: the water-quality table plus the Landsat and TerraClimate
# feature tables.
train_water = pd.read_csv(f'{DATA_DIR}/water_quality_training_dataset.csv')
train_landsat = pd.read_csv(f'{DATA_DIR}/landsat_features_training_jin.csv')
train_terra = pd.read_csv(f'{DATA_DIR}/terraclimate_features_training.csv')

# Validation inputs: only the two feature tables are loaded (no water-quality
# table is read for validation).
val_landsat = pd.read_csv(f'{DATA_DIR}/landsat_features_validation_jin.csv')
val_terra = pd.read_csv(f'{DATA_DIR}/terraclimate_features_validation.csv')


In [None]:
# Columns that identify a sample; every join in this cell matches on them.
merge_keys = ['Latitude', 'Longitude', 'Sample Date']

# Step 1: inner-join the Landsat and TerraClimate feature tables.
train_features = train_landsat.merge(train_terra, on=merge_keys, how='inner')

# Step 2: inner-join the water-quality table with the combined features, so
# only samples present in all three tables remain.
train_full = train_water.merge(train_features, on=merge_keys, how='inner')

# Report the merged shape so dropped rows are noticeable.
print('Training data shape after merge:', train_full.shape)


In [None]:
# Parse 'Sample Date' as day-month-year text. errors='coerce' turns any value
# that does not match the format into NaT instead of raising.
train_full['Sample Date'] = pd.to_datetime(train_full['Sample Date'], format='%d-%m-%Y', errors='coerce')

# Derive numeric date features.
# date_ordinal is computed with vectorised timedelta arithmetic instead of
# mapping Timestamp.toordinal over the column: NaT does not support
# toordinal, so a single unparseable date would otherwise break this cell
# (or produce a meaningless ordinal). With this form NaT becomes NaN and is
# median-imputed below, exactly like year/month/dayofyear. For valid dates
# the value is the same proleptic Gregorian ordinal as Timestamp.toordinal().
epoch = pd.Timestamp('1970-01-01')
train_full['date_ordinal'] = (train_full['Sample Date'] - epoch).dt.days + epoch.toordinal()
train_full['year'] = train_full['Sample Date'].dt.year
train_full['month'] = train_full['Sample Date'].dt.month
train_full['dayofyear'] = train_full['Sample Date'].dt.dayofyear

# Every column except the three targets and the raw datetime column is used
# as a model feature.
target_cols = ['Total Alkalinity','Electrical Conductance','Dissolved Reactive Phosphorus']
feature_cols = [col for col in train_full.columns if col not in target_cols + ['Sample Date']]

# Split into features (X) and targets (y)
X_train = train_full[feature_cols]
y_train = train_full[target_cols]

# Impute missing feature values with the per-column training median.
# median_vals is kept so the same values can be reused on other data.
median_vals = X_train.median()
X_train_filled = X_train.fillna(median_vals)


In [None]:
# Hyper-parameters passed to the base XGBoost regressor
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective='reg:squarederror',
)

# Wrap the base regressor so that all target columns are fitted in one call
base_model = XGBRegressor(**xgb_params)
model = MultiOutputRegressor(base_model)
model.fit(X_train_filled, y_train)

# Collect the importance vector of each fitted estimator, then average them
# feature-by-feature to get one score per feature
per_target_importances = [est.feature_importances_ for est in model.estimators_]
importances = np.mean(per_target_importances, axis=0)

# Tabulate the scores, most important feature first
feature_importance_df = (
    pd.DataFrame({'feature': X_train.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Persist the full ranking
feature_importance_df.to_csv('/home/oai/share/feature_importance.csv', index=False)

# Show the ten highest-ranked features as the cell output
feature_importance_df.head(10)


In [None]:
# Combine the validation Landsat and TerraClimate features.
# A LEFT join is used so that every val_landsat row is kept, in its original
# order: predictions are later written into the submission template by
# position, so validation rows must not be dropped. Rows without a matching
# TerraClimate record get NaN in those columns and are imputed below. When
# every key matches, the result is the same as an inner join.
val_features = pd.merge(val_landsat, val_terra, on=['Latitude','Longitude','Sample Date'], how='left')

# Parse 'Sample Date' as day-month-year text; unparseable values become NaT.
val_features['Sample Date'] = pd.to_datetime(val_features['Sample Date'], format='%d-%m-%Y', errors='coerce')

# Derive the same numeric date features as for training.
# date_ordinal uses timedelta arithmetic rather than mapping
# Timestamp.toordinal, because NaT does not support toordinal. NaT becomes
# NaN here and is imputed below; valid dates yield the same proleptic
# Gregorian ordinal as Timestamp.toordinal().
epoch = pd.Timestamp('1970-01-01')
val_features['date_ordinal'] = (val_features['Sample Date'] - epoch).dt.days + epoch.toordinal()
val_features['year'] = val_features['Sample Date'].dt.year
val_features['month'] = val_features['Sample Date'].dt.month
val_features['dayofyear'] = val_features['Sample Date'].dt.dayofyear

# Select the same feature columns, in the same order, as used for training
X_val = val_features[feature_cols]

# Impute with the TRAINING medians so no validation statistics leak in
X_val_filled = X_val.fillna(median_vals)

# Predict all targets; one row of predictions per validation row
preds = model.predict(X_val_filled)

# Load the submission template
submission = pd.read_csv('/home/oai/share/submission_template.csv')

# Predictions are assigned positionally, so the row counts must agree.
# Fail with an explicit message instead of an opaque assignment error.
if len(preds) != len(submission):
    raise ValueError(
        f'Number of predictions ({len(preds)}) does not match the number of '
        f'rows in the submission template ({len(submission)}); check the '
        'validation feature merge for missing or duplicated keys.'
    )
submission[target_cols] = preds

# Save completed submission
submission.to_csv('/home/oai/share/submission.csv', index=False)

# Display first few rows of the submission
submission.head()
