In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [5]:
# Read the semicolon-delimited measurements file into a DataFrame and display it
df = pd.read_csv(
    'afa2e701598d20110228.csv',
    delimiter=';',  # alias of sep
)
df

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,17.02.2000,0.330,2.77,12.0,12.30,9.50,0.057,154.00,0.454,289.50
1,1,11.05.2000,0.044,3.00,51.6,14.61,17.75,0.034,352.00,0.090,1792.00
2,1,11.09.2000,0.032,2.10,24.5,9.87,13.80,0.173,416.00,0.200,2509.00
3,1,13.12.2000,0.170,2.23,35.6,12.40,17.13,0.099,275.20,0.377,1264.00
4,1,02.03.2001,0.000,3.03,48.8,14.69,10.00,0.065,281.60,0.134,1462.00
...,...,...,...,...,...,...,...,...,...,...,...
2856,22,06.10.2020,0.046,2.69,3.6,8.28,3.80,0.038,160.00,0.726,77.85
2857,22,27.10.2020,0.000,1.52,0.5,11.26,0.56,0.031,147.20,0.634,71.95
2858,22,03.12.2020,0.034,0.29,0.8,11.09,2.58,0.042,209.92,0.484,61.17
2859,22,12.01.2021,0.000,2.10,0.0,14.31,3.94,0.034,121.60,0.424,63.49


In [9]:
# Target columns, in the order they are reported by the evaluation cells.
# Defined here so this cell also works on a freshly restarted kernel.
pollutants = ['NH4', 'BSK5', 'Suspended', 'O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']

# Derive the sampling year from the day-first date strings (e.g. 17.02.2000);
# 'year' is one of the two model features selected further down.
# Then drop the rows that have a missing value in any pollutant column.
# assign() + dropna() build a new frame, so re-running the cell is safe.
df = (
    df
    .assign(year=pd.to_datetime(df['date'], dayfirst=True).dt.year)
    .dropna(subset=pollutants)
)
df.head()


Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,year
0,1,17.02.2000,0.33,2.77,12.0,12.3,9.5,0.057,154.0,0.454,289.5,2000
1,1,11.05.2000,0.044,3.0,51.6,14.61,17.75,0.034,352.0,0.09,1792.0,2000
2,1,11.09.2000,0.032,2.1,24.5,9.87,13.8,0.173,416.0,0.2,2509.0,2000
3,1,13.12.2000,0.17,2.23,35.6,12.4,17.13,0.099,275.2,0.377,1264.0,2000
4,1,02.03.2001,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0,2001


In [24]:
# Per-column count of missing values (isna is the same check as isnull)
df.isna().sum()

id           0
date         0
NH4          0
BSK5         0
Suspended    0
O2           0
NO3          0
NO2          0
SO4          0
PO4          0
CL           0
year         0
dtype: int64

In [25]:
# Choose model inputs and outputs
#   inputs  (X): station identifier 'id' and sampling 'year'
#   outputs (y): one column per pollutant concentration
features = ['id', 'year']
X, y = df[features], df[pollutants]

In [None]:
# One-hot encode the categorical station identifier 'id' into station_<id> columns.
# The first level is dropped (drop_first=True) so the dummies are not collinear.
X_encoded = pd.get_dummies(
    data=X,
    columns=['id'],
    prefix='station',
    drop_first=True,
)

In [None]:
# Optional: sort the feature columns by label so their order is deterministic
X_encoded = X_encoded.sort_index(axis='columns')


In [26]:
# Hold out a test set; the cells below need X_train / y_train for fitting and
# X_test / y_test for evaluation, and no other cell creates them.
# NOTE(review): the split settings were not recorded in this notebook; an 80/20
# split seeded with 42 is assumed here — confirm against the run that produced
# the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Model configuration
base_model = RandomForestRegressor(
    n_estimators=150,       # number of trees in each forest
    max_depth=None,         # grow trees fully; set e.g. 10 for faster training
    min_samples_split=4,    # a node needs at least 4 samples to be split further
    random_state=42,        # fixed seed for reproducible forests
    n_jobs=-1               # train in parallel on all cores
)

# MultiOutputRegressor fits one independent forest per pollutant column
model = MultiOutputRegressor(base_model)
model.fit(X_train, y_train)


In [17]:
# Evaluate
# Predict inside this cell so it does not depend on y_pred being assigned by a
# cell that appears later in the notebook.
y_pred = model.predict(X_test)

print("Model Performance on the Test Data:\n")
for i, pollutant in enumerate(pollutants):
    # Column i of y_test / y_pred corresponds to the i-th pollutant
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    r2 = r2_score(y_test.iloc[:, i], y_pred[:, i])
    print(f'{pollutant}:')
    print(f'   MSE: {mse:.2f}')
    print(f'   R²: {r2:.2f}\n')


Model Performance on the Test Data:

NH4:
   MSE: 0.88
   R²: 0.78

BSK5:
   MSE: 5.31
   R²: 0.19

Suspended:
   MSE: 98.18
   R²: 0.20

O2:
   MSE: 13.96
   R²: 0.05

NO3:
   MSE: 20.40
   R²: 0.48

NO2:
   MSE: 10.34
   R²: -58.20

SO4:
   MSE: 2275.81
   R²: 0.45

PO4:
   MSE: 0.24
   R²: 0.44

CL:
   MSE: 32661.44
   R²: 0.75



In [11]:
# Score the fitted model on the held-out split, one target column at a time
y_pred = model.predict(X_test)
print("Model Performance on the Test Data:")
for idx, pollutant in enumerate(pollutants):
    actual, predicted = y_test.iloc[:, idx], y_pred[:, idx]
    print(f'{pollutant}:')
    print('   MSE:', mean_squared_error(actual, predicted))
    print('   R2:', r2_score(actual, predicted))
    print()

Model Performance on the Test Data:
NH4:
   MSE: 0.8827195364614927
   R2: 0.7801981883484587

BSK5:
   MSE: 5.31094542545559
   R2: 0.19096990064204467

Suspended:
   MSE: 98.17784721522588
   R2: 0.20495839046561737

O2:
   MSE: 13.955930601011778
   R2: 0.05381534726017545

NO3:
   MSE: 20.40490374797047
   R2: 0.484569230962687

NO2:
   MSE: 10.343405404494533
   R2: -58.203860061465534

SO4:
   MSE: 2275.807351900022
   R2: 0.44815941114800695

PO4:
   MSE: 0.24389334027446746
   R2: 0.43586964570072984

CL:
   MSE: 32661.43741785968
   R2: 0.7526035914013255



In [12]:
station_id = '22'
year_input = 2024

input_data = pd.DataFrame({'year': [year_input], 'id': [station_id]})

# Encode the station with the same dummy prefix as the training matrix.
# With a different prefix (e.g. the default 'id_' vs. 'station_') the station's
# column would not match any training column, every dummy would be zero-filled
# and the model would silently predict for the baseline station instead.
station_columns = [col for col in X_encoded.columns if col != 'year']
dummy_prefix = station_columns[0].rsplit('_', 1)[0] if station_columns else 'id'
input_encoded = pd.get_dummies(input_data, columns=['id'], prefix=dummy_prefix)

# Align with training feature columns: dummies absent from the input become 0,
# columns unseen in training are dropped, and the training order is restored.
input_encoded = input_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Predict pollutants (single input row -> take the first row of the output)
predicted_pollutants = model.predict(input_encoded)[0]

print(f"\nPredicted pollutant levels for station '{station_id}' in {year_input}:")
for p, val in zip(pollutants, predicted_pollutants):
    print(f"  {p}: {val:.2f}")


Predicted pollutant levels for station '22' in 2024:
  NH4: 0.03
  BSK5: 2.57
  Suspended: 5.69
  O2: 13.25
  NO3: 6.93
  NO2: 0.07
  SO4: 144.84
  PO4: 0.46
  CL: 67.36


In [20]:
# Build the model inputs (encoded) and targets from a measurements frame
def encode_features(df):
    """Split ``df`` into a one-hot encoded feature matrix and the target columns.

    Features are the 'id' and 'year' columns, with 'id' expanded into dummy
    columns (first level dropped). Targets are the columns listed in the
    module-level ``pollutants``.

    Returns
    -------
    tuple
        ``(encoded_features, targets)``.
    """
    station_year = df.loc[:, ['id', 'year']]
    targets = df[pollutants]
    design_matrix = pd.get_dummies(station_year, columns=['id'], drop_first=True)
    return design_matrix, targets

In [15]:
# Run the model on the prepared single-row input and report every target
predicted_pollutants = model.predict(input_encoded)[0]
print(f"\nPredicted pollutant levels for station '{station_id}' in {year_input}:")
for line in (f"  {name}: {level:.2f}" for name, level in zip(pollutants, predicted_pollutants)):
    print(line)


Predicted pollutant levels for station '22' in 2024:
  NH4: 0.03
  BSK5: 2.57
  Suspended: 5.69
  O2: 13.25
  NO3: 6.93
  NO2: 0.07
  SO4: 144.84
  PO4: 0.46
  CL: 67.36


In [None]:
# Read the raw CSV and return it cleaned, with a derived 'year' column
def load_and_preprocess_data(filepath):
    """Load the semicolon-delimited dataset and prepare it for modelling.

    Adds a 'year' column parsed from the day-first 'date' column and removes
    every row that has a missing value in any column listed in the
    module-level ``pollutants``.

    Parameters
    ----------
    filepath
        Path (or buffer) passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    raw = pd.read_csv(filepath, delimiter=';')
    parsed_dates = pd.to_datetime(raw['date'], dayfirst=True)
    return raw.assign(year=parsed_dates.dt.year).dropna(subset=pollutants)

In [None]:
# Turn a measurements frame into (encoded features, targets)
def encode_features(df):
    """Return the one-hot encoded feature matrix and the target columns of ``df``.

    The feature matrix holds 'year' plus dummy columns for 'id' (first level
    dropped); the targets are the columns named in the module-level
    ``pollutants``.
    """
    return (
        pd.get_dummies(df[['id', 'year']], columns=['id'], drop_first=True),
        df[pollutants],
    )

In [None]:
# Function to predict pollutant levels for a new station-year
def predict_pollutants(model, X_columns, station_id, year):
    """Predict the target values for one station in one year.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_columns
        Feature column names, in order, that the model was trained on.
    station_id
        Station identifier; it is one-hot encoded as ``id_<station_id>``.
    year
        Year to predict for.

    Returns
    -------
    The first (and only) row of ``model.predict`` for the encoded input,
    i.e. one value per target.
    """
    input_df = pd.DataFrame({'year': [year], 'id': [station_id]})
    input_encoded = pd.get_dummies(input_df, columns=['id'])

    # Match the training layout exactly: dummies missing from this single-row
    # input are filled with 0, unknown columns are dropped, order is restored.
    input_encoded = input_encoded.reindex(columns=list(X_columns), fill_value=0)

    return model.predict(input_encoded)[0]


In [28]:
import joblib

# Persist the fitted estimator and the ordered list of training column names
for artifact, target_path in (
    (model, 'pollution_model.pkl'),
    (X_encoded.columns.tolist(), "model_columns.pkl"),
):
    joblib.dump(artifact, target_path)
print('Model and cols structure are saved!')

Model and cols structure are saved!


In [29]:
import joblib

# Output locations for the two artifacts
model_filename, columns_filename = "pollution_rf_model.pkl", "pollution_feature_columns.pkl"

# Write the fitted model first, then the ordered feature column names
joblib.dump(value=model, filename=model_filename)
joblib.dump(value=X_encoded.columns.tolist(), filename=columns_filename)

# Report where each artifact was written
print(
    f" Model saved as: {model_filename}",
    f" Feature columns saved as: {columns_filename}",
    sep="\n",
)


 Model saved as: pollution_rf_model.pkl
 Feature columns saved as: pollution_feature_columns.pkl
