## ini nyoba prophet pake data 5 kategori

In [1]:
# Load and inspect the provided preprocessed ISPU dataset
import pandas as pd

df = pd.read_csv("./datasets/ispu_imputed_fixed.csv")

# Basic inspection outputs
print("Shape:", df.shape)
print("\nColumns:")
print(df.columns.tolist())

print("\nInfo:")
print(df.info())

print("\nMissing values per column:")
print(df.isna().sum().sort_values(ascending=False).head(20))

print("\nSample rows:")
print(df.head())

df = df.rename(columns={
    "pm_sepuluh": "pm10",
    "pm_duakomalima": "pm25",
    "sulfur_dioksida": "so2",
    "karbon_monoksida": "co",
    "ozon": "o3",
    "nitrogen_dioksida": "no2"
})

pollutant_cols = [
    "pm10",
    "pm25",
    "so2",
    "co",
    "o3",
    "no2"
]

print("\nMissing values (pollutants):")
print(df[pollutant_cols].isna().sum())

print("\nBasic statistics (pollutants):")
print(df[pollutant_cols].describe())

# Check unique categories and stations
print("\nUnique kategori:", df['kategori'].unique())
print("Unique stasiun:", df['stasiun'].unique())

Shape: (15347, 12)

Columns:
['tanggal', 'stasiun', 'pm_sepuluh', 'pm_duakomalima', 'sulfur_dioksida', 'karbon_monoksida', 'ozon', 'nitrogen_dioksida', 'periode_data', 'max', 'parameter_pencemar_kritis', 'kategori']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15347 entries, 0 to 15346
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   tanggal                    15347 non-null  object 
 1   stasiun                    15347 non-null  object 
 2   pm_sepuluh                 15347 non-null  float64
 3   pm_duakomalima             15347 non-null  float64
 4   sulfur_dioksida            15347 non-null  float64
 5   karbon_monoksida           15347 non-null  float64
 6   ozon                       15347 non-null  float64
 7   nitrogen_dioksida          15347 non-null  float64
 8   periode_data               15347 non-null  int64  
 9   max                        15347 non-null  floa

In [2]:
df["tanggal"] = pd.to_datetime(df["tanggal"])
df = df.sort_values("tanggal")

In [3]:
df["year"] = df["tanggal"].dt.year
df["month"] = df["tanggal"].dt.month
df["day"] = df["tanggal"].dt.day
df["dayofweek"] = df["tanggal"].dt.dayofweek
df["weekofyear"] = df["tanggal"].dt.isocalendar().week.astype(int)
df["quarter"] = df["tanggal"].dt.quarter
df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)

In [4]:
df

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,periode_data,max,parameter_pencemar_kritis,kategori,year,month,day,dayofweek,weekofyear,quarter,is_weekend
0,2010-01-01,DKI1,46.000000,71.58465,3.000000,44.500000,30.000000,11.500000,201001,73.0,CO,SEDANG,2010,1,1,4,53,1,0
1,2010-01-02,DKI1,39.666667,71.58465,2.666667,36.000000,26.666667,10.666667,201001,33.0,O3,BAIK,2010,1,2,5,53,1,1
2,2010-01-03,DKI1,27.000000,71.58465,2.000000,17.000000,22.666667,8.000000,201001,27.0,PM10,BAIK,2010,1,3,6,53,1,1
3,2010-01-04,DKI1,24.666667,71.58465,2.000000,17.333333,16.666667,7.666667,201001,22.0,PM10,BAIK,2010,1,4,0,1,1,0
4,2010-01-05,DKI1,25.666667,71.58465,2.333333,18.333333,16.333333,8.333333,201001,25.0,PM10,BAIK,2010,1,5,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15344,2025-08-31,DKI3,24.500000,59.50000,53.000000,8.000000,21.000000,42.500000,202508,60.0,PM25,SEDANG,2025,8,31,6,35,3,1
15345,2025-08-31,DKI4,42.500000,60.00000,27.500000,9.500000,20.000000,18.500000,202508,59.0,PM25,SEDANG,2025,8,31,6,35,3,1
15342,2025-08-31,DKI1,43.000000,73.50000,29.500000,12.000000,18.500000,27.500000,202508,70.0,PM25,SEDANG,2025,8,31,6,35,3,1
15343,2025-08-31,DKI2,40.993923,69.00000,43.000000,15.500000,21.500000,16.000000,202508,72.0,PM25,SEDANG,2025,8,31,6,35,3,1


In [5]:
time_features = [
    "year",
    "month",
    "day",
    "dayofweek",
    "weekofyear",
    "quarter",
    "is_weekend",
    "stasiun"
]


In [6]:
df[time_features + ["pm10","pm25","so2","co","o3","no2"]].head()

Unnamed: 0,year,month,day,dayofweek,weekofyear,quarter,is_weekend,stasiun,pm10,pm25,so2,co,o3,no2
0,2010,1,1,4,53,1,0,DKI1,46.0,71.58465,3.0,44.5,30.0,11.5
1,2010,1,2,5,53,1,1,DKI1,39.666667,71.58465,2.666667,36.0,26.666667,10.666667
2,2010,1,3,6,53,1,1,DKI1,27.0,71.58465,2.0,17.0,22.666667,8.0
3,2010,1,4,0,1,1,0,DKI1,24.666667,71.58465,2.0,17.333333,16.666667,7.666667
4,2010,1,5,1,1,1,0,DKI1,25.666667,71.58465,2.333333,18.333333,16.333333,8.333333


In [7]:
split_date = df["tanggal"].quantile(0.8)

train = df[df["tanggal"] <= split_date]
test  = df[df["tanggal"] > split_date]


In [13]:
import catboost

In [None]:
# !pip install prophet


In [8]:
from prophet import Prophet

def forecast_pollutant(df, station, pollutant, future_dates):
    df_st = df[df["stasiun"] == station][["tanggal", pollutant]].copy()
    df_st.columns = ["ds", "y"]

    model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=False
    )

    model.fit(df_st)

    future = pd.DataFrame({"ds": future_dates})
    forecast = model.predict(future)

    return forecast[["ds", "yhat"]]


In [9]:
split_date = "2024-01-01"

df_train_ts = df[df["tanggal"] < split_date].copy()
df_test_ts  = df[df["tanggal"] >= split_date].copy()

In [10]:
df_forecasted = df_test_ts[["tanggal","stasiun"]].copy()

for pol in pollutant_cols:
    print(f"Forecasting {pol}...")
    
    all_preds = []
    
    for station in df_test_ts["stasiun"].unique():
        dates = df_test_ts[df_test_ts["stasiun"] == station]["tanggal"]
        
        fc = forecast_pollutant(df_train_ts, station, pol, dates)
        fc["stasiun"] = station
        
        all_preds.append(fc)
    
    all_preds = pd.concat(all_preds)
    
    df_forecasted = df_forecasted.merge(
        all_preds.rename(columns={"ds":"tanggal", "yhat":pol}),
        on=["tanggal","stasiun"],
        how="left"
    )

09:37:47 - cmdstanpy - INFO - Chain [1] start processing


Forecasting pm10...


09:37:47 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
09:37:47 - cmdstanpy - INFO - Chain [1] start processing
09:37:48 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s

Forecasting pm25...


  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
09:37:49 - cmdstanpy - INFO - Chain [1] start processing
09:37:49 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, 

Forecasting so2...


09:37:50 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
09:37:50 - cmdstanpy - INFO - Chain [1] start processing
09:37:50 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s

Forecasting co...


  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
09:37:52 - cmdstanpy - INFO - Chain [1] start processing
09:37:52 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, 

Forecasting o3...


  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
09:37:53 - cmdstanpy - INFO - Chain [1] start processing
09:37:53 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, 

Forecasting no2...


  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
09:37:55 - cmdstanpy - INFO - Chain [1] start processing
09:37:55 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, 

## nyoba pake xgb

In [11]:
df_forecasted

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2
0,2024-01-01,DKI1,41.367492,72.377497,50.813794,5.219047,18.488442,18.410958
1,2024-01-01,DKI2,37.737434,70.594727,32.055401,7.890116,10.004616,20.061563
2,2024-01-01,DKI3,42.303475,67.588262,48.522416,8.755110,-3.402949,5.595026
3,2024-01-01,DKI4,47.225545,92.469565,28.585385,19.396138,-2.298899,8.247818
4,2024-01-01,DKI5,30.505511,68.416259,23.223022,13.777513,22.389081,12.109866
...,...,...,...,...,...,...,...,...
3022,2025-08-31,DKI3,64.526564,70.831201,61.244409,4.873161,21.919447,5.054854
3023,2025-08-31,DKI4,79.846741,107.980521,19.816248,26.065425,-20.989639,4.953007
3024,2025-08-31,DKI1,61.402589,89.202343,62.491694,6.177513,34.803632,27.100432
3025,2025-08-31,DKI2,67.701601,81.555754,31.638538,4.484153,4.469093,25.024826


In [12]:
df_forecasted["year"] = df_test_ts["year"].values
df_forecasted["month"] = df_test_ts["month"].values
df_forecasted["day"] = df_test_ts["day"].values
df_forecasted["dayofweek"] = df_test_ts["dayofweek"].values
df_forecasted["weekofyear"] = df_test_ts["weekofyear"].values
df_forecasted["quarter"] = df_test_ts["quarter"].values
df_forecasted["is_weekend"] = df_test_ts["is_weekend"].values


In [13]:
df_forecasted

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,year,month,day,dayofweek,weekofyear,quarter,is_weekend
0,2024-01-01,DKI1,41.367492,72.377497,50.813794,5.219047,18.488442,18.410958,2024,1,1,0,1,1,0
1,2024-01-01,DKI2,37.737434,70.594727,32.055401,7.890116,10.004616,20.061563,2024,1,1,0,1,1,0
2,2024-01-01,DKI3,42.303475,67.588262,48.522416,8.755110,-3.402949,5.595026,2024,1,1,0,1,1,0
3,2024-01-01,DKI4,47.225545,92.469565,28.585385,19.396138,-2.298899,8.247818,2024,1,1,0,1,1,0
4,2024-01-01,DKI5,30.505511,68.416259,23.223022,13.777513,22.389081,12.109866,2024,1,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3022,2025-08-31,DKI3,64.526564,70.831201,61.244409,4.873161,21.919447,5.054854,2025,8,31,6,35,3,1
3023,2025-08-31,DKI4,79.846741,107.980521,19.816248,26.065425,-20.989639,4.953007,2025,8,31,6,35,3,1
3024,2025-08-31,DKI1,61.402589,89.202343,62.491694,6.177513,34.803632,27.100432,2025,8,31,6,35,3,1
3025,2025-08-31,DKI2,67.701601,81.555754,31.638538,4.484153,4.469093,25.024826,2025,8,31,6,35,3,1


In [None]:
from xgboost import XGBClassifier
import pickle

model_xgb = XGBClassifier()
model_xgb.load_model("xgb_5kategori_berbahaya.json")

# load encoder
le_station = pickle.load(open("./models/le_station_5kategori.pkl", "rb"))
le_target = pickle.load(open("./models/le_target_5kategori.pkl", "rb"))

# ini data hasil forecasting prophet (sample)
df_sample = df_forecasted.copy()

In [27]:
df_sample["stasiun"] = le_station.transform(df_sample["stasiun"])

In [28]:
feature_cols = pickle.load(open("./models/feature_cols.pkl", "rb"))

In [29]:
for c in feature_cols:
    df_sample[c] = pd.to_numeric(df_sample[c], errors="coerce")

X_submit = df_sample[feature_cols]
y_pred_num = model_xgb.predict(X_submit)

y_pred_label = le_target.inverse_transform(y_pred_num)
df_sample["kategori"] = y_pred_label


In [30]:
df_sample

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,year,month,day,dayofweek,weekofyear,quarter,is_weekend,kategori
0,2024-01-01,0,41.367492,72.377497,50.813794,5.219047,18.488442,18.410958,2024,1,1,0,1,1,0,SEDANG
1,2024-01-01,1,37.737434,70.594727,32.055401,7.890116,10.004616,20.061563,2024,1,1,0,1,1,0,SEDANG
2,2024-01-01,2,42.303475,67.588262,48.522416,8.755110,-3.402949,5.595026,2024,1,1,0,1,1,0,SEDANG
3,2024-01-01,3,47.225545,92.469565,28.585385,19.396138,-2.298899,8.247818,2024,1,1,0,1,1,0,SEDANG
4,2024-01-01,4,30.505511,68.416259,23.223022,13.777513,22.389081,12.109866,2024,1,1,0,1,1,0,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3022,2025-08-31,2,64.526564,70.831201,61.244409,4.873161,21.919447,5.054854,2025,8,31,6,35,3,1,SEDANG
3023,2025-08-31,3,79.846741,107.980521,19.816248,26.065425,-20.989639,4.953007,2025,8,31,6,35,3,1,TIDAK SEHAT
3024,2025-08-31,0,61.402589,89.202343,62.491694,6.177513,34.803632,27.100432,2025,8,31,6,35,3,1,SEDANG
3025,2025-08-31,1,67.701601,81.555754,31.638538,4.484153,4.469093,25.024826,2025,8,31,6,35,3,1,SEDANG


In [34]:
X_fake_test = df_sample[feature_cols]
y_pred = model_xgb.predict(X_fake_test)

from sklearn.metrics import classification_report

y_true = le_target.transform(df_test_ts["kategori"])

print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.50      0.17      0.25       362
           2       0.00      0.00      0.00         1
           3       0.78      0.88      0.83      2343
           4       0.19      0.17      0.18       321

    accuracy                           0.72      3027
   macro avg       0.37      0.30      0.31      3027
weighted avg       0.69      0.72      0.69      3027



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## test ke sample submission

In [35]:
sample = pd.read_csv("sample_submission.csv")
sample.head()


Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


In [36]:
sample[["tanggal", "stasiun_kode"]] = sample["id"].str.split("_", expand=True)

sample["tanggal"] = pd.to_datetime(sample["tanggal"])

In [37]:
sample

Unnamed: 0,id,category,tanggal,stasiun_kode
0,2025-09-01_DKI1,,2025-09-01,DKI1
1,2025-09-01_DKI2,,2025-09-01,DKI2
2,2025-09-01_DKI3,,2025-09-01,DKI3
3,2025-09-01_DKI4,,2025-09-01,DKI4
4,2025-09-01_DKI5,,2025-09-01,DKI5
...,...,...,...,...
450,2025-11-30_DKI1,,2025-11-30,DKI1
451,2025-11-30_DKI2,,2025-11-30,DKI2
452,2025-11-30_DKI3,,2025-11-30,DKI3
453,2025-11-30_DKI4,,2025-11-30,DKI4


In [None]:
from prophet import Prophet

def forecast_station(df_hist, station, target_dates):
    df_s = df_hist[df_hist["stasiun"] == station].copy()
    df_s = df_s.sort_values("tanggal")

    forecasts = []

    for pol in pollutant_cols:
        df_p = df_s[["tanggal", pol]].dropna()
        df_p.columns = ["ds", "y"]

        m = Prophet()
        m.fit(df_p)

        future = pd.DataFrame({"ds": target_dates})
        fc = m.predict(future)[["ds", "yhat"]]
        fc.columns = ["tanggal", pol]

        forecasts.append(fc)

    # gabungkan semua polutan
    df_fc = forecasts[0]
    for f in forecasts[1:]:
        df_fc = df_fc.merge(f, on="tanggal")

    df_fc["stasiun"] = station
    return df_fc


In [40]:
hasil_forecast = []

for st in sample["stasiun_kode"].unique():
    target_dates = sample[sample["stasiun_kode"] == st]["tanggal"]

    df_fc = forecast_station(df, st, target_dates)
    hasil_forecast.append(df_fc)

df_forecast_sample = pd.concat(hasil_forecast).reset_index(drop=True)

12:13:38 - cmdstanpy - INFO - Chain [1] start processing
12:13:38 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
12:13:38 - cmdstanpy - INFO - Chain [1] start processing
12:13:38 - cmdstanpy - INFO - Chain [1] done processing
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.valu

In [None]:
df_forecast_sample["year"] = df_forecast_sample["tanggal"].dt.year
df_forecast_sample["month"] = df_forecast_sample["tanggal"].dt.month
df_forecast_sample["day"] = df_forecast_sample["tanggal"].dt.day
df_forecast_sample["dayofweek"] = df_forecast_sample["tanggal"].dt.dayofweek
df_forecast_sample["weekofyear"] = df_forecast_sample["tanggal"].dt.isocalendar().week
df_forecast_sample["quarter"] = df_forecast_sample["tanggal"].dt.quarter
df_forecast_sample["is_weekend"] = df_forecast_sample["dayofweek"].isin([5,6]).astype(int)


In [41]:
df_forecast_sample

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,stasiun
0,2025-09-01,54.008622,80.691271,24.609263,19.324039,19.575979,37.853999,DKI1
1,2025-09-02,53.507727,80.298224,24.817371,19.473881,18.922531,38.782525,DKI1
2,2025-09-03,54.179877,80.475307,25.288162,19.977751,19.207502,39.460974,DKI1
3,2025-09-04,54.570896,80.422230,25.184700,19.946457,20.047437,39.569527,DKI1
4,2025-09-05,54.895539,80.376377,24.953556,19.799068,20.369262,39.120025,DKI1
...,...,...,...,...,...,...,...,...
450,2025-11-26,20.183393,72.713681,30.257704,10.435453,19.609786,23.166999,DKI5
451,2025-11-27,19.809239,72.357629,29.964881,10.406793,17.838326,23.265550,DKI5
452,2025-11-28,19.307386,71.641201,29.948848,10.230931,16.722826,22.698776,DKI5
453,2025-11-29,18.570588,71.003693,29.742422,10.285027,14.486873,22.116333,DKI5


In [47]:
print(df_forecast_sample["stasiun"].head())

0    0
1    0
2    0
3    0
4    0
Name: stasiun, dtype: object


In [48]:
df_forecast_sample["stasiun"] = df_forecast_sample["stasiun"].astype(int)

X_submit = df_forecast_sample[feature_cols]
y_pred_num = model_xgb.predict(X_submit)
y_pred = le_target.inverse_transform(y_pred_num)

df_forecast_sample["category"] = y_pred

In [49]:
df_forecast_sample

Unnamed: 0,tanggal,pm10,pm25,so2,co,o3,no2,stasiun,category
0,2025-09-01,54.008622,80.691271,24.609263,19.324039,19.575979,37.853999,0,SEDANG
1,2025-09-02,53.507727,80.298224,24.817371,19.473881,18.922531,38.782525,0,SEDANG
2,2025-09-03,54.179877,80.475307,25.288162,19.977751,19.207502,39.460974,0,SEDANG
3,2025-09-04,54.570896,80.422230,25.184700,19.946457,20.047437,39.569527,0,SEDANG
4,2025-09-05,54.895539,80.376377,24.953556,19.799068,20.369262,39.120025,0,SEDANG
...,...,...,...,...,...,...,...,...,...
450,2025-11-26,20.183393,72.713681,30.257704,10.435453,19.609786,23.166999,4,SEDANG
451,2025-11-27,19.809239,72.357629,29.964881,10.406793,17.838326,23.265550,4,SEDANG
452,2025-11-28,19.307386,71.641201,29.948848,10.230931,16.722826,22.698776,4,SEDANG
453,2025-11-29,18.570588,71.003693,29.742422,10.285027,14.486873,22.116333,4,SEDANG


In [51]:
(
    df_forecast_sample["category"]
    .value_counts()
    .sort_index()
    .reset_index()
    .rename(columns={"index": "category", "category": "count"})
)


Unnamed: 0,count,count.1
0,BAIK,5
1,SEDANG,449
2,TIDAK SEHAT,1


In [52]:
sample = pd.read_csv("sample_submission.csv")
sample["category"] = y_pred


In [53]:
sample.head()
sample["category"].value_counts()

category
SEDANG         449
BAIK             5
TIDAK SEHAT      1
Name: count, dtype: int64

In [54]:
sample.to_csv("submission5_xgb_prophet.csv", index=False)

In [None]:
# submission = sample[["id"]].copy()
# submission["category"] = df_forecast_sample["category"].values
# submission = sample[["id", "category"]]
# submission.to_csv("submission5_prophet.csv", index=False)

In [None]:
pollutant_cols = ["pm10","pm25","so2","co","o3","no2"]

med = df_filled.groupby("stasiun")[pollutant_cols].median().reset_index()
sample = sample.merge(med, on="stasiun", how="left")


In [44]:
feature_cols = model.feature_names_
X_submit = sample[feature_cols]

sample["category"] = model.predict(X_submit).ravel()

# feature_cols = model.feature_names_
# X_submit = sample[feature_cols]

# pred = model.predict(X_submit).ravel()

# sample["category"] = pred

## XGBoost

In [41]:
from xgboost import XGBClassifier
import pickle

model_xgb = XGBClassifier()
model_xgb.load_model("xgb_kategori.json")

# load encoder
le_station = pickle.load(open("le_station.pkl", "rb"))
le_target = pickle.load(open("le_target.pkl", "rb"))

# misal ini data hasil forecasting prophet (sample)
df_sample = df_forecasted.copy()
