## Package imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import holidays

## Load the data and merge it

In [None]:
# Atmospheric data: semicolon-separated file; a single Date column is assembled from Year/Month/Day
df_atm = pd.read_csv("GDAIR_train_data.csv", sep=";")
df_atm["Date"] = pd.to_datetime(df_atm[['Year', 'Month', 'Day']])

# PM10 data: its Date column is parsed to datetime so it can serve as the join key
df_pm = pd.read_csv("data_pm.csv")
df_pm["Date"] = pd.to_datetime(df_pm["Date"])

# Left join keeps every atmospheric row, attaching PM10 values where the date matches
df = pd.merge(df_atm, df_pm, how="left", on="Date")

# Keep only 2015-01-01 through 2024-06-30 (both bounds inclusive)
in_study_period = df["Date"].between("2015-01-01", "2024-06-30")
df = df[in_study_period]

# Year and Day are no longer needed once Date exists; Month is kept
df.drop(columns=["Year", "Day"], inplace=True)

## Impute data

In [None]:
print(df.isna().sum())  # Report the number of missing values per column before imputing

# Fill gaps with the KNN imputer (10 nearest neighbours)
imputer = KNNImputer(n_neighbors=10)

# The first two columns are left untouched; everything from the third column on is imputed
df_to_impute = df.iloc[:, 2:]

# fit_transform returns a bare array, so the original row index must be restored explicitly.
# `df` still carries its post-filter index; without `index=` the new frame would get a fresh
# 0..n-1 index and the concat below would align rows by label, pairing the wrong rows and
# introducing NaN rows whenever the date filter removed anything.
df_imputed_part = pd.DataFrame(
    imputer.fit_transform(df_to_impute),
    columns=df_to_impute.columns,
    index=df_to_impute.index,
)

# Put the untouched leading columns back next to the imputed ones (row labels now match)
df_imputed = pd.concat([df.iloc[:, :2], df_imputed_part], axis=1)

## Data plots

In [None]:
# One time-series chart per numeric column of the merged (pre-imputation) frame
value_columns = df.select_dtypes(include=["number"]).columns
for column_name in value_columns:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(df["Date"], df[column_name], label=column_name)
    ax.set_title(f"{column_name} over time")
    ax.set_xlabel("Date")
    ax.set_ylabel(column_name)
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()

## New variables

In [None]:
# Adding additional calendar features
# Polish public holidays for the calendar years 2015 through 2024
pl_holidays = holidays.Poland(years=range(2015, 2025))

# Series.dt.weekday numbers Monday as 0 and Sunday as 6, so Saturday/Sunday are >= 5.
# (The previous `< 5` test flagged Monday-Friday instead.) Stored as 0/1 integers.
df_imputed['IsWeekend'] = (df_imputed['Date'].dt.weekday >= 5).astype(int)

# The holiday calendar is keyed by datetime.date objects. Converting those keys to
# datetime64 first makes the membership test against the datetime64 'Date' column
# independent of how a given pandas version compares Timestamps with plain dates.
holiday_dates = pd.to_datetime(list(pl_holidays.keys()))
df_imputed['IsHoliday'] = df_imputed['Date'].dt.normalize().isin(holiday_dates).astype(int)

def get_season(date):
    """Return the season code for a date.

    Seasons are delimited by fixed calendar boundaries:

    * 4 (Winter): 21 December - 20 March
    * 1 (Spring): 21 March - 20 June
    * 2 (Summer): 21 June - 20 September
    * 3 (Autumn): 21 September - 20 December

    Args:
        date: Any object exposing integer ``month`` and ``day`` attributes
            (e.g. ``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).

    Returns:
        int: The season code (1-4) as listed above.
    """
    # Comparing (month, day) tuples orders dates within a year lexicographically,
    # so the day limit only applies in the boundary month itself. The previous
    # `month <= 6 and day <= 20` style checks applied the day limit to every
    # month and sent e.g. 25 April or 25 July to Autumn.
    month_day = (date.month, date.day)
    if month_day >= (12, 21) or month_day <= (3, 20):
        return 4  # Winter
    if month_day <= (6, 20):
        return 1  # Spring
    if month_day <= (9, 20):
        return 2  # Summer
    return 3  # Autumn

# Season code for every row, derived from its date
df_imputed['Season'] = df_imputed['Date'].map(get_season)

# Temporary 0/1 flag: 1 where PM10 is above 50
df_imputed['Risk'] = (df_imputed['PM10'] > 50).astype(int)

# Target: the flag of the following row; the last row has no successor and gets 0
next_row_risk = df_imputed['Risk'].shift(-1)
df_imputed['FutureRisk'] = next_row_risk.fillna(0).astype(int)

# The same-row flag was only needed to build the target
del df_imputed['Risk']

# Per-column non-null counts for each FutureRisk class
df_imputed.groupby("FutureRisk").count()

## Save to Excel

In [None]:
# Write the final feature table (including its index) to an Excel workbook
output_path = 'data.xlsx'
df_imputed.to_excel(output_path)