# Visualization of pollution levels in Poland

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the statistics file: the first line is the header and the second
# physical line (index 1) is skipped.
# "ansi" is only a valid codec name on Windows (alias of "mbcs"), so the code
# page is named explicitly to keep the notebook runnable on any OS.
# NOTE(review): assumes the file was saved in the Polish Windows code page
# (cp1250) - confirm against the source file.
df = pd.read_csv(
    "Stats_2000-2023.csv",
    header=0,
    skiprows=[1],
    encoding="cp1250",
    sep=";",
)

# Keep only the 24-hour averaging interval and data from 2010 onwards.
is_24h_interval = df["Czas uśredniania"] == "24g"
is_from_2010 = df["Rok"] >= 2010
df = df.loc[is_24h_interval & is_from_2010]


We want to select two stations that have data for each year in the range [2010; 2023]

In [None]:
# Exploratory pivot (left disabled): counts rows per station and year, which
# was used to look for stations that have data for every year.
# NOTE(review): the other cells of this notebook address the columns as
# "Kod stacji" and "Rok"; the names used below ("Station Code", "Year",
# "Indicator") do not match those - confirm the real column names before
# re-enabling this cell.
# pv = pd.pivot_table(df, index = "Station Code", columns = "Year", values = "Indicator", aggfunc = "count")
# pv

We found the following: 
- DsOsieczow21
- DsWalbrzWyso

In [None]:
def station_yearly_means(data, station_code):
    """Return (years, means) for one station, ordered by year.

    The years are read from the station's own "Rok" column instead of being
    hard-coded, so the x values always line up with the measurements even if
    the rows are not stored in chronological order or a year is missing.
    "Średnia" is stored with a decimal comma, so it is converted to float.
    """
    rows = data.loc[data["Kod stacji"] == station_code].sort_values("Rok")
    means = (
        rows["Średnia"]
        .astype(str)
        .str.replace(",", ".", regex=False)
        .astype(float)
    )
    return rows["Rok"].to_numpy(), means.to_numpy()


fig, ax = plt.subplots()
for station_code, line_color in (("DsOsieczow21", "blue"), ("DsWalbrzWyso", "red")):
    station_years, station_means = station_yearly_means(df, station_code)
    ax.plot(station_years, station_means, label=station_code, color=line_color)

# Fixed y-range with a tick every 2 units so both lines share one scale.
ax.set_ylim(10, 30)
ax.set_yticks(range(10, 31, 2))

ax.set_xlabel("Year")
ax.set_ylabel("Average PM2.5 concentration [µg/m3]")
ax.set_title("Average PM2.5 concentration at two stations over the years 2010-2023")
ax.legend()
plt.show()


As you can see, the average PM2.5 concentration at both measuring stations decreases year by year

We will now present the distribution of average PM2.5 concentration at all stations over the years 2010-2023

In [None]:
# Box plot of the average PM2.5 concentration, one box per station.
# Only columns that actually exist in the frame are referenced
# ("Kod stacji", "Średnia", "Rok").
df2 = df.loc[:, ["Kod stacji", "Średnia", "Rok"]].copy()  # We select the fields we are interested in

# "Średnia" is stored with a decimal comma; convert it to float so the
# concentration axis is numeric rather than categorical.
df2["Średnia"] = (
    df2["Średnia"].astype(str).str.replace(",", ".", regex=False).astype(float)
)

plt.figure(figsize=(10, 50))  # tall figure: one row per station
sns.boxplot(x="Średnia", y="Kod stacji", data=df2)
plt.title("Distribution of average PM2.5 concentration at all stations over the years 2010-2023")
plt.xlabel("Average PM2.5 concentration [µg/m3]")
plt.ylabel("Station")
plt.show()

As you can see, at most measuring stations in Poland the average PM2.5 concentration varies widely from year to year

We will now look at the number of PM2.5 exceedances according to WHO standards (25 µg/m³)

In [None]:
# Count, per voivodeship, the rows whose average exceeds the limit.
PM25_LIMIT = 25  # threshold used throughout this notebook as the WHO standard

# Work on a copy so the column assignment below does not write into a view of `df`.
df1 = df.loc[:, ["Województwo", "Średnia"]].copy()

# We change the format of Average to float (the source uses a decimal comma)
df1["Średnia"] = (
    df1["Średnia"].astype(str).str.replace(",", ".", regex=False).astype(float)
)

filtered_df = df1.loc[df1["Średnia"] > PM25_LIMIT].groupby("Województwo").count()

# Voivodeships without any exceedance drop out of the groupby result.
# Re-add them with a count of 0, taking the list from the data itself:
# hard-coding names would miss other voivodeships and would overwrite a real
# count with 0 if a listed voivodeship did have exceedances.
all_voivodeships = pd.Index(
    sorted(df1["Województwo"].dropna().unique()), name="Województwo"
)
filtered_df = filtered_df.reindex(all_voivodeships, fill_value=0)

filtered_df.plot(kind="bar")

plt.title("Number of WHO standard PM2.5 exceedances by voivodeship")
plt.xlabel("Voivodeship")
plt.ylabel("Number of exceedances")
plt.show()

As you can see, the number of WHO-standard PM2.5 exceedances is small in most voivodeships.
Only the Małopolskie and Śląskie voivodeships stand out negatively

In [None]:
# Share of exceedances relative to the number of measurements, per voivodeship.
# The column subset gets its own name: reassigning `df` here would strip the
# other columns from the shared frame and break earlier cells when re-run.
measurement_counts = df[["Województwo", "Liczba pomiarów"]]
filtered_df2 = measurement_counts.groupby("Województwo").sum()

# `filtered_df` (from the previous cell) holds the exceedance count in its
# "Średnia" column; align it with the measurement totals on the voivodeship index.
joint_df = filtered_df.join(filtered_df2)

# NOTE(review): the numerator counts rows whose average exceeds the limit,
# while the denominator sums "Liczba pomiarów" - confirm both are expressed
# in the same unit before interpreting this as a percentage of measurements.
joint_df = joint_df["Średnia"] / joint_df["Liczba pomiarów"] * 100

joint_df.plot(kind="bar")

plt.title("Percentage of measurements exceeding WHO standards by voivodeship")
plt.xlabel("Voivodeship")
plt.ylabel("Percentage of measurements exceeding WHO standards")
plt.show()



As you can see, only a very small share of the measurements across the whole of Poland exceeds WHO standards