In [117]:
from pathlib import Path

import numpy
import pandas as pd
import polars as ps
import seaborn as sb

# Data Preprocessing

In [118]:
# Dtype overrides passed to ps.read_csv, listed in column order (24 entries in total).
type_map: list = (
    [ps.Utf8] * 2
    + [ps.Int32]
    + [ps.Utf8] * 2
    + [ps.Float32] * 13
    + [ps.Int32, ps.Datetime]
    + [ps.Float32] * 3
    + [ps.Int32]
)
# Raw cell contents that the CSV reader should treat as missing values.
null_values: list = ['', '-']

## Load Datasets

In [123]:
# Read every CSV under ./data with the shared dtype overrides and stack them into one frame.
# The paths are sorted because glob() does not guarantee an order, which would otherwise
# make the row order of the concatenated frame differ between machines and runs.
csv_files: list = sorted(Path('data').glob('*.csv'))
if not csv_files:
    # Fail with an explicit message instead of an opaque error from ps.concat([]).
    raise FileNotFoundError("no CSV files found under 'data/'")
datasets: list = [ps.read_csv(file, encoding='utf8', dtypes=type_map, null_values=null_values, n_threads=8)
                  for file in csv_files]
dataset: ps.DataFrame = ps.concat(datasets)

## Remove Duplicate Rows and Add New Column

In [124]:
# Expression that renders PublishTime as 'YYYY-MM' under a new column name.
publish_year_month = ps.col('PublishTime').dt.strftime('%Y-%m').alias('PublishYearAndMonth')
# Drop exact duplicate rows first, then attach the year-month column.
dataset = dataset.unique().with_columns(publish_year_month)

## Convert Non-numerical Data to Numbers

In [125]:
# Convert the polars frame to pandas and turn any remaining '-' placeholders into NaN.
# numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0; the replacement
# is assigned back rather than done in place so the cell reads as a single expression.
dataset: pd.DataFrame = dataset.to_pandas().replace('-', numpy.nan)

In [130]:
# Display the Pollutant column to inspect its values.
dataset.loc[:, 'Pollutant']

## Fill Missing Values

In [131]:
# Per-column flag: True where the column still contains at least one missing value.
dataset.isna().any()

In [132]:
# Fill missing AQI values with the median of the same County and publish year-month.
# transform('median') broadcasts each group's median back onto its rows, and fillna()
# only touches the gaps, so rows whose group keys are null keep their existing values
# instead of being overwritten with NaN by the transform result.
aqi_group_median = dataset.groupby(['County', 'PublishYearAndMonth'])['AQI'].transform('median')
dataset['AQI'] = dataset['AQI'].fillna(aqi_group_median)

In [133]:
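# NOTE(review): disabled — 'Pollutant' appears to be a non-numeric column, so a group
# median does not apply; a per-group mode would be needed instead. TODO confirm.
# dataset['Pollutant'] = dataset.groupby(['County', 'PublishYearAndMonth'])['Pollutant'].transform(
#   lambda group: group.fillna(group.median()))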

In [134]:
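# NOTE(review): disabled — 'Status' appears to be a non-numeric column, so a group
# median does not apply; a per-group mode would be needed instead. TODO confirm.
# dataset['Status'] = dataset.groupby(['County', 'PublishYearAndMonth'])['Status'].transform(
#   lambda group: group.fillna(group.median()))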

In [135]:
# Fill missing WindDirec values with the median of the same County and publish year-month.
# transform('median') broadcasts each group's median back onto its rows, and fillna()
# only touches the gaps, so rows whose group keys are null keep their existing values
# instead of being overwritten with NaN by the transform result.
# NOTE(review): if WindDirec is an angle in degrees, a plain median ignores wrap-around
# at 0/360 — confirm this is acceptable.
wind_direc_group_median = dataset.groupby(['County', 'PublishYearAndMonth'])['WindDirec'].transform('median')
dataset['WindDirec'] = dataset['WindDirec'].fillna(wind_direc_group_median)

In [136]:
# Fill missing SO2 values with the median of the same County and publish year-month.
# transform('median') broadcasts each group's median back onto its rows, and fillna()
# only touches the gaps, so rows whose group keys are null keep their existing values
# instead of being overwritten with NaN by the transform result.
so2_group_median = dataset.groupby(['County', 'PublishYearAndMonth'])['SO2'].transform('median')
dataset['SO2'] = dataset['SO2'].fillna(so2_group_median)

In [137]:
# Re-check which columns still contain missing values after the group-median fills.
dataset.isna().any()

# Data Analysis

In [None]:
# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() no longer drops
# non-numeric columns silently and raises when the frame contains strings or datetimes.
correlations = dataset.corr(numeric_only=True)
# annot=True writes each coefficient into its cell of the heatmap.
sb.heatmap(correlations, annot=True)