In [90]:
from pathlib import Path

import pandas as pd
import polars as ps
import seaborn as sb

# Data Preprocessing

## Load Datasets

In [91]:
type_map: list = [ps.Utf8, ps.Utf8, ps.Int32, ps.Utf8, ps.Utf8,
                  ps.Float32, ps.Float32, ps.Float32, ps.Float32,
                  ps.Float32, ps.Float32, ps.Float32, ps.Float32, ps.Float32,
                  ps.Float32, ps.Float32, ps.Float32, ps.Float32,
                  ps.Int32, ps.Datetime, ps.Float32, ps.Float32, ps.Float32, ps.Int32]
null_values: list = ['', '-']

In [92]:
datasets: list = [ps.read_csv(file, encoding='utf8', dtypes=type_map, null_values=null_values, n_threads=8)
                  for file in Path('data').glob('*.csv')]
dataset: ps.DataFrame = ps.concat(datasets)

## Remove Duplicate Rows and Add New Column

In [93]:
dataset = dataset.unique()
dataset = dataset.with_columns(ps.col('PublishTime').dt.strftime('%Y-%m').alias('PublishYearAndMonth'))

## Convert Non-numerical Data to Numbers

In [94]:
dataset['Pollutant'].value_counts(sort=True)

Pollutant,counts
str,u32
,758757
"""細懸浮微粒""",394179
"""臭氧八小時""",46944
"""二氧化氮""",9469
"""懸浮微粒""",8893
"""二氧化硫""",128
"""臭氧""",9
"""一氧化碳""",2


In [95]:
pollutant_type: list = dataset['Pollutant'].value_counts(sort=True)['Pollutant'].to_list()
pollutant_type.remove(None)
pollutant_map: dict = dict(zip(pollutant_type, [i for i in range(len(pollutant_type))]))

In [96]:
dataset = dataset.with_columns(ps.col('Pollutant').map_dict(pollutant_map, default='unknown').alias('Pollutant').cast(ps.Int32, strict=False))

In [97]:
dataset['Status'].value_counts(sort=True)

Status,counts
str,u32
"""良好""",755203
"""普通""",385681
"""對敏感族群不健康""",64309
"""對所有族群不健康""",9610
,3554
"""非常不健康""",22
"""危害""",2


In [98]:
status_type: list = dataset['Status'].value_counts(sort=True)['Status'].to_list()
status_type.remove(None)
status_map: dict = dict(zip(status_type, [i for i in range(len(status_type))]))

In [99]:
dataset = dataset.with_columns(ps.col('Status').map_dict(status_map, default='unknown').alias('Status').cast(ps.Int32, strict=False))

## Fill Missing Values

In [80]:
dataset: pd.DataFrame = dataset.to_pandas()

In [81]:
dataset.isnull().any()

SiteName               False
County                 False
AQI                     True
Pollutant               True
Status                  True
SO2                     True
CO                      True
CO_8hr                  True
O3                      True
O3_8hr                  True
PM10                    True
PM10_AVG                True
PM2.5                   True
PM2.5_AVG               True
NO2                     True
NOx                     True
NO                      True
WindSpeed               True
WindDirec               True
PublishTime            False
SO2_AVG                 True
Longitude              False
Latitude               False
SiteId                 False
PublishYearAndMonth    False
dtype: bool

In [82]:
dataset['AQI'] = dataset.groupby(['County', 'PublishYearAndMonth'])['AQI'].transform(
  lambda group: group.fillna(group.median()))

In [83]:
dataset['Pollutant'] = dataset.groupby(['County', 'PublishYearAndMonth'])['Pollutant'].transform(
  lambda group: group.fillna(group.median()))

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [84]:
dataset['Status'] = dataset.groupby(['County', 'PublishYearAndMonth'])['Status'].transform(
  lambda group: group.fillna(group.median()))

In [85]:
dataset['WindDirec'] = dataset.groupby(['County', 'PublishYearAndMonth'])['WindDirec'].transform(
  lambda group: group.fillna(group.median()))

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [86]:
dataset['SO2'] = dataset.groupby(['County', 'PublishYearAndMonth'])['SO2'].transform(
  lambda group: group.fillna(group.median()))

In [87]:
dataset.isnull().any()

SiteName               False
County                 False
AQI                    False
Pollutant               True
Status                  True
SO2                    False
CO                      True
CO_8hr                  True
O3                      True
O3_8hr                  True
PM10                    True
PM10_AVG                True
PM2.5                   True
PM2.5_AVG               True
NO2                     True
NOx                     True
NO                      True
WindSpeed               True
WindDirec               True
PublishTime            False
SO2_AVG                 True
Longitude              False
Latitude               False
SiteId                 False
PublishYearAndMonth    False
dtype: bool

# Data Analysis

In [88]:
sb.heatmap(dataset.corr(), annot=True)

ValueError: could not convert string to float: '菜寮'