# PM 2.5 Prediction

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
train_csv_path = "data/train.csv"
test_csv_path = "data/test.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

## rename column names

In [3]:
train_df = train_df.rename(columns={"日期": "date", "測站": "station", "測項": "obs_item"})

In [4]:
train_df

Unnamed: 0,date,station,obs_item,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014-01-01,豐原,AMB_TEMP,14,14,14,13,12,12,12,...,22,22,21,19,17,16,15,15,15,15
1,2014-01-01,豐原,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,2014-01-01,豐原,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,2014-01-01,豐原,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,2014-01-01,豐原,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4315,2014-12-20,豐原,THC,1.8,1.8,1.8,1.8,1.8,1.7,1.7,...,1.8,1.8,2,2.1,2,1.9,1.9,1.9,2,2
4316,2014-12-20,豐原,WD_HR,46,13,61,44,55,68,66,...,59,308,327,21,100,109,108,114,108,109
4317,2014-12-20,豐原,WIND_DIREC,36,55,72,327,74,52,59,...,18,311,52,54,121,97,107,118,100,105
4318,2014-12-20,豐原,WIND_SPEED,1.9,2.4,1.9,2.8,2.3,1.9,2.1,...,2.3,2.6,1.3,1,1.5,1,1.7,1.5,2,2


# check missing value

In [5]:
train_df.isnull().values.any()

False

In [6]:
train_df[
    train_df.isna().any(axis=1)
]  # select all rows with NaN under an entire DataFrame:

Unnamed: 0,date,station,obs_item,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23


### Covert value columns to numeric

In [7]:
val_cols = train_df.columns[3:]
train_df[val_cols] = train_df[val_cols].apply(pd.to_numeric, errors="coerce")
train_df

Unnamed: 0,date,station,obs_item,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014-01-01,豐原,AMB_TEMP,14.00,14.00,14.00,13.00,12.00,12.00,12.00,...,22.00,22.00,21.00,19.00,17.00,16.00,15.00,15.00,15.00,15.00
1,2014-01-01,豐原,CH4,1.80,1.80,1.80,1.80,1.80,1.80,1.80,...,1.80,1.80,1.80,1.80,1.80,1.80,1.80,1.80,1.80,1.80
2,2014-01-01,豐原,CO,0.51,0.41,0.39,0.37,0.35,0.30,0.37,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,2014-01-01,豐原,NMHC,0.20,0.15,0.13,0.12,0.11,0.06,0.10,...,0.10,0.13,0.14,0.23,0.18,0.12,0.10,0.09,0.10,0.08
4,2014-01-01,豐原,NO,0.90,0.60,0.50,1.70,1.80,1.50,1.90,...,2.50,2.20,2.50,2.30,2.10,1.90,1.50,1.60,1.80,1.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4315,2014-12-20,豐原,THC,1.80,1.80,1.80,1.80,1.80,1.70,1.70,...,1.80,1.80,2.00,2.10,2.00,1.90,1.90,1.90,2.00,2.00
4316,2014-12-20,豐原,WD_HR,46.00,13.00,61.00,44.00,55.00,68.00,66.00,...,59.00,308.00,327.00,21.00,100.00,109.00,108.00,114.00,108.00,109.00
4317,2014-12-20,豐原,WIND_DIREC,36.00,55.00,72.00,327.00,74.00,52.00,59.00,...,18.00,311.00,52.00,54.00,121.00,97.00,107.00,118.00,100.00,105.00
4318,2014-12-20,豐原,WIND_SPEED,1.90,2.40,1.90,2.80,2.30,1.90,2.10,...,2.30,2.60,1.30,1.00,1.50,1.00,1.70,1.50,2.00,2.00


In [8]:
train_df.isnull().values.any()

True

In [9]:
train_df[train_df.isna().any(axis=1)]

Unnamed: 0,date,station,obs_item,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
10,2014-01-01,豐原,RAINFALL,,,,,,,,...,,,,,,,,,,
28,2014-01-02,豐原,RAINFALL,,,,,,,,...,,,,,,,,,,
46,2014-01-03,豐原,RAINFALL,,,,,,,1.2,...,,,,,,,,,,
64,2014-01-04,豐原,RAINFALL,,,,,,,,...,,,,,,,,,,
82,2014-01-05,豐原,RAINFALL,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4240,2014-12-16,豐原,RAINFALL,,,,,,,,...,,,,,,,,,,
4258,2014-12-17,豐原,RAINFALL,,,,,,,,...,,,,,,,,,,
4276,2014-12-18,豐原,RAINFALL,,,,,,,,...,0.0,,,,,,,,,
4294,2014-12-19,豐原,RAINFALL,,,,,,,,...,0.2,0.2,,,,,,0.2,,0.4


In [10]:
train_df[train_df.isna().any(axis=1)]["obs_item"].unique()

array(['RAINFALL'], dtype=object)

In [11]:
np.unique(train_df[train_df["obs_item"] == "RAINFALL"].iloc[:, 3:].values.flatten())

array([ 0. ,  0.2,  0.4,  0.6,  0.8,  1. ,  1.2,  1.4,  1.6,  1.8,  2. ,
        2.2,  2.4,  2.6,  2.8,  3. ,  3.2,  3.4,  3.6,  3.8,  4. ,  4.2,
        4.6,  4.8,  5. ,  5.4,  6.4,  6.8,  7. ,  7.2,  7.4,  7.6,  7.8,
        8.2,  8.4,  8.6,  9.2,  9.8, 10. , 11. , 12. , 13. , 14. , 15. ,
       17. , 18. , 19. , 20. , 21. , 23. , 27. , 38. , 56. , 66. , 74. ,
        nan])

In [12]:
train_df.iloc[train_df[train_df["obs_item"] == "RAINFALL"].index] = train_df[
    train_df["obs_item"] == "RAINFALL"
].fillna(0)

In [13]:
train_df.isnull().values.any()

False

In [14]:
train_df.iloc[:, 3:].values.dtype

dtype('float64')

# get rid of first 3 columns

In [15]:
train_data = train_df.iloc[:, 3:]

# get month data

In [16]:
date_gb = train_df.groupby(["date"])
print(f"total day:\n{date_gb.size()} \n")

check_obs_item = []
for date, group in date_gb:
    check_obs_item.append(group["obs_item"].values)
np.unique(check_obs_item)

total day:
date
2014-01-01    18
2014-01-02    18
2014-01-03    18
2014-01-04    18
2014-01-05    18
              ..
2014-12-16    18
2014-12-17    18
2014-12-18    18
2014-12-19    18
2014-12-20    18
Length: 240, dtype: int64 



  for date, group in date_gb:


array(['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10',
       'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC', 'WD_HR', 'WIND_DIREC',
       'WIND_SPEED', 'WS_HR'], dtype=object)

In [17]:
train_df[train_df["date"] == "2014-09-01"]["obs_item"]

2880      AMB_TEMP
2881           CH4
2882            CO
2883          NMHC
2884            NO
2885           NO2
2886           NOx
2887            O3
2888          PM10
2889         PM2.5
2890      RAINFALL
2891            RH
2892           SO2
2893           THC
2894         WD_HR
2895    WIND_DIREC
2896    WIND_SPEED
2897         WS_HR
Name: obs_item, dtype: object

In [18]:
fc = 18  # feature count

year_data = list()
for month in range(12):  # 0 - 11
    total_hr = 24 * 20
    temp = np.zeros((fc, total_hr))

    day_per_month = 20
    for day in range(day_per_month):
        hr_idx = 24 * day
        row_idx = 18 * 20 * month + 18 * day
        temp[:, hr_idx : hr_idx + 24] = train_data.iloc[row_idx : row_idx + 18]

    year_data.append(temp)

year_data = np.array(year_data)
year_data.shape

(12, 18, 480)

# partition 9hr interval as a x, and 10th hr as a y

In [26]:
x_all, y_all = list(), list()

for month in range(12):
    month_data = year_data[month]
    for hr_itv_idx in range(24 * 20 - 9):
        x = month_data[:, hr_itv_idx : hr_itv_idx + 9]
        y = month_data[9, hr_itv_idx + 9]  # pm2.5 is at row-9

        x_all.append(x)
        y_all.append(y)

x_all = np.array(x_all)
y_all = np.array(y_all)

x_all.shape, y_all.shape

((5652, 18, 9), (5652,))

### Train Valid Split

In [36]:
x_train = x_all[:int(x_all.shape[0] * 0.8)]
y_train = y_all[:int(y_all.shape[0] * 0.8)]

x_valid = x_all[int(x_all.shape[0] * 0.8):]
y_valid = y_all[int(y_all.shape[0] * 0.8):]

In [None]:
W = np.ones(1 + 18 * 9)
X = np.empty([n, W.size - 1])
for i in range(n):
    X[i] = x_train[i][4:10].reshape(1, -1)
# 添加 1
X = np.concatenate((np.ones([n, 1]), X), axis=1)

In [49]:
W = np.ones(1 + 18 * 9)
X = np.empty([n, W.size - 1])
X

NameError: name 'n' is not defined