In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from linear_regression import get_z_score_norm, LinearRegression, k_fold_cross_validation, get_lr_grid, z_score_norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Paths are relative to the notebook's working directory.
train_csv_path = 'data/train.csv'
test_csv_path = 'data/test.csv'

train_df = pd.read_csv(train_csv_path)
# The test file is read without a header row, so its columns are 0, 1, 2, ...
test_df = pd.read_csv(test_csv_path, header=None)

# Give the three leading descriptor columns ASCII names.
train_df = train_df.rename(columns={"日期": "date", "測站": "station", "測項": "obs_item"})

# Everything after the three descriptor columns is a reading column.
# errors="coerce" turns any cell that cannot be parsed as a number into NaN.
val_cols = train_df.columns[3:]
train_df[val_cols] = train_df[val_cols].apply(pd.to_numeric, errors="coerce")

# Replace NaN readings with 0 in RAINFALL rows only. A boolean mask with .loc
# selects by label, so this does not depend on the frame having a default
# 0..n-1 index, and only the reading columns are rewritten.
rainfall_rows = train_df["obs_item"] == "RAINFALL"
train_df.loc[rainfall_rows, val_cols] = train_df.loc[rainfall_rows, val_cols].fillna(0)

# Reading columns only (descriptor columns dropped).
train_data = train_df.iloc[:, 3:]

# Sanity check: collect the obs_item sequence of every date group.
# Grouping by the scalar key "date" (not the one-element list ["date"]) keeps
# the iteration key a plain value and avoids the pandas FutureWarning about
# length-1 list groupers.
date_gb = train_df.groupby("date")

check_obs_item = [group["obs_item"].values for _, group in date_gb]
np.unique(check_obs_item)

fc = 18  # feature count

days_per_month = 20
hours_per_month = 24 * days_per_month

# For each of the 12 months, lay the 20 per-day blocks of `fc` rows x 24 hourly
# columns side by side, giving one (fc, 24 * 20) matrix per month.
year_data = []
for month in range(12):  # 0 - 11
    month_block = np.zeros((fc, hours_per_month))
    month_first_row = fc * days_per_month * month

    for day in range(days_per_month):
        src_row = month_first_row + fc * day
        dst_col = 24 * day
        month_block[:, dst_col : dst_col + 24] = train_data.iloc[src_row : src_row + fc]

    year_data.append(month_block)

year_data = np.array(year_data)
year_data.shape

# Slide a 9-hour window over each month's matrix. The features are all rows of
# the window; the target is row 9 (pm2.5) at the hour right after the window.
windows_per_month = 24 * 20 - 9

x_all = np.array([
    year_data[month][:, start : start + 9]
    for month in range(12)
    for start in range(windows_per_month)
])
y_all = np.array([
    year_data[month][9, start + 9]  # pm2.5 is at row-9
    for month in range(12)
    for start in range(windows_per_month)
])

x_all.shape, y_all.shape

  for date, group in date_gb:


((5652, 18, 9), (5652,))

In [3]:
# Columns 0 and 1 of the header-less test frame are descriptors (column 1 holds
# the observation item name); the remaining columns are readings.
val_cols = test_df.columns[2:]
test_df[val_cols] = test_df[val_cols].apply(pd.to_numeric, errors='coerce')

# Replace NaN readings with 0 in RAINFALL rows only. A boolean mask with .loc
# selects by label and touches only the reading columns, so this does not
# rely on the frame having a default 0..n-1 index.
rainfall_rows = test_df[1] == 'RAINFALL'
test_df.loc[rainfall_rows, val_cols] = test_df.loc[rainfall_rows, val_cols].fillna(0)

# One 2-D block of readings per value of column 0.
# NOTE(review): groupby sorts group keys by default, so the rows of X_test
# follow sorted key order, which may differ from file order if the keys are
# strings - confirm this matches the order expected for the predictions.
X_test = np.array([g_df.iloc[:, 2:].values for _, g_df in test_df.groupby(0)])

# Flatten each sample's block into a single feature vector.
m = X_test.shape[0]
X_test = X_test.reshape((m, -1))

In [4]:
# Flatten each (feature, hour) window into one feature vector.
# The shape unpack is guarded so that re-running this cell after x_all has
# already been flattened to 2-D does not raise on the three-way unpack.
if x_all.ndim == 3:
    _, feat_n, hr_n = x_all.shape
m = x_all.shape[0]
x_all = x_all.reshape((m, -1))

In [5]:
seed = 369

# Hold out 20% of the samples for validation; the fixed seed makes the
# shuffle reproducible across runs.
split_parts = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=seed,
)
X_train, X_valid, y_train, y_valid = split_parts

In [12]:
# Gradient-descent hyperparameters.
iteration = 20000
lr = 1e-6

# Fit the project's LinearRegression on the training split, passing the
# held-out split through the `validation` keyword.
validation_split = (X_valid, y_valid)
model = LinearRegression(X_train, y_train, iteration, lr, validation=validation_split)

# An alternative call, model.ada_grad(), is left disabled here.
# NOTE(review): the logged cost is still decreasing at the last report, so this
# run may not have converged - confirm before relying on the predictions.
final_w, final_b, history = model.gradient_descent()

Iteration    0: Cost  165.859   
Iteration 2000: Cost   36.499   
Iteration 4000: Cost   30.619   
Iteration 6000: Cost   27.623   
Iteration 8000: Cost   25.700   
Iteration 10000: Cost   24.334   
Iteration 12000: Cost   23.307   
Iteration 14000: Cost   22.504   
Iteration 16000: Cost   21.859   
Iteration 18000: Cost   21.328   


In [13]:
# Predict on the flattened test matrix using the model fitted above.
pred = model.predict(X_test)

In [14]:
# Load a saved prediction array from the working directory.
# NOTE(review): this file is not produced anywhere in this notebook; presumably
# it holds closed-form-solution predictions for the same test set - confirm
# its origin and that its row order matches X_test.
close_form_pred = np.load('close_form_pred.npy')

In [15]:
# Mean squared difference between the loaded reference predictions and the
# gradient-descent predictions (the metric is symmetric in its two arguments).
mean_squared_error(close_form_pred, pred)

745.1762363648523