In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Read the training table with an explicit, portable codec.  "ANSI" is only a
# Windows alias for the machine's locale code page, so it raises LookupError on
# Linux/macOS and mis-decodes the Chinese column headers on other locales.
# NOTE(review): the file is presumably Big5 (Traditional Chinese) - confirm.
df = pd.read_csv("train.csv", encoding="big5")
# Replace the non-numeric "NR" placeholder by 0 so every cell can be cast to float.
df = df.replace("NR", 0)
# Drop the three descriptive columns; only the 24 hourly value columns remain.
df = df.drop(columns=['日期','測項','測站']).astype(float)
df.head(18)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,14.0,14.0,14.0,13.0,12.0,12.0,12.0,12.0,15.0,17.0,...,22.0,22.0,21.0,19.0,17.0,16.0,15.0,15.0,15.0,15.0
1,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,0.51,0.41,0.39,0.37,0.35,0.3,0.37,0.47,0.78,0.74,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,0.2,0.15,0.13,0.12,0.11,0.06,0.1,0.13,0.26,0.23,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,0.9,0.6,0.5,1.7,1.8,1.5,1.9,2.2,6.6,7.9,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5
5,16.0,9.2,8.2,6.9,6.8,3.8,6.9,7.8,15.0,21.0,...,11.0,11.0,22.0,28.0,19.0,12.0,8.1,7.0,6.9,6.0
6,17.0,9.8,8.7,8.6,8.5,5.3,8.8,9.9,22.0,29.0,...,14.0,13.0,25.0,30.0,21.0,13.0,9.7,8.6,8.7,7.5
7,16.0,30.0,27.0,23.0,24.0,28.0,24.0,22.0,21.0,29.0,...,65.0,64.0,51.0,34.0,33.0,34.0,37.0,38.0,38.0,36.0
8,56.0,50.0,48.0,35.0,25.0,12.0,4.0,2.0,11.0,38.0,...,52.0,51.0,66.0,85.0,85.0,63.0,46.0,36.0,42.0,42.0
9,26.0,39.0,36.0,35.0,31.0,28.0,25.0,20.0,19.0,30.0,...,36.0,45.0,42.0,49.0,45.0,44.0,41.0,30.0,24.0,13.0


In [3]:
# Regroup the raw table month by month.  Every day occupies 18 consecutive rows
# (one per measured item) of 24 hourly columns, and 20 days are used per month,
# so each month becomes one (18, 480) array whose column index is day * 24 + hour.
raw_data = df.to_numpy()
month_data = {}
for month in range(12):
    first_row = 18 * 20 * month
    month_rows = raw_data[first_row : first_row + 18 * 20, :]
    # (day, item, hour) -> (item, day, hour) -> (item, day * 24 + hour)
    per_day = month_rows.reshape(20, 18, 24)
    month_data[month] = per_day.transpose(1, 0, 2).reshape(18, 20 * 24)

In [4]:
# Build sliding-window samples.  A month holds 480 consecutive hours, which gives
# 480 - 9 = 471 windows: the 18 x 9 values of nine consecutive hours form one
# feature row, and row 9 of the following (tenth) hour is the regression target.
x = np.empty([12 * 471, 18 * 9], dtype = float)
y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
    hourly = month_data[month]
    for start in range(471):
        row = month * 471 + start
        # Flatten item-major: 9 hours of item 0, then 9 hours of item 1, ...
        x[row, :] = hourly[:, start : start + 9].ravel()
        y[row, 0] = hourly[9, start + 9]
print(x.shape)
print(y.shape)

(5652, 162)
(5652, 1)


In [5]:
# Per-feature statistics of the training set; they are reused later to
# standardise the test set with exactly the same transformation.
train_mean = np.mean(x, axis=0)
train_std = np.std(x, axis=0)
# A feature that is constant over the whole training set has zero std, and
# dividing by it would turn the column into NaN.  Using 1 instead leaves such a
# column at 0 after centring.
train_std[train_std == 0] = 1

x = (x - train_mean) / train_std
# Prepend a column of ones so that w[0] acts as the bias term.
x = np.insert(x, 0, 1, axis=1)
print(x.shape)

(5652, 163)


## Training

In [195]:
# Hyper-parameters read by the loss function and the training loop.
lr = 100000      # numerator of the Adagrad step (the update later divides by the sample count)
reg = 0.05       # L2 regularisation strength
epoch = 10000    # number of full-batch updates
eps = 1e-13      # added under the square root of the Adagrad accumulator

# One weight per column of x, starting from zero, and a matching accumulator
# for the squared gradients.
w = np.zeros((x.shape[1], 1))
adagrad = np.zeros_like(w)

In [196]:
def cal_loss(x, w, targets=None, reg_strength=None):
    """Return the L2-regularised half mean squared error of a linear model.

    The value is ``0.5 * (sum((x.w - targets)**2) + reg_strength * sum(w**2)) / n``
    with ``n = x.shape[0]``.  The penalty is summed over every entry of ``w``.

    Parameters
    ----------
    x : 2-D array whose rows are samples.
    w : weight array that ``np.dot(x, w)`` can multiply.
    targets : array broadcastable against ``np.dot(x, w)``, optional.
        Defaults to the notebook-level ``y`` so existing ``cal_loss(x, w)``
        calls keep working; pass it explicitly to score another data set.
    reg_strength : float, optional.
        Defaults to the notebook-level ``reg``.

    Returns
    -------
    The scalar loss.
    """
    if targets is None:
        targets = y
    if reg_strength is None:
        reg_strength = reg
    residual = np.dot(x, w) - targets
    loss = 0.5 * (np.sum(np.power(residual, 2)) + reg_strength * np.sum(np.power(w, 2))) / x.shape[0]
    return loss

In [197]:
# Full-batch gradient descent with Adagrad scaling of the data gradient.  The
# regularisation term reg * w is added outside the Adagrad scaling, and the
# whole step is divided by the number of samples.
n_samples = x.shape[0]
report_every = epoch / 10
for t in range(epoch):
    residual = np.dot(x, w) - y
    gd = np.dot(x.T, residual)
    adagrad += gd ** 2
    adaptive_step = lr * gd / np.sqrt(adagrad + eps)
    w = w - (adaptive_step + reg * w) / n_samples

    # Report the regularised training loss ten times over the run.
    if (t + 1) % report_every == 0:
        loss = cal_loss(x, w)
        print("Epoch:",t+1,":", loss)

Epoch: 1000 : 17.35537361369146
Epoch: 2000 : 16.49010685088354
Epoch: 3000 : 16.26955181671212
Epoch: 4000 : 16.197732317838028
Epoch: 5000 : 16.171365116044914
Epoch: 6000 : 16.160517479330593
Epoch: 7000 : 16.155382204317053
Epoch: 8000 : 16.15251425495114
Epoch: 9000 : 16.15063141992855
Epoch: 10000 : 16.14922622127596


## Testing

In [198]:
# Read the test table with an explicit, portable codec.  "ANSI" is only a
# Windows alias for the machine's locale code page and raises LookupError on
# Linux/macOS.
# NOTE(review): the file is presumably Big5, like the training file - confirm.
df_test = pd.read_csv("test.csv", header=None, encoding="big5")
# Replace the non-numeric "NR" placeholder by 0 so every cell can be cast to float.
df_test = df_test.replace("NR", 0)
# Columns 0 and 1 are dropped; the nine remaining columns hold the numeric values.
df_test = df_test.drop(columns=[0,1]).astype(float)
df_test.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10
0,21.0,21.0,20.0,20.0,19.0,19.0,19.0,18.0,17.0
1,1.7,1.7,1.7,1.7,1.7,1.7,1.7,1.7,1.8
2,0.39,0.36,0.36,0.4,0.53,0.55,0.34,0.31,0.23
3,0.16,0.24,0.22,0.27,0.27,0.26,0.27,0.29,0.1
4,1.3,1.3,1.3,1.3,1.4,1.6,1.2,1.1,0.9


In [199]:
# Every test sample spans 18 consecutive rows of 9 values; flatten each 18 x 9
# block row by row into one 162-long feature vector, the same layout as the
# training rows.  4320 rows / 18 = 240 samples.
test_data = np.array([
    df_test.iloc[first:first + 18].to_numpy().flatten()
    for first in range(0, 4320, 18)
])

In [200]:
# Standardise with the training-set statistics, then prepend the bias column
# of ones so the layout matches the training matrix.
standardized = (test_data - train_mean) / train_std
bias_column = np.ones((standardized.shape[0], 1))
test_data = np.hstack([bias_column, standardized])

In [201]:
# One linear prediction per test row; each np.dot returns a length-1 array, so
# the stacked result has one row per sample and a single column.
predict = np.array([np.dot(sample, w) for sample in test_data])

In [202]:
# Submission row labels "id_0" ... "id_239", one per test sample.
id = np.array(["id_" + str(n) for n in range(240)])

In [203]:
# Assemble the submission table: one label and one predicted value per sample.
submit = pd.DataFrame(columns=['id', 'value'])
submit['id'] = id
submit['value'] = predict
submit.head()

Unnamed: 0,id,value
0,id_0,6.540573
1,id_1,18.171227
2,id_2,23.896967
3,id_3,7.727713
4,id_4,27.046812


In [204]:
# Write the predictions with a header row and without the DataFrame index.
submit.to_csv("submit.csv", index=False, header=True)