# linear regression
## spring
[link](https://docs.google.com/presentation/d/1L1LwpKm5DxhHndiyyiZ3wJA2mKOJTQ2heKo45Me5yVg/edit#slide=id.g1eaee6a347_0_8)

## fall
[link](https://ntumlta.github.io/2017fall-ml-hw1/)

- hw1.sh
 - Python3.5+ required
 - Only (1)numpy (2)scipy (3)pandas are allowed
 - numpy.linalg.lstsq is forbidden.
 - Please handcraft "linear regression" using Gradient Descent
 - beat public simple baseline
 - For those who wish to load model instead of running whole training precess:
 - please upload your training code named train.py
 - as long as there are Gradient Descent Code in train.py, it's fine
- hw1_best.sh
 - Python3.5+ required
 - any library is allowed
 - meet the higher score you choose in kaggle
 
 ### Data 簡介

* [train.csv](./data/train.csv) : 每個月前20天每個小時的氣象資料(每小時有18種測資)。共12個月。
* [test.csv](./data/test.csv) : 排除train.csv中剩餘的資料，取連續9小時的資料當feature，預測第10小時的PM2.5值。總共取240筆不重複的test data。
* [sampleSubmission.csv](./data/sampleSubmission.csv)
* [testing答案](./data/ans.csv)

## linear regression

找出 $f$ 使得  
loss function() $$L(f) = \sum_{f} (\hat{y}^n - f(x))^2$$ 最小 $f^* = arg \displaystyle \min_{f} L(f)$  
又因為 $f(X) = b + w \cdot X$, where $X \in \Omega, b \in \Bbb{F}$
可以寫成 $$L(b, w) = \sum_{b, w} (\hat{y}^n - b - w \cdot X)^2$$
引此可以將題目變成找
$$b^*, w^* = arg \min_{b,w} \sum_{b, w} (\hat{y}^n - b - w \cdot X)^2$$

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

train = "./data/train.csv"
test = "./data/test.csv"

df = pd.read_csv(train, parse_dates={'date':[0]}, encoding='big5')
test_df = pd.read_csv(test, encoding='big5')

In [33]:
df.shape

(4320, 27)

In [2]:
df.head(18) #測項有18項

Unnamed: 0,date,測站,測項,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014-01-01,豐原,AMB_TEMP,14,14,14,13,12,12,12,...,22,22,21,19,17,16,15,15,15,15
1,2014-01-01,豐原,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,2014-01-01,豐原,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,2014-01-01,豐原,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,2014-01-01,豐原,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5
5,2014-01-01,豐原,NO2,16,9.2,8.2,6.9,6.8,3.8,6.9,...,11,11,22,28,19,12,8.1,7,6.9,6
6,2014-01-01,豐原,NOx,17,9.8,8.7,8.6,8.5,5.3,8.8,...,14,13,25,30,21,13,9.7,8.6,8.7,7.5
7,2014-01-01,豐原,O3,16,30,27,23,24,28,24,...,65,64,51,34,33,34,37,38,38,36
8,2014-01-01,豐原,PM10,56,50,48,35,25,12,4,...,52,51,66,85,85,63,46,36,42,42
9,2014-01-01,豐原,PM2.5,26,39,36,35,31,28,25,...,36,45,42,49,45,44,41,30,24,13


In [3]:
def rainfall(raindata):
    try:
        return float(raindata)
    except:
        return 0

In [5]:
grouped = df.groupby("測項")
# rainfall is label data
df.iloc[grouped.groups.get("RAINFALL"),3:] = \
            df.iloc[grouped.groups.get("RAINFALL"),3:].applymap(rainfall)
# change data types
df.iloc[:,3:] = df.iloc[:,3:].astype(float)

In [6]:
df.head(18)

Unnamed: 0,date,測站,測項,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014-01-01,豐原,AMB_TEMP,14.0,14.0,14.0,13.0,12.0,12.0,12.0,...,22.0,22.0,21.0,19.0,17.0,16.0,15.0,15.0,15.0,15.0
1,2014-01-01,豐原,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,2014-01-01,豐原,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,2014-01-01,豐原,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,2014-01-01,豐原,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5
5,2014-01-01,豐原,NO2,16.0,9.2,8.2,6.9,6.8,3.8,6.9,...,11.0,11.0,22.0,28.0,19.0,12.0,8.1,7.0,6.9,6.0
6,2014-01-01,豐原,NOx,17.0,9.8,8.7,8.6,8.5,5.3,8.8,...,14.0,13.0,25.0,30.0,21.0,13.0,9.7,8.6,8.7,7.5
7,2014-01-01,豐原,O3,16.0,30.0,27.0,23.0,24.0,28.0,24.0,...,65.0,64.0,51.0,34.0,33.0,34.0,37.0,38.0,38.0,36.0
8,2014-01-01,豐原,PM10,56.0,50.0,48.0,35.0,25.0,12.0,4.0,...,52.0,51.0,66.0,85.0,85.0,63.0,46.0,36.0,42.0,42.0
9,2014-01-01,豐原,PM2.5,26.0,39.0,36.0,35.0,31.0,28.0,25.0,...,36.0,45.0,42.0,49.0,45.0,44.0,41.0,30.0,24.0,13.0


In [7]:
# normalize
for key, index in grouped.groups.items():
    item_avg = grouped.get_group(key).mean().mean()
    df.iloc[index,3:] = df.iloc[index,3:]/item_avg
    print(key, item_avg)

AMB_TEMP 22.52741319444444
CH4 1.702395833333332
CO 0.3883628472222222
NMHC 0.1404270833333333
NO 2.1357291666666662
NO2 10.125989583333334
NOx 12.247725694444448
O3 31.905468749999994
PM10 42.709201388888886
PM2.5 21.41423611111111
RAINFALL 0.20062500000000003
RH 73.22916666666667
SO2 2.763125
THC 1.8396527777777762
WD_HR 156.3292708333333
WIND_DIREC 158.48279513888886
WIND_SPEED 2.2972395833333334
WS_HR 1.7127604166666668


In [12]:
df.head()

Unnamed: 0,date,測站,測項,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014-01-01,豐原,AMB_TEMP,0.621465,0.621465,0.621465,0.577075,0.532684,0.532684,0.532684,...,0.976588,0.976588,0.932198,0.843417,0.754636,0.710246,0.665855,0.665855,0.665855,0.665855
1,2014-01-01,豐原,CH4,1.057333,1.057333,1.057333,1.057333,1.057333,1.057333,1.057333,...,1.057333,1.057333,1.057333,1.057333,1.057333,1.057333,1.057333,1.057333,1.057333,1.057333
2,2014-01-01,豐原,CO,1.313205,1.055714,1.004216,0.952717,0.901219,0.772473,0.952717,...,0.952717,0.952717,1.210208,1.776689,1.44195,1.15871,0.978466,0.901219,0.926968,0.823972
3,2014-01-01,豐原,NMHC,1.424227,1.06817,0.925747,0.854536,0.783325,0.427268,0.712113,...,0.712113,0.925747,0.996959,1.637861,1.281804,0.854536,0.712113,0.640902,0.712113,0.569691
4,2014-01-01,豐原,NO,0.421402,0.280934,0.234112,0.795981,0.842803,0.702336,0.889626,...,1.17056,1.030093,1.17056,1.076916,0.983271,0.889626,0.702336,0.749159,0.842803,0.702336


In [13]:
grouped_bydate = df.groupby('date')

In [35]:
# 取連續9小時的資料當feature，預測第10小時的PM2.5
X_train = []
y_train = []
for date, index in grouped_bydate.groups.items():
    print(date)
    X_train.append(grouped_bydate.get_group(date).iloc[:,3:12].values.flatten())
    y_train.append(grouped_bydate.get_group(date).iloc[9,13])

2014-01-01 00:00:00
2014-01-02 00:00:00
2014-01-03 00:00:00
2014-01-04 00:00:00
2014-01-05 00:00:00
2014-01-06 00:00:00
2014-01-07 00:00:00
2014-01-08 00:00:00
2014-01-09 00:00:00
2014-01-10 00:00:00
2014-01-11 00:00:00
2014-01-12 00:00:00
2014-01-13 00:00:00
2014-01-14 00:00:00
2014-01-15 00:00:00
2014-01-16 00:00:00
2014-01-17 00:00:00
2014-01-18 00:00:00
2014-01-19 00:00:00
2014-01-20 00:00:00
2014-02-01 00:00:00
2014-02-02 00:00:00
2014-02-03 00:00:00
2014-02-04 00:00:00
2014-02-05 00:00:00
2014-02-06 00:00:00
2014-02-07 00:00:00
2014-02-08 00:00:00
2014-02-09 00:00:00
2014-02-10 00:00:00
2014-02-11 00:00:00
2014-02-12 00:00:00
2014-02-13 00:00:00
2014-02-14 00:00:00
2014-02-15 00:00:00
2014-02-16 00:00:00
2014-02-17 00:00:00
2014-02-18 00:00:00
2014-02-19 00:00:00
2014-02-20 00:00:00
2014-03-01 00:00:00
2014-03-02 00:00:00
2014-03-03 00:00:00
2014-03-04 00:00:00
2014-03-05 00:00:00
2014-03-06 00:00:00
2014-03-07 00:00:00
2014-03-08 00:00:00
2014-03-09 00:00:00
2014-03-10 00:00:00


## gradient descent
$x_{n+1} = x_n - \eta_n\nabla{f(x_n)} $

In [45]:
# 
def loss_func(b, W, X=X_train, y=y_train):
    sum = 0
    W = np.array(W)
    for idx in range(len(y)):
        sum = sum + ( y[idx] - b - np.dot(W, X[idx]))**2
    return sum

In [46]:
X_train[0].size

162

In [47]:
W = np.random.rand(162)

In [48]:
W = np.zeros(162)
W[0] = 1

In [49]:
for i in range(162):
    W = np.zeros(162)
    W[i] = 1
    print(loss_func(0, W))

161.027287253
161.825346457
162.449510911
162.174456562
161.258115971
161.052731767
161.226111619
161.071638703
157.57462876
134.177087224
132.819354204
132.792539603
132.279969905
131.104008719
131.800533787
131.999735856
133.057635325
130.217308162
126.833830021
130.000829241
130.902583552
127.892750016
123.699650094
117.771101666
104.301309147
118.772874829
143.778194166
162.802912897
161.794440786
167.241821134
151.100944772
142.311887246
143.817136754
131.73213034
148.707119176
224.779221817
205.044942523
213.28704724
215.624753813
199.766159834
180.747615864
183.924802194
227.083501233
1080.76337625
1843.81069307
153.046344892
156.421652286
156.057904677
150.247149081
138.662128598
132.982660134
121.060063613
134.686244796
134.970762321
153.145361008
158.862669487
160.817229467
153.556527939
141.295557633
135.981929624
123.759891379
183.156688367
241.928501141
171.170146883
174.798820835
176.502258147
183.713638193
187.804789501
189.828748714
190.405307449
189.028810495
165.67713