# linear regression
## spring
[link](https://docs.google.com/presentation/d/1L1LwpKm5DxhHndiyyiZ3wJA2mKOJTQ2heKo45Me5yVg/edit#slide=id.g1eaee6a347_0_8)

## fall
[link](https://ntumlta.github.io/2017fall-ml-hw1/)

- hw1.sh
 - Python3.5+ required
 - Only (1)numpy (2)scipy (3)pandas are allowed
 - numpy.linalg.lstsq is forbidden.
 - Please handcraft "linear regression" using Gradient Descent
 - beat public simple baseline
 - For those who wish to load model instead of running whole training precess:
 - please upload your training code named train.py
 - as long as there are Gradient Descent Code in train.py, it's fine
- hw1_best.sh
 - Python3.5+ required
 - any library is allowed
 - meet the higher score you choose in kaggle
 
 ### Data 簡介

* [train.csv](./data/train.csv) : 每個月前20天每個小時的氣象資料(每小時有18種測資)。共12個月。
* [test.csv](./data/test.csv) : 排除train.csv中剩餘的資料，取連續9小時的資料當feature，預測第10小時的PM2.5值。總共取240筆不重複的test data。
* [sampleSubmission.csv](./data/sampleSubmission.csv)
* [testing答案](./data/ans.csv)

## linear regression

找出 $f$ 使得  
loss function() $$L(f) = \sum_{f} (\hat{y}^n - f(x))^2$$ 最小 $f^* = arg \displaystyle \min_{f} L(f)$  
又因為 $f(X) = b + w \cdot X$, where $X \in \Omega, b \in \Bbb{F}$
可以寫成 $$L(b, w) = \sum_{b, w} (\hat{y}^n - b - w \cdot X)^2$$
引此可以將題目變成找
$$b^*, w^* = arg \min_{b,w} \sum_{b, w} (\hat{y}^n - b - w \cdot X)^2$$


In [177]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

train = "./data/train.csv"
test = "./data/test.csv"

df = pd.read_csv(train, parse_dates={'date':[0]}, encoding='big5')
test_df = pd.read_csv(test, encoding='big5', header=None)

In [178]:
df.shape

(4320, 27)

In [179]:
df.head(18) #測項有18項

Unnamed: 0,date,測站,測項,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014-01-01,豐原,AMB_TEMP,14,14,14,13,12,12,12,...,22,22,21,19,17,16,15,15,15,15
1,2014-01-01,豐原,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,2014-01-01,豐原,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,2014-01-01,豐原,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,2014-01-01,豐原,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5
5,2014-01-01,豐原,NO2,16,9.2,8.2,6.9,6.8,3.8,6.9,...,11,11,22,28,19,12,8.1,7,6.9,6
6,2014-01-01,豐原,NOx,17,9.8,8.7,8.6,8.5,5.3,8.8,...,14,13,25,30,21,13,9.7,8.6,8.7,7.5
7,2014-01-01,豐原,O3,16,30,27,23,24,28,24,...,65,64,51,34,33,34,37,38,38,36
8,2014-01-01,豐原,PM10,56,50,48,35,25,12,4,...,52,51,66,85,85,63,46,36,42,42
9,2014-01-01,豐原,PM2.5,26,39,36,35,31,28,25,...,36,45,42,49,45,44,41,30,24,13


In [180]:
def rainfall(raindata):
    try:
        return float(raindata)
    except:
        return 0

In [181]:
grouped = df.groupby("測項")
# rainfall is label data
df.iloc[grouped.groups.get("RAINFALL"),3:] = \
            df.iloc[grouped.groups.get("RAINFALL"),3:].applymap(rainfall)
# change data types
df.iloc[:,3:] = df.iloc[:,3:].astype(np.float16)

In [182]:
df.head(18)

Unnamed: 0,date,測站,測項,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014-01-01,豐原,AMB_TEMP,14.0,14.0,14.0,13.0,12.0,12.0,12.0,...,22.0,22.0,21.0,19.0,17.0,16.0,15.0,15.0,15.0,15.0
1,2014-01-01,豐原,CH4,1.799805,1.799805,1.799805,1.799805,1.799805,1.799805,1.799805,...,1.799805,1.799805,1.799805,1.799805,1.799805,1.799805,1.799805,1.799805,1.799805,1.799805
2,2014-01-01,豐原,CO,0.509766,0.409912,0.389893,0.370117,0.350098,0.300049,0.370117,...,0.370117,0.370117,0.469971,0.689941,0.560059,0.449951,0.379883,0.350098,0.360107,0.320068
3,2014-01-01,豐原,NMHC,0.199951,0.150024,0.130005,0.119995,0.109985,0.059998,0.099976,...,0.099976,0.130005,0.140015,0.22998,0.180054,0.119995,0.099976,0.090027,0.099976,0.080017
4,2014-01-01,豐原,NO,0.899902,0.600098,0.5,1.700195,1.799805,1.5,1.900391,...,2.5,2.199219,2.5,2.300781,2.099609,1.900391,1.5,1.599609,1.799805,1.5
5,2014-01-01,豐原,NO2,16.0,9.203125,8.203125,6.898438,6.800781,3.800781,6.898438,...,11.0,11.0,22.0,28.0,19.0,12.0,8.101562,7.0,6.898438,6.0
6,2014-01-01,豐原,NOx,17.0,9.796875,8.703125,8.601562,8.5,5.300781,8.796875,...,14.0,13.0,25.0,30.0,21.0,13.0,9.703125,8.601562,8.703125,7.5
7,2014-01-01,豐原,O3,16.0,30.0,27.0,23.0,24.0,28.0,24.0,...,65.0,64.0,51.0,34.0,33.0,34.0,37.0,38.0,38.0,36.0
8,2014-01-01,豐原,PM10,56.0,50.0,48.0,35.0,25.0,12.0,4.0,...,52.0,51.0,66.0,85.0,85.0,63.0,46.0,36.0,42.0,42.0
9,2014-01-01,豐原,PM2.5,26.0,39.0,36.0,35.0,31.0,28.0,25.0,...,36.0,45.0,42.0,49.0,45.0,44.0,41.0,30.0,24.0,13.0


In [101]:
# normalize
# for key, index in grouped.groups.items():
#     item_values = grouped.get_group(key).iloc[:,3:].values.flatten()
#     item_avg = item_values.mean()
#     item_std = item_values.std()
#     df.iloc[index,3:] = df.iloc[index,3:]-item_avg
#     df.iloc[index,3:] = df.iloc[index,3:]/item_std
#     print(key, item_avg, item_std)


AMB_TEMP 22.5274131944 6.28960619484
CH4 1.70239583333 0.125254558508
CO 0.388362847222 0.323544567397
NMHC 0.140427083333 0.104635735025
NO 2.13572916667 2.2819569657
NO2 10.1259895833 6.18701792958
NOx 12.2477256944 7.57647572496
O3 31.90546875 18.7018620848
PM10 42.7092013889 26.2200156539
PM2.5 21.4142361111 16.661090904
RAINFALL 0.200625 2.0452654401
RH 73.2291666667 13.3601910488
SO2 2.763125 1.81678266784
THC 1.83965277778 0.181822995407
WD_HR 156.329270833 95.7375696937
WIND_DIREC 158.482795139 94.6892111736
WIND_SPEED 2.29723958333 1.06531539649
WS_HR 1.71276041667 1.06259085472


In [183]:
grouped_bydate = df.groupby('date')

In [184]:
# 取連續9小時的資料當feature，預測第10小時的PM2.5
X_train = []
y_train = []
for date, index in grouped_bydate.groups.items():
    print(date)
    X_train.append(grouped_bydate.get_group(date).iloc[:,3:12].values.flatten())
    y_train.append(grouped_bydate.get_group(date).iloc[9,13])

2014-01-01 00:00:00
2014-01-02 00:00:00
2014-01-03 00:00:00
2014-01-04 00:00:00
2014-01-05 00:00:00
2014-01-06 00:00:00
2014-01-07 00:00:00
2014-01-08 00:00:00
2014-01-09 00:00:00
2014-01-10 00:00:00
2014-01-11 00:00:00
2014-01-12 00:00:00
2014-01-13 00:00:00
2014-01-14 00:00:00
2014-01-15 00:00:00
2014-01-16 00:00:00
2014-01-17 00:00:00
2014-01-18 00:00:00
2014-01-19 00:00:00
2014-01-20 00:00:00
2014-02-01 00:00:00
2014-02-02 00:00:00
2014-02-03 00:00:00
2014-02-04 00:00:00
2014-02-05 00:00:00
2014-02-06 00:00:00
2014-02-07 00:00:00
2014-02-08 00:00:00
2014-02-09 00:00:00
2014-02-10 00:00:00
2014-02-11 00:00:00
2014-02-12 00:00:00
2014-02-13 00:00:00
2014-02-14 00:00:00
2014-02-15 00:00:00
2014-02-16 00:00:00
2014-02-17 00:00:00
2014-02-18 00:00:00
2014-02-19 00:00:00
2014-02-20 00:00:00
2014-03-01 00:00:00
2014-03-02 00:00:00
2014-03-03 00:00:00
2014-03-04 00:00:00
2014-03-05 00:00:00
2014-03-06 00:00:00
2014-03-07 00:00:00
2014-03-08 00:00:00
2014-03-09 00:00:00
2014-03-10 00:00:00


In [185]:
type(X_train[0])

numpy.ndarray

In [186]:
test = np.matrix(X_train)

In [187]:
test_std = (test-test.mean(axis=0))/test.std(0)

In [197]:
class standardize():
    mean = None
    std = None
    def __init__(self):
        pass
    def fit(self, X):
        '''X as matrix like'''
        X = np.matrix(X)
        self.mean = X.mean(0)
        self.std = X.std(0)
    def transfer(self, X):
        try:
            return (X-self.mean)/self.std
        except:
            print("dimesion may be same of fitting")
            

## gradient descent
$x_{n+1} = x_n - \eta_n\nabla{f(x_n)} $ s.t. $f(x_{n+1})\le f(x_n))$
現在 $f = L(W,b) = \displaystyle\sum_{X} (\hat{y} - b - W \cdot X)^2$因此
$$\begin{aligned}
\nabla L(w,b) &= 
    \begin{bmatrix}
    {\partial L(w_1,b)}\over{\partial w}  \\
    \vdots \\
    {\partial L(w,b)}\over{\partial b}
    \end{bmatrix}\\
 &=
    \begin{bmatrix}
     \sum_{X} -2 x_i (\hat{y} -b - W \cdot X) \\
     \vdots \\
     \sum_{X} -2 (\hat{y} - b - W\cdot X)
    \end{bmatrix}\\
\end{aligned}
$$  


In [135]:
# 
def loss_func( b, W, X=X_train, y=y_train):
    sum = 0
    w = np.array(W)
    for idx in range(len(y)):
        sum = sum + ( y[idx] - b - np.dot(w, X[idx]))**2
    return sum

In [136]:
X_train[0].size

162

In [137]:
W = np.zeros(163)

loss_func(W[0],W[1:])

173399.0

In [138]:
def grad( b, W, X=X_train, y=y_train):
    """
    input b scale, W array_like
    ouput grad L(b,W)
    """
    card_datas = len(y)
    #b_array = np.full(card_datas, b)
    W = np.array(W)
    #X = np.ndarray(X)
    #y = np.array(y)
    feature_length = len(W)
    b_sum = 0
    w_sum = np.zeros(feature_length)
    for idx in range(card_datas):
        err = (y[idx] - b - np.dot(W, X[idx]))
        b_sum = b_sum - 2*err
        w_sum = w_sum - 2*err*X[idx]
    return (b_sum, w_sum)

In [139]:
b = 0
W = np.zeros(162)
eta = 0.00001
loss_0 = loss_func(b,W)
loss_1 = loss_0 + 1 
for i in range(20):
    gb, gW = grad(b,W)
    b = b - eta*gb
    W = W - eta*gW
    print(loss_func(b,W))

408176710717.0
1.58446311228e+18
6.15129494815e+24
2.38809168838e+31
9.27118917312e+37
3.59931526507e+44
1.39734721571e+51
5.42486305715e+57
2.10607205267e+64
8.176316055e+70
3.17425722195e+77
1.23232869709e+84
4.78421851632e+90
1.85735728348e+97
7.21074103688e+103
2.79939604315e+110
1.08679789862e+117
4.219230342e+123
1.63801427123e+130
6.35919477075e+136


In [143]:
b = 0
W = np.zeros(162)
eta = 0.00001
loss_0 = loss_func(b,W)
loss_1 = loss_0 -1 
count = 0
while True:
    tmp_err = loss_0 - loss_1 #error test avoid the loop too long
    gb, gW = grad(b,W)
    b = b - eta*gb
    W = W - eta*gW
    loss_0 = loss_1
    loss_1 = loss_func(b,W)
    count += 1
    if abs(tmp_err - loss_0 + loss_1) < 1e-6 or count > 5000:
        break
print(loss_0, loss_1)

  
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app
  


nan nan


In [142]:
count

1

In [20]:
def haty(b, W, X):
    return b + np.dot(W,X)

In [21]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,id_0,AMB_TEMP,15.0,14.0,14.0,13.0,13.0,13.0,13.0,13.0,12.0
1,id_0,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,id_0,CO,0.36,0.35,0.34,0.33,0.33,0.34,0.34,0.37,0.42
3,id_0,NMHC,0.11,0.09,0.09,0.1,0.1,0.1,0.1,0.11,0.12
4,id_0,NO,0.6,0.4,0.3,0.3,0.3,0.7,0.8,0.8,0.9


In [117]:
def rainfall(raindata):
    '''rain data is object'''
    try:
        return float(raindata)
    except:
        return 0
    
def data_trans(df, item_index = 2, date_index = 0):
    '''transfer csv file to X,y forms with normalize
    here supports partitiion of datas is connected.
    train data item:2 date:0
    test data item:1 date:0'''
    data_start = item_index + 1
    data_end = data_start + 9 #
    columns = df.columns
    grouped_byitems = df.groupby(columns[item_index])
    grouped_bydate = df.groupby(columns[date_index])
    #trans object data to value
    df.iloc[grouped_byitems.groups.get("RAINFALL"), data_start:] = \
        df.iloc[grouped_byitems.groups.get("RAINFALL"), data_start:].\
        applymap(rainfall)
    # change data types
    df.iloc[:, data_start:] = df.iloc[:, data_start:].astype(float)
    #
    X_train = []
    y_train = []
    if data_end == 12:
        for date, index in grouped_bydate.groups.items():
            X_train.append(grouped_bydate.get_group(date).\
                           iloc[:,data_start:data_end].values.flatten())
            y_train.append(grouped_bydate.get_group(date).iloc[9,13])
    else:
        for date, index in grouped_bydate.groups.items():
            X_train.append(grouped_bydate.get_group(date).\
                           iloc[:,data_start:data_end].values.flatten())
    return  (X_train, y_train)


In [118]:
X, y = data_trans(df)

In [119]:
tX, ty = data_trans(test_df, item_index=1)

In [123]:
tX[0].size

162