In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LinearRegression as SKLLR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from linear_regression import CloseFormSol, GradientDescent, AdaGrad, GradientDescentBiasSep, AdaGradBiasSep

In [2]:
# Locations of the raw CSV files, relative to the notebook's working directory.
train_csv_path = 'data/train.csv'
test_csv_path = 'data/test.csv'

# Training table: read with its header row, give the three non-ASCII header
# columns English names, and replace every 'NR' cell with 0.
train_df = (
    pd.read_csv(train_csv_path)
    .rename(columns={"日期": "date", "測站": "station", "測項": "obs_item"})
    .replace('NR', 0)
)

# Test table: the file is read with header=None (no header row is consumed);
# the same 'NR' -> 0 replacement is applied.
test_df = pd.read_csv(test_csv_path, header=None).replace('NR', 0)

In [3]:
# Columns holding the measurements: everything after the first three columns
# of the training table and after the first two columns of the test table.
train_value_cols = train_df.columns[3:]
test_value_cols = test_df.columns[2:]

# Cast the measurement columns to numeric. errors='coerce' turns anything that
# cannot be parsed into NaN, which the checks printed below would reveal.
train_df[train_value_cols] = train_df[train_value_cols].apply(pd.to_numeric, errors='coerce')
test_df[test_value_cols] = test_df[test_value_cols].apply(pd.to_numeric, errors='coerce')

# Sanity checks: no NaN anywhere, and a single float dtype for the values.
print(f"NaN in train df = {train_df.isnull().values.any()}")
print(f"NaN in test df = {test_df.isnull().values.any()}")
print(f"train val dtype = {train_df[train_df.columns[3:]].values.dtype}")
print(f"test val dtype = {test_df[test_df.columns[2:]].values.dtype}")

NaN in train df = False
NaN in test df = False
train val dtype = float64
test val dtype = float64


In [4]:
def get_train_x_train_y(
    train_df: pd.DataFrame,
    feature_count: int = 18,
    days_per_month: int = 20,
    hours_per_day: int = 24,
    window: int = 9,
    target_row: int = 9,
    months: int = 12,
):
    """Turn the stacked measurement table into sliding-window regression samples.

    ``train_df`` must contain only the numeric hourly readings. Each day
    occupies ``feature_count`` consecutive rows (one per measured item) and
    ``hours_per_day`` columns; the days of a month are stacked below each
    other, and the months follow one another.

    For every month the days are joined along the time axis into a
    ``(feature_count, days_per_month * hours_per_day)`` matrix. Each run of
    ``window`` consecutive hours is flattened (row-major, i.e. item by item)
    into one sample, and the reading of row ``target_row`` in the hour right
    after the window is its label. Windows never span two months.

    Args:
        train_df: Numeric readings laid out as described above. Rows beyond
            ``months * days_per_month * feature_count`` are ignored.
        feature_count: Number of rows (measured items) per day block.
        days_per_month: Number of day blocks per month.
        hours_per_day: Number of columns (hours) per day block.
        window: Number of consecutive hours used as input features.
        target_row: Row inside a day block whose value is predicted
            (the original author's note says PM2.5 sits at row 9).
        months: Number of months stacked in ``train_df``.

    Returns:
        ``(train_x, train_y)`` where ``train_x`` has shape
        ``(months * (days_per_month * hours_per_day - window),
        feature_count * window)`` and ``train_y`` holds one label per sample.

    Raises:
        ValueError: If the frame has too few rows, the wrong number of
            columns, or if ``window`` / ``target_row`` are out of range.
    """
    total_hr = days_per_month * hours_per_day
    rows_needed = months * days_per_month * feature_count

    # Fail early with a readable message instead of an opaque broadcasting
    # error (or, worse, a silent broadcast of a 1-row / 1-column slice).
    if len(train_df) < rows_needed:
        raise ValueError(
            f"train_df has {len(train_df)} rows, expected at least {rows_needed}"
        )
    if train_df.shape[1] != hours_per_day:
        raise ValueError(
            f"train_df has {train_df.shape[1]} columns, expected {hours_per_day}"
        )
    if not 0 <= target_row < feature_count:
        raise ValueError(f"target_row must be in [0, {feature_count}), got {target_row}")
    if not 1 <= window < total_hr:
        raise ValueError(f"window must be in [1, {total_hr}), got {window}")

    # (month, day, item, hour) -> (month, item, day, hour) -> (month, item, hour-of-month):
    # this lines up the days of each month side by side along the time axis.
    values = train_df.iloc[:rows_needed].to_numpy(dtype=float)
    year_data = (
        values.reshape(months, days_per_month, feature_count, hours_per_day)
        .transpose(0, 2, 1, 3)
        .reshape(months, feature_count, total_hr)
    )

    train_x, train_y = list(), list()
    for month_data in year_data:
        # The last `window` hours of a month have no following hour to predict.
        for start in range(total_hr - window):
            train_x.append(month_data[:, start : start + window].flatten())
            train_y.append(month_data[target_row, start + window])

    train_x, train_y = np.array(train_x), np.array(train_y)
    print(f"train_x, shape = {train_x.shape}")
    print(f"train_y, shape = {train_y.shape}")

    return train_x, train_y


def get_test_x(test_df: pd.DataFrame, feature_count: int = 18, meta_cols: int = 2):
    """Flatten each block of test rows into a single feature vector.

    The table is consumed in consecutive blocks of ``feature_count`` rows.
    Within a block the first ``meta_cols`` columns are skipped and the
    remaining values are flattened row by row into one sample.

    Args:
        test_df: Test table whose rows come in blocks of ``feature_count``.
        feature_count: Number of rows that make up one sample.
        meta_cols: Number of leading columns to skip in every row.

    Returns:
        Array with one row per block; each row has
        ``feature_count * (number of remaining columns)`` entries.

    Raises:
        ValueError: If ``feature_count`` is not positive or the row count is
            not a multiple of ``feature_count`` (the final sample would be
            shorter than the others).
    """
    if feature_count < 1:
        raise ValueError(f"feature_count must be positive, got {feature_count}")
    if len(test_df) % feature_count != 0:
        raise ValueError(
            f"test_df has {len(test_df)} rows, which is not a multiple of {feature_count}"
        )

    test_x = np.array([
        test_df.iloc[start : start + feature_count, meta_cols:].values.flatten()
        for start in range(0, len(test_df), feature_count)
    ])
    print(f"test_x, shape = {test_x.shape}")

    return test_x

In [5]:
# The window builder expects readings only, so drop the first three columns
# (renamed to date / station / obs_item when the training file was loaded).
train_values = train_df.iloc[:, 3:]
train_x, train_y = get_train_x_train_y(train_values)

# get_test_x skips the leading columns of the test table itself.
test_x = get_test_x(test_df)

train_x, shape = (5652, 162)
train_y, shape = (5652,)
test_x, shape = (240, 162)


# Closed-Form Solution

In [6]:
# Fit the project's closed-form solver on the raw (unscaled) training windows
# and predict the test windows. `cfs_pred` is the reference every later model
# in this notebook is compared against.
cfs = CloseFormSol()
cfs.fit(train_x, train_y)
cfs_pred = cfs.predict(test_x)

# Sklearn Linear Regression

In [7]:
# scikit-learn's ordinary least squares as an independent reference.
# Estimator.fit returns the estimator itself, so construction and fitting
# can be written as one expression.
skl_lr = SKLLR().fit(train_x, train_y)
skl_pred = skl_lr.predict(test_x)

In [8]:
# Mean squared difference between the two sets of test predictions; a value
# of ~0 means the closed-form solver agrees with scikit-learn.
diff = mean_squared_error(y_true=cfs_pred, y_pred=skl_pred)
print(f"Diffence between Close Form Solution and Sklearn Linear Regression = {round(diff, 3)}")

Diffence between Close Form Solution and Sklearn Linear Regression = 0.0


# My Own Linear Regression

### Gradient Descent

### Bias folded into the weight vector w

In [9]:
# Gradient descent from the project module, trained on the raw (unscaled)
# features. `iteration` and `lr` are presumably the number of update steps and
# the learning rate -- confirm against linear_regression.GradientDescent.
# fit() logs the cost every 1000 iterations (see the output below).
gd = GradientDescent(iteration=10000, lr=1e-6)
gd.fit(train_x, train_y)

Iteration    0: Cost  732.851   
Iteration 1000: Cost   73.409   
Iteration 2000: Cost   61.568   
Iteration 3000: Cost   55.562   
Iteration 4000: Cost   51.737   
Iteration 5000: Cost   49.037   
Iteration 6000: Cost   47.016   
Iteration 7000: Cost   45.441   
Iteration 8000: Cost   44.177   
Iteration 9000: Cost   43.138   


In [10]:
# Predict the test windows with the gradient-descent model and measure the
# mean squared difference from the closed-form predictions.
gd_pred = gd.predict(test_x)
diff = mean_squared_error(y_true=cfs_pred, y_pred=gd_pred)
print(f"Diffence between Close Form Solution and My Gradient Descent = {round(diff, 3)}")

Diffence between Close Form Solution and My Gradient Descent = 11.054


### Separate bias solution

In [11]:
# Same hyper-parameters as the plain GradientDescent run, using the project's
# "BiasSep" variant (judging by the name, the bias is kept as a separate
# parameter instead of being part of w -- confirm in linear_regression.py).
# The logged cost trace is identical to the plain GradientDescent run.
gd_bs = GradientDescentBiasSep(iteration=10000, lr=1e-6)
gd_bs.fit(train_x, train_y)

Iteration    0: Cost  732.851   
Iteration 1000: Cost   73.409   
Iteration 2000: Cost   61.568   
Iteration 3000: Cost   55.562   
Iteration 4000: Cost   51.737   
Iteration 5000: Cost   49.037   
Iteration 6000: Cost   47.016   
Iteration 7000: Cost   45.441   
Iteration 8000: Cost   44.177   
Iteration 9000: Cost   43.138   


In [12]:
# Predict with the separate-bias gradient-descent model and measure the mean
# squared difference from the closed-form predictions.
gd_bs_pred = gd_bs.predict(test_x)
diff = mean_squared_error(y_true=cfs_pred, y_pred=gd_bs_pred)
print(f"Diffence between Close Form Solution and My Gradient Descent BS = {round(diff, 3)}")

Diffence between Close Form Solution and My Gradient Descent BS = 11.054


### AdaGrad

### Bias folded into the weight vector w

In [13]:
# AdaGrad optimiser from the project module on the raw (unscaled) features,
# with the same iteration count as the gradient-descent runs but a much larger
# `lr` (1.5 instead of 1e-6). fit() logs the cost every 1000 iterations.
ada_grad = AdaGrad(iteration=10000, lr=1.5)
ada_grad.fit(train_x, train_y)

Iteration    0: Cost  732.851   
Iteration 1000: Cost   49.866   
Iteration 2000: Cost   42.866   
Iteration 3000: Cost   39.827   
Iteration 4000: Cost   38.112   
Iteration 5000: Cost   37.002   
Iteration 6000: Cost   36.222   
Iteration 7000: Cost   35.642   
Iteration 8000: Cost   35.194   
Iteration 9000: Cost   34.837   


In [14]:
# Predict with the AdaGrad model and measure the mean squared difference from
# the closed-form predictions.
ada_pred = ada_grad.predict(test_x)
diff = mean_squared_error(y_true=cfs_pred, y_pred=ada_pred)
print(f"Diffence between Close Form Solution and My Adagrad = {round(diff, 3)}")

Diffence between Close Form Solution and My Adagrad = 2.805


### Separate bias solution

In [15]:
# AdaGrad with the project's "BiasSep" variant (presumably the bias is a
# separate parameter -- confirm in linear_regression.py), same settings as the
# plain AdaGrad run. The logged cost trace is identical to that run.
ada_grad_bs = AdaGradBiasSep(iteration=10000, lr=1.5)
ada_grad_bs.fit(train_x, train_y)

Iteration    0: Cost  732.851   
Iteration 1000: Cost   49.866   
Iteration 2000: Cost   42.866   
Iteration 3000: Cost   39.827   
Iteration 4000: Cost   38.112   
Iteration 5000: Cost   37.002   
Iteration 6000: Cost   36.222   
Iteration 7000: Cost   35.642   
Iteration 8000: Cost   35.194   
Iteration 9000: Cost   34.837   


In [16]:
# Predict with the separate-bias AdaGrad model and measure the mean squared
# difference from the closed-form predictions.
ada_bs_pred = ada_grad_bs.predict(test_x)
diff = mean_squared_error(y_true=cfs_pred, y_pred=ada_bs_pred)
print(f"Diffence between Close Form Solution and My Adagrad BS = {round(diff, 3)}")

Diffence between Close Form Solution and My Adagrad BS = 2.805


### AdaGrad with Normalization

In [17]:
# Standardise every feature column using statistics learned from the training
# set only, then apply that same fitted transform to both splits so the test
# data never influences the scaling.
scaler = StandardScaler().fit(train_x)
train_x_norm = scaler.transform(train_x)
test_x_norm = scaler.transform(test_x)

In [18]:
# Same AdaGradBiasSep settings as before, but trained on the standardised
# features. The logged cost drops much faster than on the raw features and
# ends close to the closed-form cost (compare the outputs).
ada_grad_norm = AdaGradBiasSep(iteration=10000, lr=1.5)
ada_grad_norm.fit(train_x_norm, train_y)

Iteration    0: Cost  732.851   
Iteration 1000: Cost   33.992   
Iteration 2000: Cost   32.764   
Iteration 3000: Cost   32.445   
Iteration 4000: Cost   32.346   
Iteration 5000: Cost   32.313   
Iteration 6000: Cost   32.300   
Iteration 7000: Cost   32.294   
Iteration 8000: Cost   32.290   
Iteration 9000: Cost   32.287   


In [19]:
# The model was trained on standardised inputs, so it must be given the
# standardised test set as well. The variable name `ada_nrom_pred` (sic) is
# kept unchanged in case other cells refer to it.
ada_nrom_pred = ada_grad_norm.predict(test_x_norm)
diff = mean_squared_error(y_true=cfs_pred, y_pred=ada_nrom_pred)
print(f"Diffence between Close Form Solution and My Adagrad Norm = {round(diff, 3)}")

Diffence between Close Form Solution and My Adagrad Norm = 0.266


# What happens if we start from the best w and b?

In [20]:
# Warm-start gradient descent from the closed-form solution to check that it
# stays at the optimum (the logged cost should not move).
# NOTE(review): this assumes cfs.w stores the bias at index 0 followed by the
# feature weights -- confirm against CloseFormSol in linear_regression.py.
best_w = cfs.w
gd_sp = GradientDescentBiasSep(iteration=10000, lr=1e-6)
gd_sp.fit(train_x, train_y, init_w=best_w[1:], init_b=best_w[0])

Iteration    0: Cost   32.257   
Iteration 1000: Cost   32.257   
Iteration 2000: Cost   32.257   
Iteration 3000: Cost   32.257   
Iteration 4000: Cost   32.257   
Iteration 5000: Cost   32.257   
Iteration 6000: Cost   32.257   
Iteration 7000: Cost   32.257   
Iteration 8000: Cost   32.257   
Iteration 9000: Cost   32.257   


In [21]:
# Predict with the warm-started model and measure the mean squared difference
# from the closed-form predictions it was initialised from.
gd_sp_pred = gd_sp.predict(test_x)
diff = mean_squared_error(y_true=cfs_pred, y_pred=gd_sp_pred)
print(f"Diffence between Close Form Solution and My Gradient Descent SP = {round(diff, 3)}")

Diffence between Close Form Solution and My Gradient Descent SP = 0.0
