In [18]:
# Download the three YouBike CSV files from the project's GitHub repository into the
# current working directory (wget's default destination).
# NOTE(review): YouBike_Data_All.csv is not read by any cell visible in this notebook —
# confirm it is still needed.
!wget https://raw.githubusercontent.com/kevin900804/Deep_Learning_YouBike_Predict/main/YouBike_Data.csv
!wget https://raw.githubusercontent.com/kevin900804/Deep_Learning_YouBike_Predict/main/YouBike_Data_Test.csv
!wget https://raw.githubusercontent.com/kevin900804/Deep_Learning_YouBike_Predict/main/YouBike_Data_All.csv

--2022-07-19 09:01:36--  https://raw.githubusercontent.com/kevin900804/Deep_Learning_YouBike_Predict/main/YouBike_Data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2519 (2.5K) [text/plain]
Saving to: ‘YouBike_Data.csv’


2022-07-19 09:01:36 (55.6 MB/s) - ‘YouBike_Data.csv’ saved [2519/2519]

--2022-07-19 09:01:36--  https://raw.githubusercontent.com/kevin900804/Deep_Learning_YouBike_Predict/main/YouBike_Data_Test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2557 (2.5K) [text/plain]
Saving to: ‘YouBike_Data_Test.csv’


In [19]:
import math
import numpy as np

import pandas as pd
import os
import csv

from tqdm import tqdm

import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

In [20]:
def same_seed(seed):
    """Seed the NumPy and PyTorch RNGs and pin cuDNN to deterministic behaviour.

    Args:
        seed: integer used for NumPy, the PyTorch CPU generator and, when CUDA is
            available, every CUDA device generator.
    """
    # Seed every random source first ...
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # ... then disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def train_valid_split(data_set, valid_ratio, seed):
    """Randomly partition `data_set` into a training part and a validation part.

    Args:
        data_set: indexable collection of samples (anything `random_split` accepts).
        valid_ratio: fraction of samples assigned to validation; the count is
            int(valid_ratio * len(data_set)) and the remainder goes to training.
        seed: seed for the dedicated torch.Generator that drives the shuffle.

    Returns:
        (train, valid) as two NumPy arrays.
    """
    n_valid = int(valid_ratio * len(data_set))
    n_train = len(data_set) - n_valid
    # A private generator keeps the split reproducible regardless of global RNG state.
    rng = torch.Generator().manual_seed(seed)
    train_part, valid_part = random_split(data_set, [n_train, n_valid], generator=rng)
    return np.array(train_part), np.array(valid_part)

def predict(test_loader, model, device):
    """Run `model` in eval mode over every batch of `test_loader`.

    Args:
        test_loader: iterable yielding input batches (no targets).
        model: module used for inference; switched to eval mode here.
        device: device each batch is moved to before the forward pass.

    Returns:
        NumPy array holding the per-batch outputs concatenated along dim 0.
    """
    model.eval()
    batch_outputs = []
    # Gradients are never needed for inference, so disable tracking for the whole pass.
    with torch.no_grad():
        for batch in tqdm(test_loader):
            output = model(batch.to(device))
            batch_outputs.append(output.detach().cpu())
    return torch.cat(batch_outputs, dim=0).numpy()

In [21]:
class YouBikeDataset(Dataset):
    """Dataset wrapping feature rows and, optionally, their regression targets.

    Args:
        x: features, converted with torch.FloatTensor.
        y: targets, converted with torch.FloatTensor; pass None (the default) for
            prediction-only data.
    """

    def __init__(self, x, y=None):
        # Targets are optional: keep None so __getitem__ can tell the two modes apart.
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __len__(self):
        """Number of samples (rows of x)."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return x[idx] alone when there are no targets, else the pair (x[idx], y[idx])."""
        features = self.x[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

In [22]:
class My_Model(nn.Module):
    """Fully connected regressor: input_dim -> 16 -> 8 -> 4 -> 2 -> 1 with ReLU in between.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 16, 8, 4, 2]
        stack = []
        # Each hidden stage is Linear followed by ReLU; modules are created in the same
        # order as a hand-written Sequential, so parameter names stay layers.0, layers.2, ...
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Final projection to a single output, with no activation.
        stack.append(nn.Linear(widths[-1], 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to a (batch,) tensor of predictions."""
        return self.layers(x).squeeze(1)

In [23]:
# Rank every candidate input feature by its univariate f_regression score against the target.
import torch
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, f_regression

data = pd.read_csv('./YouBike_Data.csv')
# Columns 1..10 are the candidate features; column 11 is the regression target.
x = data[data.columns[1:11]]
y = data[data.columns[11]]
# Min-max scale each feature column to the [0, 1] range.
x = (x - x.min()) / (x.max() - x.min())

# k='all' keeps every column, so the selector is used purely to obtain scores.
# `bestfeatures` stays bound to this fitted selector (fit() returns the same object).
bestfeatures = SelectKBest(score_func=f_regression, k='all')
fit = bestfeatures.fit(x, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)

# Pair each column name with its score and show the full ranking, best first.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Variables', 'Score']
print(featureScores.nlargest(len(featureScores), 'Score'))

                                   Variables     Score
0                               rainfall(mm)  2.534780
6                           num_of_rainy_day  1.311336
3                               num_of_bikes  0.476641
5                  insolation_duration(hour)  0.263690
9                             last_12_months  0.221171
4                 total_bike_lane_length(km)  0.175166
1       monthly_average_temperature(Celsius)  0.144311
2                       num_of_bike_stations  0.112669
8  num_of_scooters_owned_per_thousand_people  0.070508
7      num_of_cars_owned_per_thousand_people  0.006026


In [24]:
def select_feat(train_data, valid_data, test_data, select_all=True, feat_idx=None):
    """Split raw 2-D arrays into feature matrices and targets, optionally keeping a column subset.

    Column 0 of every array is dropped (presumably the year_month identifier — confirm
    against the CSV). For train/valid the last column is the target; `test_data` has no
    target column, so all of its remaining columns are features.

    Args:
        train_data: 2-D array whose last column is the target.
        valid_data: 2-D array with the same layout as `train_data`.
        test_data: 2-D array with the same layout minus the target column.
        select_all: when True, keep every feature column and ignore `feat_idx`.
        feat_idx: feature-column indices (counted after column 0 is dropped) to keep
            when `select_all` is False. Defaults to [0, 6, 3, 5], the four top-ranked
            features in this notebook's SelectKBest output.

    Returns:
        (x_train, x_valid, x_test, y_train, y_valid).
    """
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, 1:-1], valid_data[:, 1:-1], test_data[:, 1:]

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    elif feat_idx is None:
        feat_idx = [0, 6, 3, 5]
    else:
        # Accept any iterable of indices (tuple, range, array) for fancy indexing.
        feat_idx = list(feat_idx)

    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid

In [25]:
def trainer(train_loader, valid_loader, model, config, device):
    """Fit `model` with Adam on an MSE objective, checkpointing the best validation epoch.

    Each epoch runs one pass over `train_loader`, then evaluates on `valid_loader`.
    Whenever the mean validation loss (mean of per-batch losses) improves, the model's
    state_dict is saved to config['save_path']. Training returns early once
    config['early_stop'] consecutive epochs pass without improvement. Mean train and
    validation losses are logged to TensorBoard under 'Loss/train' and 'Loss/valid'.

    Args:
        train_loader: iterable of (x, y) training batches.
        valid_loader: iterable of (x, y) validation batches.
        model: the nn.Module to optimise (updated in place).
        config: dict providing 'n_epochs', 'early_stop' and 'save_path'; an optional
            'weight_decay' entry overrides the default of 0.01.
        device: device the batches are moved to before the forward pass.

    Returns:
        None.
    """
    criterion = nn.MSELoss(reduction='mean')
    # NOTE(review): config['learning_rate'] is not consumed here, so Adam runs with its
    # library-default learning rate — confirm whether the configured value should apply.
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=config.get('weight_decay', 0.01))

    # Create the directory the checkpoint will actually be written to, instead of
    # assuming it is always ./models.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    writer = SummaryWriter()
    # try/finally guarantees the event file is flushed and closed on normal completion,
    # early stopping, or an interrupt.
    try:
        for epoch in range(n_epochs):
            model.train()
            loss_record = []

            train_pbar = tqdm(train_loader, position=0, leave=True)

            for x, y in train_pbar:
                optimizer.zero_grad()
                x, y = x.to(device), y.to(device)
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()
                optimizer.step()
                step += 1
                batch_loss = loss.detach().item()
                loss_record.append(batch_loss)

                train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            mean_train_loss = sum(loss_record)/len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            model.eval()
            loss_record = []
            with torch.no_grad():
                for x, y in valid_loader:
                    x, y = x.to(device), y.to(device)
                    pred = model(x)
                    loss = criterion(pred, y)
                    loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record)/len(loss_record)
            print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                # New best epoch: persist the weights and reset the patience counter.
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        writer.close()

In [26]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Experiment settings shared by the data-preparation and training cells.
config = dict(
    seed=4096,
    select_all=False,
    valid_ratio=0.2,
    n_epochs=10000,
    batch_size=50,
    learning_rate=1e-5,
    early_stop=1000,
    save_path='./models/model.ckpt',
)

In [27]:
# Seed the RNGs before anything random (the split, loader shuffling) takes place.
same_seed(config['seed'])

# Read both CSVs as plain NumPy arrays, then carve a validation subset out of the
# training rows.
train_data = pd.read_csv('./YouBike_Data.csv').values
test_data = pd.read_csv('./YouBike_Data_Test.csv').values
train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

print(f"train_data size: {train_data.shape} \nvalid_data size: {valid_data.shape} \ntest_data size: {test_data.shape}")

# Separate features from targets, keeping only the configured feature columns.
x_train, x_valid, x_test, y_train, y_valid = select_feat(
    train_data, valid_data, test_data, config['select_all']
)

print(f'number of features: {x_train.shape[1]}')

# Wrap the arrays as datasets; the test set carries no targets.
train_dataset = YouBikeDataset(x_train, y_train)
valid_dataset = YouBikeDataset(x_valid, y_valid)
test_dataset = YouBikeDataset(x_test)

# Test batches keep their original order so predictions line up with the input rows.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

train_data size: (29, 12) 
valid_data size: (7, 12) 
test_data size: (41, 11)
number of features: 4


In [28]:
# Size the input layer to the number of selected feature columns, move the network to
# the chosen device, then run the training loop.
model = My_Model(input_dim=x_train.shape[1])
model = model.to(device)
trainer(train_loader, valid_loader, model, config, device)

Epoch [1/10000]: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it, loss=1.08e+3]


Epoch [1/10000]: Train loss: 1082.0540, Valid loss: 339.9699
Saving model with loss 339.970...


Epoch [2/10000]: 100%|██████████| 1/1 [00:00<00:00, 93.13it/s, loss=484]


Epoch [2/10000]: Train loss: 483.9357, Valid loss: 78.7697
Saving model with loss 78.770...


Epoch [3/10000]: 100%|██████████| 1/1 [00:00<00:00, 105.54it/s, loss=140]


Epoch [3/10000]: Train loss: 140.1974, Valid loss: 19.2243
Saving model with loss 19.224...


Epoch [4/10000]: 100%|██████████| 1/1 [00:00<00:00, 125.00it/s, loss=21.2]


Epoch [4/10000]: Train loss: 21.2235, Valid loss: 16.3251
Saving model with loss 16.325...


Epoch [5/10000]: 100%|██████████| 1/1 [00:00<00:00, 118.17it/s, loss=16.4]


Epoch [5/10000]: Train loss: 16.3500, Valid loss: 14.2379
Saving model with loss 14.238...


Epoch [6/10000]: 100%|██████████| 1/1 [00:00<00:00, 111.53it/s, loss=14.2]


Epoch [6/10000]: Train loss: 14.2241, Valid loss: 12.6866
Saving model with loss 12.687...


Epoch [7/10000]: 100%|██████████| 1/1 [00:00<00:00, 122.43it/s, loss=12.6]


Epoch [7/10000]: Train loss: 12.6301, Valid loss: 11.5070
Saving model with loss 11.507...


Epoch [8/10000]: 100%|██████████| 1/1 [00:00<00:00, 26.57it/s, loss=11.4]


Epoch [8/10000]: Train loss: 11.4033, Valid loss: 10.5853
Saving model with loss 10.585...


Epoch [9/10000]: 100%|██████████| 1/1 [00:00<00:00, 91.77it/s, loss=10.4]


Epoch [9/10000]: Train loss: 10.4441, Valid loss: 9.8471
Saving model with loss 9.847...


Epoch [10/10000]: 100%|██████████| 1/1 [00:00<00:00, 120.01it/s, loss=9.69]


Epoch [10/10000]: Train loss: 9.6855, Valid loss: 9.2517
Saving model with loss 9.252...


Epoch [11/10000]: 100%|██████████| 1/1 [00:00<00:00, 106.63it/s, loss=9.07]


Epoch [11/10000]: Train loss: 9.0747, Valid loss: 8.7715
Saving model with loss 8.772...


Epoch [12/10000]: 100%|██████████| 1/1 [00:00<00:00, 101.85it/s, loss=8.58]


Epoch [12/10000]: Train loss: 8.5784, Valid loss: 8.3706
Saving model with loss 8.371...


Epoch [13/10000]: 100%|██████████| 1/1 [00:00<00:00, 112.86it/s, loss=8.17]


Epoch [13/10000]: Train loss: 8.1694, Valid loss: 8.0330
Saving model with loss 8.033...


Epoch [14/10000]: 100%|██████████| 1/1 [00:00<00:00, 104.97it/s, loss=7.83]


Epoch [14/10000]: Train loss: 7.8266, Valid loss: 7.7465
Saving model with loss 7.747...


Epoch [15/10000]: 100%|██████████| 1/1 [00:00<00:00, 105.21it/s, loss=7.53]


Epoch [15/10000]: Train loss: 7.5348, Valid loss: 7.5017
Saving model with loss 7.502...


Epoch [16/10000]: 100%|██████████| 1/1 [00:00<00:00, 89.31it/s, loss=7.28]


Epoch [16/10000]: Train loss: 7.2844, Valid loss: 7.2910
Saving model with loss 7.291...


Epoch [17/10000]: 100%|██████████| 1/1 [00:00<00:00, 104.11it/s, loss=7.07]


Epoch [17/10000]: Train loss: 7.0679, Valid loss: 7.1083
Saving model with loss 7.108...


Epoch [18/10000]: 100%|██████████| 1/1 [00:00<00:00, 102.49it/s, loss=6.88]


Epoch [18/10000]: Train loss: 6.8792, Valid loss: 6.9488
Saving model with loss 6.949...


Epoch [19/10000]: 100%|██████████| 1/1 [00:00<00:00, 106.81it/s, loss=6.71]


Epoch [19/10000]: Train loss: 6.7137, Valid loss: 6.8082
Saving model with loss 6.808...


Epoch [20/10000]: 100%|██████████| 1/1 [00:00<00:00, 115.07it/s, loss=6.57]


Epoch [20/10000]: Train loss: 6.5672, Valid loss: 6.6835
Saving model with loss 6.684...


Epoch [21/10000]: 100%|██████████| 1/1 [00:00<00:00, 37.82it/s, loss=6.44]


Epoch [21/10000]: Train loss: 6.4366, Valid loss: 6.5718
Saving model with loss 6.572...


Epoch [22/10000]: 100%|██████████| 1/1 [00:00<00:00, 102.08it/s, loss=6.32]


Epoch [22/10000]: Train loss: 6.3192, Valid loss: 6.4708
Saving model with loss 6.471...


Epoch [23/10000]: 100%|██████████| 1/1 [00:00<00:00, 84.12it/s, loss=6.21]


Epoch [23/10000]: Train loss: 6.2126, Valid loss: 6.3787
Saving model with loss 6.379...


Epoch [24/10000]: 100%|██████████| 1/1 [00:00<00:00, 80.58it/s, loss=6.12]


Epoch [24/10000]: Train loss: 6.1151, Valid loss: 6.2937
Saving model with loss 6.294...


Epoch [25/10000]: 100%|██████████| 1/1 [00:00<00:00, 26.94it/s, loss=6.02]


Epoch [25/10000]: Train loss: 6.0250, Valid loss: 6.2147
Saving model with loss 6.215...


Epoch [26/10000]: 100%|██████████| 1/1 [00:00<00:00, 119.37it/s, loss=5.94]


Epoch [26/10000]: Train loss: 5.9409, Valid loss: 6.1403
Saving model with loss 6.140...


Epoch [27/10000]: 100%|██████████| 1/1 [00:00<00:00, 113.65it/s, loss=5.86]

Epoch [27/10000]: Train loss: 5.8618, Valid loss: 6.0696





Saving model with loss 6.070...


Epoch [28/10000]:   0%|          | 0/1 [00:00<?, ?it/s, loss=5.79]


KeyboardInterrupt: ignored

In [None]:
# Load (or reload) the TensorBoard notebook extension and open it on the ./runs/ log
# directory — presumably where trainer's SummaryWriter() writes by default; confirm.
%reload_ext tensorboard
%tensorboard --logdir=./runs/

In [None]:
def save_pred(preds, file):
    """Write predictions to `file` as a two-column CSV with a header row.

    Args:
        preds: iterable of predicted values; each is written next to its 0-based
            position in the iterable.
        file: output path; an existing file is overwritten.
    """
    # newline='' hands line-ending control to the csv module; without it, platforms
    # that translate '\n' on write end up with a blank line after every row.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['year_month', 'num_of_bicycle_rentals(million)'])
        # NOTE(review): the first column is labelled 'year_month' but holds the row
        # index — confirm this is the expected submission format.
        writer.writerows(enumerate(preds))

# Rebuild the architecture, restore the checkpoint written during training, and export
# the test-set predictions.
model = My_Model(input_dim=x_train.shape[1]).to(device)
# map_location lets a checkpoint saved from a CUDA session load when only the CPU is
# available (and vice versa).
model.load_state_dict(torch.load(config['save_path'], map_location=device))
preds = predict(test_loader, model, device)
save_pred(preds, 'pred.csv')

# Reference
This notebook is based on code written by Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb)