In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Locate the daily bike-sharing CSV underneath the current user's home folder.
path = Path.home().joinpath('OneDrive - Seagroup/ai/time_series/bike_sharing_daily.csv')
# Read the raw table and preview its first rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [3]:
# Categorical columns to expand into one indicator column per category value.
onehot_fields = ['season', 'mnth', 'weekday', 'weathersit']

# Encode every field in a single call: the listed columns are removed and their
# indicator columns (named "<field>_<value>") are appended in list order, which
# matches what concatenating per-field dummies and then dropping the originals
# produced.
# The dtype is pinned to an integer type because pandas >= 2.0 defaults to bool;
# bool columns mixed with float columns turn `.values` into an object array,
# which cannot be converted to a torch tensor later on.
df = pd.get_dummies(df, columns=onehot_fields, drop_first=False, dtype='uint8')
df.head()

Unnamed: 0,instant,dteday,yr,holiday,workingday,temp,atemp,hum,windspeed,casual,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3
0,1,2011-01-01,0,0,0,0.344167,0.363625,0.805833,0.160446,331,...,0,0,0,0,0,0,1,0,1,0
1,2,2011-01-02,0,0,0,0.363478,0.353739,0.696087,0.248539,131,...,1,0,0,0,0,0,0,0,1,0
2,3,2011-01-03,0,0,1,0.196364,0.189405,0.437273,0.248309,120,...,0,1,0,0,0,0,0,1,0,0
3,4,2011-01-04,0,0,1,0.2,0.212122,0.590435,0.160296,108,...,0,0,1,0,0,0,0,1,0,0
4,5,2011-01-05,0,0,1,0.226957,0.22927,0.436957,0.1869,82,...,0,0,0,1,0,0,0,1,0,0


In [4]:
# Columns standardised to zero mean and unit standard deviation.
continuous_fields = ['casual', 'registered', 'cnt', 'temp', 'hum', 'windspeed']

# field -> [mean, std] measured before scaling, kept so that scaled values can
# be converted back to the original units.
# NOTE(review): the statistics are computed over the whole frame, i.e. before
# the later train/test/validation split -- confirm this is intended.
scaled_features = {}
for field in continuous_fields:
    mean, std = df[field].mean(), df[field].std()
    scaled_features[field] = [mean, std]
    # Replace the whole column instead of writing through `.loc[:, field]`:
    # the count columns hold integers, and setting floats into them in place
    # relies on silent upcasting, which newer pandas versions deprecate.
    df[field] = (df[field] - mean) / std
scaled_features

{'casual': [848.1764705882352, 686.6224882846549],
 'registered': [3656.172366621067, 1560.2563770194527],
 'cnt': [4504.3488372093025, 1937.2114516187678],
 'temp': [0.495384788508892, 0.18305099611148867],
 'hum': [0.6278940629274967, 0.14242909513835394],
 'windspeed': [0.190486211627907, 0.07749787068166943]}

In [5]:
# Keep an untouched copy of the encoded and scaled frame before pruning columns.
df_backup = df.copy()

# Columns removed from the modelling frame.
fields_to_drop = ['instant', 'dteday', 'atemp', 'workingday']
df = df.drop(columns=fields_to_drop)
df.head()

Unnamed: 0,yr,holiday,temp,hum,windspeed,casual,registered,cnt,season_1,season_2,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3
0,0,0,-0.826097,1.249316,-0.387626,-0.753218,-1.924153,-1.816709,1,0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,-0.720601,0.478785,0.749089,-1.044499,-1.913899,-1.911691,1,0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,-1.633538,-1.338358,0.746121,-1.060519,-1.555624,-1.62881,1,0,...,0,1,0,0,0,0,0,1,0,0
3,0,0,-1.613675,-0.263001,-0.389562,-1.077996,-1.411417,-1.518858,1,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,-1.46641,-1.340576,-0.046275,-1.115863,-1.370398,-1.499242,1,0,...,0,0,0,1,0,0,0,1,0,0


In [6]:
# Number of trailing rows (days) reserved for each hold-out set.
n_validation_days = 60
n_test_days = 21
n_holdout_days = n_validation_days + n_test_days

# Split off the final 60 days for validation
validation_data = df.iloc[-n_validation_days:]

# Split off the 21 days immediately before the validation window for testing
test_data = df.iloc[-n_holdout_days:-n_validation_days]

# Everything earlier is used for training
df = df.iloc[:-n_holdout_days]
train_data = df.copy()

# What have we ended up with?
print(f'''Validation data length: {len(validation_data)}
Test data length: {len(test_data)}
Train data length: {len(train_data)}''')

Validation data length: 60
Test data length: 21
Train data length: 650


In [7]:
# Columns the model predicts; every other column is an input feature.
target_fields = ['cnt', 'casual', 'registered']

# Training split
train_features = train_data.drop(columns=target_fields)
train_targets = train_data[target_fields]

# Test split
test_features = test_data.drop(columns=target_fields)
test_targets = test_data[target_fields]

# Validation split
validation_features = validation_data.drop(columns=target_fields)
validation_targets = validation_data[target_fields]

In [8]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from lightning import Trainer, LightningModule, seed_everything
from lightning.pytorch.callbacks import ModelCheckpoint

# Learning rate handed to the optimiser in Regression.configure_optimizers.
l_rate = 0.2

# Mean-reduced squared-error criterion shared by the train/val/test steps.
mse_loss = nn.MSELoss(reduction='mean')

# Fix the global random seed so repeated runs start from the same state.
seed_everything(42)

Global seed set to 42


In [9]:
class Regression(LightningModule):
    """Feed-forward regressor with one sigmoid hidden layer and one linear output.

    The model is trained on the `cnt` target column only. Data, loss and
    learning rate come from notebook-level names: `train_features`,
    `train_targets`, `validation_features`, `validation_targets`,
    `test_features`, `test_targets`, `mse_loss` and `l_rate`.

    Args:
        n_features: Width of the input layer. Defaults to the number of
            columns in the notebook-level `train_features`.
        n_hidden: Number of hidden units (default 10).
    """

    def __init__(self, n_features=None, n_hidden=10):
        super().__init__()
        if n_features is None:
            n_features = train_features.shape[1]
        self.fc1 = nn.Linear(n_features, n_hidden)
        self.fc2 = nn.Linear(n_hidden, 1)

    def forward(self, x):
        """Return one prediction per input row."""
        x = torch.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return x

    @staticmethod
    def _make_loader(features, targets, shuffle=False):
        """Build a DataLoader of float tensors from a features frame and the `cnt` target column."""
        dataset = TensorDataset(
            torch.tensor(features.values).float(),
            torch.tensor(targets[['cnt']].values).float(),
        )
        return DataLoader(dataset=dataset, batch_size=128, shuffle=shuffle)

    def _predict_and_score(self, batch):
        """Run the model on an (inputs, targets) batch and return (predictions, loss)."""
        x, y = batch
        predictions = self(x)
        return predictions, mse_loss(predictions, y)

    def train_dataloader(self):
        """Training batches, reshuffled every epoch.

        The rows are stored in date order, so without shuffling each batch
        would be one contiguous block of days.
        """
        return self._make_loader(train_features, train_targets, shuffle=True)

    def val_dataloader(self):
        """Validation batches in their original order."""
        return self._make_loader(validation_features, validation_targets)

    def test_dataloader(self):
        """Test batches in their original order."""
        return self._make_loader(test_features, test_targets)

    def configure_optimizers(self):
        """Adam over all model parameters with learning rate `l_rate`."""
        return optim.Adam(self.parameters(), lr=l_rate)

    def training_step(self, batch, batch_idx):
        """Compute the batch loss and log it as the epoch-level `train_loss`."""
        _, loss = self._predict_and_score(batch)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute the batch loss and log it as the epoch-level `val_loss`."""
        _, loss = self._predict_and_score(batch)
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return {'val_loss': loss}

    def test_step(self, batch, batch_idx):
        """Compute the batch loss, log it as `test_loss`, and return the raw predictions."""
        _, y = batch
        logits, loss = self._predict_and_score(batch)
        # NOTE(review): exact float equality is not a meaningful metric for a
        # regression output; the key is kept only so the returned dict keeps
        # its existing shape.
        correct = torch.sum(logits == y)
        self.log('test_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return {'test_loss': loss, 'test_correct': correct, 'logits': logits}

In [None]:
model = Regression()

# Keep the two checkpoints with the best monitored `val_loss`.
# They are written to a "checkpoints" folder relative to the working directory
# rather than to the filesystem root ("/"), which is usually not writable and
# should not be filled with training artefacts.
checkpoint_callback = ModelCheckpoint(dirpath="checkpoints", save_top_k=2, monitor="val_loss")

trainer = Trainer(max_epochs=50,
                  callbacks=[checkpoint_callback],
                  enable_progress_bar=True,
                  log_every_n_steps=50)
trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name | Type   | Params
--------------------------------
0 | fc1  | Linear | 320   
1 | fc2  | Linear | 11    
--------------------------------
331       Trainable params
0         Non-trainable params
331       Total params
0.001     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [None]:
# NOTE(review): nothing in the visible cells appends to these lists (the
# appends inside Regression.test_step are commented out) -- confirm whether
# they are still needed.
predictions_pred = []
predictions_actual = []

# Evaluate on the test split using the best checkpoint from the previous fit.
# Naming the checkpoint explicitly avoids relying on the implicit fallback used
# when `test()` is called with neither a model nor a checkpoint path.
trainer.test(ckpt_path="best")
