## **TODO:** Set the value of `URL` to the URL from your learning materials

In [None]:
import os

# TODO: replace None with the URL string from your learning materials.
URL = None

# Fail fast with a clear message: the %%bash cell below reads URL from the
# environment, and os.environ only accepts string values.
if not (isinstance(URL, str) and URL):
  raise ValueError("Be sure to initialize URL using the value from your learning materials")
os.environ['URL'] = URL

In [None]:
%%bash
# Install the training framework used by the demo below.
pip install pytorch-lightning
# Quote the URL so that characters such as '?' or whitespace in it are not
# subject to shell glob expansion or word splitting.
wget -q "$URL" -O ./data.zip
mkdir -p data checkpoints
# Extract each archive with its own unzip call: unzip treats any additional
# file arguments as member names to extract, not as further archives.
for archive in ./*.zip; do
  unzip -o "$archive" -d data/
done

## Demo: PyTorch Lightning

In [None]:
import os
import pandas as pd
import torch as pt

from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

# Make float64 the default floating-point dtype for tensors and module
# parameters created from here on.
# NOTE(review): presumably chosen so the model weights match the dtype of the
# values loaded from the CSV files — confirm.
pt.set_default_dtype(pt.float64)

In [None]:
from pathlib import Path

# Load every CSV shard into a single frame.
# * Sorting the paths keeps the row order (and therefore any seeded sampling
#   done on the frame) reproducible, since glob() yields files in arbitrary
#   order.
# * ignore_index=True gives every row a unique label; otherwise each shard
#   restarts its index at 0 and label-based operations such as
#   DataFrame.drop(index=...) would match rows from several shards at once.
df = pd.concat(
    (pd.read_csv(file) for file in sorted(Path('data/').glob('part-*.csv'))),
    ignore_index=True,
)

In [None]:
# Remove the 'origindatetime_tr' column; all other columns are kept as-is.
working_df = df.drop(columns=['origindatetime_tr'])
working_df.shape

In [None]:
# Hold out a random 10% of the rows as the test set. The fixed random_state
# makes the sample repeatable for the same input frame.
test_df = working_df.sample(
    frac=0.10,
    random_state=42,
)
test_df.shape

In [None]:
# The training set is every row whose index label was not sampled into test_df.
train_df = working_df.drop(test_df.index, axis=0)
train_df.shape

In [None]:
FEATURES = [
    'origin_block_latitude',
    'origin_block_longitude',
    'destination_block_latitude',
    'destination_block_longitude',
]
TARGET = ['fareamount']

BATCH_SIZE = 2 ** 18
PIN_MEMORY = True


def _frame_to_tensor(frame, columns):
  """Return the given columns of *frame* as a tensor, pinned when PIN_MEMORY is set."""
  tensor = pt.tensor(frame[columns].values)
  return tensor.pin_memory() if PIN_MEMORY else tensor


# Training split. Each dataset item is a (target, features) pair, in that order.
X_train = _frame_to_tensor(train_df, FEATURES)
y_train = _frame_to_tensor(train_df, TARGET)
train_ds = TensorDataset(y_train, X_train)
train_dl = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    pin_memory=PIN_MEMORY,
    num_workers=os.cpu_count(),
)

# Test split, built the same way but loaded without worker processes.
X_test = _frame_to_tensor(test_df, FEATURES)
y_test = _frame_to_tensor(test_df, TARGET)
test_ds = TensorDataset(y_test, X_test)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, pin_memory=PIN_MEMORY)
#  , num_workers = os.cpu_count())

len(train_ds), len(test_ds), BATCH_SIZE

In [None]:
import pytorch_lightning as pl
from pytorch_lightning import Trainer

class TaxiFareRegressor(pl.LightningModule):
  """Single linear layer (no bias term) that regresses a fare from input features.

  Hyperparameters are passed as keyword arguments and read back through
  ``self.hparams``; values may be strings and are converted on use:

  * ``seed`` (required): seed for PyTorch's global random number generator.
  * ``num_features`` (required): input width of the linear layer.
  * ``lr`` (optional): AdamW learning rate; AdamW's own default is used
    when it is not supplied.

  Any other keyword (e.g. ``max_epochs``) is only recorded in ``self.hparams``.
  Batches are expected as ``(y, X)`` pairs, target first.
  """

  def __init__(self, **kwargs):
    super().__init__()
    # Record the constructor's keyword arguments in self.hparams.
    self.save_hyperparameters()

    # Seed before creating the layer so weight initialisation is repeatable.
    pt.manual_seed(int(self.hparams.seed))

    num_features = int(self.hparams.num_features)
    self.layers = pt.nn.Sequential(
        pt.nn.Linear(num_features, 1, bias=False)
    )

  def forward(self, X):
    """Return the model's estimates for the feature batch ``X``."""
    return self.layers(X)

  def loss(self, y_est, y):
    """Return ``(mse, rmse)`` between estimates ``y_est`` and targets ``y``.

    Size-1 dimensions are squeezed out of both tensors before comparison.
    The squeeze is done out-of-place so the caller's tensors (in particular
    the batch's target tensor) are not reshaped as a side effect.
    """
    mse = pt.nn.functional.mse_loss(y_est.squeeze(), y.squeeze())
    return mse, mse.sqrt()

  def training_step(self, batch, batch_idx):
    """Log the batch's MSE and RMSE and return the MSE as the training loss."""
    y, X = batch

    y_est = self.forward(X)
    mse, rmse = self.loss(y_est, y)

    self.log('train_mse', mse, prog_bar=True, on_step=True, logger=True)
    self.log('train_rmse', rmse, prog_bar=True, on_step=True, logger=True)

    return mse

  def test_step(self, batch, batch_idx):
    """Log the batch's MSE and RMSE on test data; nothing is returned."""
    y, X = batch

    # Gradients are not needed for evaluation.
    with pt.no_grad():
      mse, rmse = self.loss(self.forward(X), y)

    self.log('test_mse', mse, prog_bar=True, on_step=True, logger=True)
    self.log('test_rmse', rmse, prog_bar=True, on_step=True, logger=True)

  def configure_optimizers(self):
    """Return an AdamW optimizer for the layer parameters.

    The ``lr`` hyperparameter is applied when it was supplied; otherwise
    AdamW's default learning rate is used.
    """
    params = self.layers.parameters()
    lr = getattr(self.hparams, 'lr', None)
    if lr is None:
      return pt.optim.AdamW(params)
    return pt.optim.AdamW(params, lr=float(lr))

In [None]:
# Hyperparameters are supplied as strings.
model = TaxiFareRegressor(
    seed='42',
    lr='0.03',
    num_features='4',
    max_epochs='1',
)

In [None]:
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import TQDMProgressBar

# The profiler package is named `profilers` in newer PyTorch Lightning
# releases and `profiler` in older ones; accept either so the cell runs with
# whichever version the unpinned `pip install` provided.
try:
  from pytorch_lightning.profilers import AdvancedProfiler
except ImportError:
  from pytorch_lightning.profiler import AdvancedProfiler

adv_profiler = AdvancedProfiler()

# Log to TensorBoard and to CSV; the CSV run directory is keyed by the seed.
tb_logger = pl_loggers.TensorBoardLogger('lightning_logs/')
csv_logger = pl_loggers.CSVLogger(save_dir="logs",
                                  name="taxifare",
                                  version=f"seed_{model.hparams.seed}")

MAX_EPOCHS = int(model.hparams.max_epochs)
# `gpus=` and `progress_bar_refresh_rate=` are no longer accepted by current
# Trainer versions: select the device with accelerator/devices and set the
# progress bar refresh rate through the TQDMProgressBar callback instead.
trainer = pl.Trainer(accelerator='gpu',
                     devices=1,
                     max_epochs=MAX_EPOCHS,
                     default_root_dir='./checkpoints',
                     log_every_n_steps=1,
                     callbacks=[TQDMProgressBar(refresh_rate=20)],
                     #  overfit_batches=0.05,
                     #  profiler=adv_profiler,
                     logger=[tb_logger, csv_logger])

# Pass the training DataLoader positionally: the keyword name for this
# argument differs between Lightning versions.
trainer.fit(model, train_dl)

In [None]:
# Display the trainer's callback_metrics after the fit call above.
trainer.callback_metrics

In [None]:
# Load (or reload) the TensorBoard notebook extension and point it at the
# directory given to the TensorBoardLogger above.
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

In [None]:
import pandas as pd
# Read the metrics file from the CSV logger's run directory for this seed.
metrics_path = f'logs/taxifare/seed_{model.hparams.seed}/metrics.csv'
metrics_df = pd.read_csv(metrics_path)
metrics_df

In [None]:
# Plot training RMSE against the step number, then enlarge the figure and
# its fonts for readability.
ax = metrics_df[['step', 'train_rmse']].plot(x='step', y='train_rmse')

ax.figure.set_size_inches(12, 6)
ax.set_xlabel('step', fontsize=20)
for axis_name in ('x', 'y'):
  ax.tick_params(axis=axis_name, labelsize=20)
# The trailing semicolon suppresses the notebook's echo of the return value.
ax.legend(fontsize=20);

In [None]:
# Save a checkpoint from the trainer to the path 'checkpoints/model'.
trainer.save_checkpoint('checkpoints/model')

In [None]:
# Rebuild the regressor from the checkpoint file saved above and display it.
model = TaxiFareRegressor.load_from_checkpoint('checkpoints/model')
model

In [None]:
# Run the test loop on the held-out test DataLoader, then display the
# trainer's callback_metrics.
trainer.test(model, test_dl)
trainer.callback_metrics

Copyright 2021 CounterFactual.AI LLC. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.