# Chapter 5: Building Your First DataPipe

## 5.3 A New Dataset

In [None]:
!wget https://github.com/dvgodoy/assets/raw/main/PyTorchInPractice/data/100KUsedCar/car_prices.zip
!unzip car_prices.zip -d car_prices

### 5.3.1 DataPipes

In [36]:
import torchdata.datapipes as dp

# Start the pipeline with a datapipe that yields the paths of the files under ./car_prices.
datapipe = dp.iter.FileLister('./car_prices')

In [37]:
from torch.utils.data import DataLoader
# Peek at the first batch (at most 16 items) produced by the datapipe.
next(iter(DataLoader(dataset=datapipe, batch_size=16)))

['./car_prices/audi.csv',
 './car_prices/bmw.csv',
 './car_prices/cclass.csv',
 './car_prices/focus.csv',
 './car_prices/ford.csv',
 './car_prices/hyundi.csv',
 './car_prices/merc.csv',
 './car_prices/skoda.csv',
 './car_prices/toyota.csv',
 './car_prices/unclean cclass.csv',
 './car_prices/unclean focus.csv',
 './car_prices/vauxhall.csv',
 './car_prices/vw.csv']

In [38]:
def filter_for_data(filename):
    """Tell whether a listed file is one of the CSV files to load.

    A file is rejected when its name contains "unclean", "focus" or "cclass";
    otherwise it is accepted only if it ends with ".csv".

    Args:
        filename: path of the file, as a string.

    Returns:
        True if the file should be kept, False otherwise.
    """
    excluded_markers = ("unclean", "focus", "cclass")
    if any(marker in filename for marker in excluded_markers):
        return False
    return filename.endswith(".csv")

# Keep only the file paths for which filter_for_data returns True.
datapipe = datapipe.filter(filter_fn=filter_for_data)

In [39]:
# Peek at the first batch again to check which file paths survived the filter.
next(iter(DataLoader(dataset=datapipe, batch_size=16)))

['./car_prices/audi.csv',
 './car_prices/bmw.csv',
 './car_prices/ford.csv',
 './car_prices/hyundi.csv',
 './car_prices/merc.csv',
 './car_prices/skoda.csv',
 './car_prices/toyota.csv',
 './car_prices/vauxhall.csv',
 './car_prices/vw.csv']

#### 5.3.1.1 Loading CSV Files

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step1.png)

In [40]:
# Open each remaining file in text mode and parse it as comma-delimited data,
# skipping one leading line per file and keeping the source path next to every
# parsed row (each item is a (path, row) pair).
datapipe = (
    datapipe
    .open_files(mode='rt')
    .parse_csv(delimiter=",", skip_lines=1, return_path=True)
)

In [41]:
# Peek at a batch of four parsed rows; each item carries its source path and its fields.
next(iter(DataLoader(dataset=datapipe, batch_size=4)))

[('./car_prices/audi.csv',
  './car_prices/audi.csv',
  './car_prices/audi.csv',
  './car_prices/audi.csv'),
 [(' A1', ' A6', ' A1', ' A4'),
  ('2017', '2016', '2016', '2017'),
  ('12500', '16500', '11000', '16800'),
  ('Manual', 'Automatic', 'Manual', 'Automatic'),
  ('15735', '36203', '29946', '25952'),
  ('Petrol', 'Diesel', 'Petrol', 'Diesel'),
  ('150', '20', '30', '145'),
  ('55.4', '64.2', '55.4', '67.3'),
  ('1.4', '2.0', '1.4', '2.0')]]

In [42]:
import os

def get_manufacturer(content):
    """Append the manufacturer, derived from the file name, to a parsed row.

    Args:
        content: a (path, data) pair, where `path` is the path of the source
            file and `data` is the list of fields of one row.

    Returns:
        The same `data` list (modified in place) with one extra trailing
        element: the file's base name without extension, in upper case.
    """
    path, data = content
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    # The row is extended in place and returned, dropping the path.
    data.append(stem.upper())
    return data

# Replace every (path, row) pair with the row extended by its manufacturer.
datapipe = datapipe.map(get_manufacturer)

In [43]:
# Peek at a batch of four rows to confirm the manufacturer was appended as the last field.
next(iter(DataLoader(dataset=datapipe, batch_size=4)))

[(' A1', ' A6', ' A1', ' A4'),
 ('2017', '2016', '2016', '2017'),
 ('12500', '16500', '11000', '16800'),
 ('Manual', 'Automatic', 'Manual', 'Automatic'),
 ('15735', '36203', '29946', '25952'),
 ('Petrol', 'Diesel', 'Petrol', 'Diesel'),
 ('150', '20', '30', '145'),
 ('55.4', '64.2', '55.4', '67.3'),
 ('1.4', '2.0', '1.4', '2.0'),
 ('AUDI', 'AUDI', 'AUDI', 'AUDI')]

#### 5.3.1.2 Encoding Categorical Attributes

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step3.png)

In [44]:
import pandas as pd

# Names given to the ten fields of every row; the last one is the manufacturer
# appended by get_manufacturer.
colnames = [
    'model', 'year', 'price', 'transmission', 'mileage',
    'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer',
]
# Materialize the whole datapipe in memory so the categorical values can be
# inspected, and record the total number of rows.
df = pd.DataFrame(data=list(datapipe), columns=colnames)
N_ROWS = len(df.index)

In [47]:
# Continuous attribute names (their count later sizes the BatchNorm layer) and
# categorical attribute names (each one gets an integer encoder in this cell).
cont_attr = ['year', 'mileage', 'road_tax', 'mpg', 'engine_size']
cat_attr = ['model', 'transmission', 'fuel_type', 'manufacturer']

def gen_encoder_dict(series):
    """Build a value-to-integer lookup for the distinct values of a series.

    Args:
        series: object exposing `unique()` (e.g. a pandas Series).

    Returns:
        A dict mapping each distinct value to its position (0, 1, 2, ...) in
        the sequence returned by `series.unique()`.
    """
    return {value: code for code, value in enumerate(series.unique())}

# One value-to-integer encoder per categorical column, keyed by column name.
dropdown_encoders = {col: gen_encoder_dict(df[col]) for col in cat_attr}

In [48]:
# Show the integer code assigned to each fuel type.
dropdown_encoders['fuel_type']

{'Petrol': 0, 'Diesel': 1, 'Hybrid': 2, 'Other': 3, 'Electric': 4}

#### 5.3.1.3 Row Output

In [49]:
import numpy as np

def preproc(row):
    """Turn one row of raw fields into the label and feature arrays.

    The fields are matched positionally to: model, year, price, transmission,
    mileage, fuel_type, road_tax, mpg, engine_size, manufacturer.

    Args:
        row: sequence of field values for a single car, in the order above.

    Returns:
        A dict with three entries:
          - 'label': float32 array holding the price as its single element;
          - 'cont_X': float32 array with year, mileage, road_tax, mpg and
            engine_size, in that order;
          - 'cat_X': integer array with the codes of model, transmission,
            fuel_type and manufacturer, looked up in the module-level
            `dropdown_encoders`.

    Raises:
        ValueError: if a continuous field or the price cannot be parsed by float().
        KeyError: if a field is missing or a categorical value has no code
            in `dropdown_encoders`.
    """
    field_names = ('model', 'year', 'price', 'transmission', 'mileage',
                   'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer')
    continuous = ('year', 'mileage', 'road_tax', 'mpg', 'engine_size')
    categorical = ('model', 'transmission', 'fuel_type', 'manufacturer')
    label_field = 'price'

    fields = dict(zip(field_names, row))

    # Continuous attributes arrive as text and are converted to floats.
    numbers = []
    for name in continuous:
        numbers.append(float(fields[name]))

    # Categorical attributes are replaced by their integer codes.
    codes = []
    for name in categorical:
        codes.append(dropdown_encoders[name][fields[name]])

    label = np.array([float(fields[label_field])], dtype=np.float32)
    return {'label': label,
            'cont_X': np.array(numbers, dtype=np.float32),
            'cat_X': np.array(codes, dtype=int)}

In [50]:
# Convert every row into the dict of arrays built by preproc.
datapipe = datapipe.map(preproc)

In [51]:
# Peek at a batch of four preprocessed rows; the arrays are collated into tensors per key.
next(iter(DataLoader(dataset=datapipe, batch_size=4)))

{'label': tensor([[12500.],
         [16500.],
         [11000.],
         [16800.]]),
 'cont_X': tensor([[2.0170e+03, 1.5735e+04, 1.5000e+02, 5.5400e+01, 1.4000e+00],
         [2.0160e+03, 3.6203e+04, 2.0000e+01, 6.4200e+01, 2.0000e+00],
         [2.0160e+03, 2.9946e+04, 3.0000e+01, 5.5400e+01, 1.4000e+00],
         [2.0170e+03, 2.5952e+04, 1.4500e+02, 6.7300e+01, 2.0000e+00]]),
 'cat_X': tensor([[0, 0, 0, 0],
         [1, 1, 1, 0],
         [0, 0, 0, 0],
         [2, 1, 1, 0]])}

#### 5.3.1.4 The Full DataPipe and Splits

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step4.png)

In [52]:
# Rebuild the complete pipeline from scratch in one place:
# list files -> keep the wanted CSVs -> open -> parse rows -> add manufacturer -> preprocess.
datapipe = (
    dp.iter.FileLister('./car_prices')
    .filter(filter_fn=filter_for_data)
    .open_files(mode='rt')
    .parse_csv(delimiter=",", skip_lines=1, return_path=True)
    .map(get_manufacturer)
    .map(preproc)
)

In [53]:
# Fraction of the N_ROWS rows assigned to each split. The weights and the seed
# are defined once and passed unchanged to every call, so the three targets are
# always drawn from the same split definition (previously the literal was
# repeated three times and could drift apart).
split_weights = {"train": 0.8, "val": 0.1, "test": 0.1}
datapipes = {
    split: datapipe.random_split(total_length=N_ROWS, weights=split_weights, seed=11, target=split)
    for split in split_weights
}

# Only the training split is shuffled.
# NOTE(review): buffer_size=100000 is presumably meant to cover the whole
# training split - confirm against N_ROWS.
datapipes['train'] = datapipes['train'].shuffle(buffer_size=100000)

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step5.png)

In [54]:
# One DataLoader per split, all with batches of 128. Only the training loader
# shuffles and drops the final incomplete batch.
dataloaders = {
    'train': DataLoader(dataset=datapipes['train'], batch_size=128, drop_last=True, shuffle=True),
    'val': DataLoader(dataset=datapipes['val'], batch_size=128),
    'test': DataLoader(dataset=datapipes['test'], batch_size=128),
}

### 5.3.2 BatchNorm for Continuous Attributes

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step1.png)

In [55]:
import torch.nn as nn

# Draw one training batch and compute, for each continuous attribute, its mean
# and its biased (unbiased=False) standard deviation across the batch.
batch = next(iter(dataloaders['train']))
batch['cont_X'].mean(axis=0), batch['cont_X'].std(axis=0, unbiased=False)

(tensor([2.0174e+03, 2.1038e+04, 1.1512e+02, 5.3813e+01, 1.6766e+00]),
 tensor([1.5904e+00, 1.7717e+04, 5.7773e+01, 1.1183e+01, 5.1135e-01]))

In [56]:
# Batch normalization layer with one feature per continuous attribute.
bn_layer = nn.BatchNorm1d(num_features=len(cont_attr))

# Normalize the batch and re-compute the per-feature statistics; the output
# below shows means close to zero and standard deviations of one.
normalized_cont = bn_layer(batch['cont_X'])
normalized_cont.mean(axis=0), normalized_cont.std(axis=0, unbiased=False)

(tensor([ 4.0997e-05,  3.3528e-08,  3.7253e-08, -2.4214e-07, -1.0058e-07],
        grad_fn=<MeanBackward1>),
 tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<StdBackward0>))

In [57]:
# Inspect the layer's weight, bias and running statistics after that single batch.
bn_layer.state_dict()

OrderedDict([('weight', tensor([1., 1., 1., 1., 1.])),
             ('bias', tensor([0., 0., 0., 0., 0.])),
             ('running_mean',
              tensor([2.0174e+02, 2.1038e+03, 1.1512e+01, 5.3813e+00, 1.6766e-01])),
             ('running_var',
              tensor([1.1549e+00, 3.1638e+07, 3.3730e+02, 1.3504e+01, 9.2635e-01])),
             ('num_batches_tracked', tensor(1))])