# Chapter 5: Building Your First Hugging Face Dataset

In [None]:
!pip install transformers datasets

## 5.3 A New Dataset

In [None]:
!wget https://github.com/dvgodoy/assets/raw/main/PyTorchInPractice/data/100KUsedCar/car_prices.zip
!unzip car_prices.zip -d car_prices

In [1]:
import os

def filter_for_data(filename):
    """Tell whether a file name is one of the CSV files to load.

    Only names ending in ".csv" qualify, and any name containing
    "unclean", "focus" or "cclass" is rejected.
    """
    if not filename.endswith(".csv"):
        return False
    excluded_markers = ("unclean", "focus", "cclass")
    return not any(marker in filename for marker in excluded_markers)

folder = './car_prices'

# Collect the path of every accepted CSV in the folder, in alphabetical order.
data_files = sorted(
    os.path.join(folder, fname)
    for fname in os.listdir(folder)
    if filter_for_data(fname)
)
data_files

['./car_prices/audi.csv',
 './car_prices/bmw.csv',
 './car_prices/ford.csv',
 './car_prices/hyundi.csv',
 './car_prices/merc.csv',
 './car_prices/skoda.csv',
 './car_prices/toyota.csv',
 './car_prices/vauxhall.csv',
 './car_prices/vw.csv']

### 5.3.1 Hugging Face Datasets

#### 5.3.1.1 Loading CSV Files

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step1.png)

In [2]:
from datasets import load_dataset, Split

# Column names handed to the CSV reader through `column_names`.
# NOTE(review): the printed output further down shows 'manufacturer' as float64 with
# only None values - presumably the CSV files do not carry that column; TODO confirm.
colnames = ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer']

# Load every file in `data_files` with the generic "csv" builder.
# skiprows=1 drops the first row of each file (presumably its header - TODO confirm),
# since the names come from `column_names` instead.
# Split.ALL presumably returns all the data as a single Dataset rather than a
# dict of splits (the shape printed later is a single (rows, columns) pair).
dataset = load_dataset(path="csv",
                       data_files=data_files, 
                       sep=',', 
                       skiprows=1, 
                       column_names=colnames,
                       split=Split.ALL)

Downloading and preparing dataset csv/default to /home/dvgodoy/.cache/huggingface/datasets/csv/default-fc5da23e90940b54/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/dvgodoy/.cache/huggingface/datasets/csv/default-fc5da23e90940b54/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


In [3]:
# Quick look at the loaded data: per-column types, number of columns, and (rows, columns).
dataset.features, dataset.num_columns, dataset.shape

({'model': Value(dtype='string', id=None),
  'year': Value(dtype='int64', id=None),
  'price': Value(dtype='int64', id=None),
  'transmission': Value(dtype='string', id=None),
  'mileage': Value(dtype='int64', id=None),
  'fuel_type': Value(dtype='string', id=None),
  'road_tax': Value(dtype='int64', id=None),
  'mpg': Value(dtype='float64', id=None),
  'engine_size': Value(dtype='float64', id=None),
  'manufacturer': Value(dtype='float64', id=None)},
 10,
 (99187, 10))

In [4]:
# Slicing rows returns a dict mapping each column name to the list of its first three values.
dataset[:3]

{'model': [' A1', ' A6', ' A1'],
 'year': [2017, 2016, 2016],
 'price': [12500, 16500, 11000],
 'transmission': ['Manual', 'Automatic', 'Manual'],
 'mileage': [15735, 36203, 29946],
 'fuel_type': ['Petrol', 'Diesel', 'Petrol'],
 'road_tax': [150, 20, 30],
 'mpg': [55.4, 64.2, 55.4],
 'engine_size': [1.4, 2.0, 1.4],
 'manufacturer': [None, None, None]}

In [5]:
# Indexing by column name returns the values of that column.
# NOTE(review): this displays the entire column (the captured output is cut off mid-value);
# consider slicing, e.g. the first few entries, to keep the notebook output short.
dataset['transmission']

['Manual',
 'Automatic',
 'Manual',
 'Automatic',
 'Manual',
 'Automatic',
 'Automatic',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Automatic',
 'Manual',
 'Manual',
 'Manual',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Manual',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Manual',
 'Automatic',
 'Manual',
 'Automatic',
 'Manual',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Manual',
 'Automatic',
 'Manual',
 'Manual',
 'Manual',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Manual',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Manual',
 'Automatic',
 'Manual',
 'Manual',
 'Automatic',
 'Automatic',
 'Automatic',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Automatic',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Manual',
 'Automatic',
 'Automatic',
 'Manual',
 'Manual',
 'Manu

In [6]:
# Keep 80% of the rows for training and hold out the remaining 20%.
# train_test_split shuffles the rows by default, so the seed is pinned to get
# the same partition on every Restart & Run All.
train_test = dataset.train_test_split(train_size=0.8, seed=42)
train_test

DatasetDict({
    train: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer'],
        num_rows: 79349
    })
    test: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer'],
        num_rows: 19838
    })
})

In [7]:
# Split the 20% hold-out in half: one part becomes validation, the other the final test set.
# The seed is pinned so the (shuffled) halves are the same on every run.
val_test = train_test['test'].train_test_split(train_size=0.5, seed=42)
val_test

DatasetDict({
    train: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer'],
        num_rows: 9919
    })
    test: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer'],
        num_rows: 9919
    })
})

In [8]:
from datasets import DatasetDict
# Gather the three splits under the names used in the rest of the notebook.
# NOTE: `datasets` is a plain variable here; it shares its name with the
# Hugging Face package the loader functions were imported from.
datasets = DatasetDict({'train': train_test['train'],  # 80% portion of the first split
                       'val': val_test['train'],      # first half of the 20% hold-out, renamed to 'val'
                       'test': val_test['test']})     # second half of the 20% hold-out
datasets

DatasetDict({
    train: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer'],
        num_rows: 79349
    })
    val: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer'],
        num_rows: 9919
    })
    test: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer'],
        num_rows: 9919
    })
})

#### 5.3.1.2 Encoding Categorical Attributes

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step3.png)

In [9]:
# Distinct values of the 'fuel_type' column, looked up in the training split only.
datasets['train'].unique('fuel_type')

Flattening the indices:   0%|          | 0/79349 [00:00<?, ? examples/s]

['Diesel', 'Petrol', 'Hybrid', 'Other', 'Electric']

In [10]:
# Names of the numeric (continuous) columns and of the categorical columns.
# 'price' appears in neither list; preproc() uses it as the target.
cont_attr = ['year', 'mileage', 'road_tax', 'mpg', 'engine_size']
cat_attr = ['model', 'transmission', 'fuel_type']

def gen_encoder_dict(dataset, col):
    """Build a value -> integer-code mapping for one categorical column.

    The distinct values of `col` are sorted and numbered from 0, and a final
    'UNKNOWN' entry receives the last code so that values never seen here
    still have a fallback code.

    Missing values (None) are skipped: they cannot be ordered against strings,
    and a lookup with a default sends them to 'UNKNOWN' anyway. A literal
    'UNKNOWN' already present in the data is folded into the fallback entry,
    which keeps the codes contiguous (0 .. len(mapping) - 1).

    Args:
        dataset: object exposing `unique(col)`, returning the distinct values
            of a column.
        col: name of the categorical column.

    Returns:
        dict mapping every known value, plus 'UNKNOWN', to its integer code.
    """
    known = sorted(value for value in dataset.unique(col)
                   if value is not None and value != 'UNKNOWN')
    # enumerate() over known + fallback guarantees gap-free codes
    return {value: code for code, value in enumerate(known + ['UNKNOWN'])}

# One encoder per categorical column, built from the training split only.
dropdown_encoders = {col: gen_encoder_dict(datasets['train'], col) for col in cat_attr}

Loading cached processed dataset at /home/dvgodoy/.cache/huggingface/datasets/csv/default-fc5da23e90940b54/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-62f0d185f6c0cce5.arrow
Loading cached processed dataset at /home/dvgodoy/.cache/huggingface/datasets/csv/default-fc5da23e90940b54/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-62f0d185f6c0cce5.arrow
Loading cached processed dataset at /home/dvgodoy/.cache/huggingface/datasets/csv/default-fc5da23e90940b54/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-62f0d185f6c0cce5.arrow


In [11]:
# Inspect the mapping built for 'fuel_type' (values in sorted order, 'UNKNOWN' last).
dropdown_encoders['fuel_type']

{'Diesel': 0,
 'Electric': 1,
 'Hybrid': 2,
 'Other': 3,
 'Petrol': 4,
 'UNKNOWN': 5}

#### 5.3.1.3 Row Output

In [12]:
# Integer indexing returns one row as a dict of column name -> value.
datasets['train'][0]

{'model': ' Insignia',
 'year': 2018,
 'price': 15695,
 'transmission': 'Manual',
 'mileage': 11793,
 'fuel_type': 'Diesel',
 'road_tax': 145,
 'mpg': 65.7,
 'engine_size': 1.6,
 'manufacturer': None}

In [13]:
import numpy as np

def preproc(row, encoders=None):
    """Turn one raw row into the arrays used for training.

    Args:
        row: mapping holding the raw columns. It must contain 'price', the
            continuous columns ('year', 'mileage', 'road_tax', 'mpg',
            'engine_size') and the categorical columns ('model',
            'transmission', 'fuel_type'). Other keys are ignored.
        encoders: optional mapping {column: {value: code, ..., 'UNKNOWN': code}}
            for the categorical columns. Defaults to the notebook-level
            `dropdown_encoders`, so `preproc(row)` keeps working unchanged.

    Returns:
        dict with three NumPy arrays:
            'label'  - float32, shape (1,), the price;
            'cont_X' - float32, the continuous columns in the order listed above;
            'cat_X'  - int64, the integer codes of the categorical columns in
                       the order listed above.
    """
    # Kept local so the function does not depend on the notebook's column lists.
    cat_attr = ('model', 'transmission', 'fuel_type')
    cont_attr = ('year', 'mileage', 'road_tax', 'mpg', 'engine_size')
    target = 'price'

    if encoders is None:
        encoders = dropdown_encoders

    cont_X = [float(row[name]) for name in cont_attr]
    # Values missing from an encoder fall back to that encoder's 'UNKNOWN' code.
    cat_X = [encoders[name].get(row[name], encoders[name]['UNKNOWN']) for name in cat_attr]

    return {'label': np.array([float(row[target])], dtype=np.float32),
            'cont_X': np.array(cont_X, dtype=np.float32),
            # explicit 64-bit width: plain `int` maps to a platform-dependent size
            'cat_X': np.array(cat_X, dtype=np.int64)}

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step4.png)

In [14]:
# Run preproc on every row of every split. As the printed feature lists show, the
# returned 'label', 'cont_X' and 'cat_X' are added next to the original columns.
# NOTE(review): this rebinds `datasets`, so the un-mapped splits are no longer
# reachable under that name after the cell runs.
datasets = datasets.map(preproc)
datasets

Map:   0%|          | 0/79349 [00:00<?, ? examples/s]

Map:   0%|          | 0/9919 [00:00<?, ? examples/s]

Map:   0%|          | 0/9919 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer', 'label', 'cont_X', 'cat_X'],
        num_rows: 79349
    })
    val: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer', 'label', 'cont_X', 'cat_X'],
        num_rows: 9919
    })
    test: Dataset({
        features: ['model', 'year', 'price', 'transmission', 'mileage', 'fuel_type', 'road_tax', 'mpg', 'engine_size', 'manufacturer', 'label', 'cont_X', 'cat_X'],
        num_rows: 9919
    })
})

In [16]:
# Keep only the three columns produced by preproc in every split.
# NOTE(review): after this rebinding the raw columns are gone, so re-running the
# earlier map(preproc) cell without rebuilding `datasets` would presumably fail - verify.
datasets = datasets.select_columns(['label', 'cont_X', 'cat_X'])
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'cont_X', 'cat_X'],
        num_rows: 79349
    })
    val: Dataset({
        features: ['label', 'cont_X', 'cat_X'],
        num_rows: 9919
    })
    test: Dataset({
        features: ['label', 'cont_X', 'cat_X'],
        num_rows: 9919
    })
})

In [17]:
# First two processed rows: each column comes back as a list of per-row lists.
datasets['train'][:2]

{'label': [[15695.0], [15200.0]],
 'cont_X': [[2018.0, 11793.0, 145.0, 65.69999694824219, 1.600000023841858],
  [2016.0, 15440.0, 20.0, 67.30000305175781, 2.0]],
 'cat_X': [[87, 1, 0], [59, 1, 0]]}

In [18]:
# Switch the output format to 'torch': the same slice now comes back as tensors
# instead of Python lists (compare with the previous cell's output).
datasets = datasets.with_format('torch')
datasets['train'][:2]

{'label': tensor([[15695.],
         [15200.]]),
 'cont_X': tensor([[2.0180e+03, 1.1793e+04, 1.4500e+02, 6.5700e+01, 1.6000e+00],
         [2.0160e+03, 1.5440e+04, 2.0000e+01, 6.7300e+01, 2.0000e+00]]),
 'cat_X': tensor([[87,  1,  0],
         [59,  1,  0]])}

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step5.png)

In [19]:
from torch.utils.data import DataLoader

# One loader per split, all with batches of 128 rows. Only the training loader
# shuffles its rows and drops the last, incomplete batch.
dataloaders = {
    'train': DataLoader(dataset=datasets['train'], batch_size=128, drop_last=True, shuffle=True),
    'val': DataLoader(dataset=datasets['val'], batch_size=128),
    'test': DataLoader(dataset=datasets['test'], batch_size=128),
}

In [20]:
# Pull a single batch from the training loader to check its structure.
# NOTE(review): this displays all 128 rows of the batch; consider showing only the
# tensor shapes or a short slice to keep the output readable.
next(iter(dataloaders['train']))

{'label': tensor([[13799.],
         [20450.],
         [10495.],
         [10280.],
         [14800.],
         [ 8000.],
         [10495.],
         [17195.],
         [20992.],
         [16950.],
         [ 6995.],
         [20495.],
         [ 8514.],
         [12221.],
         [15999.],
         [12995.],
         [21990.],
         [ 8490.],
         [12490.],
         [28990.],
         [ 6750.],
         [ 9990.],
         [ 6950.],
         [ 8950.],
         [ 7998.],
         [32444.],
         [22950.],
         [14406.],
         [16990.],
         [24995.],
         [ 7490.],
         [31495.],
         [24000.],
         [48444.],
         [ 5599.],
         [22000.],
         [12585.],
         [12300.],
         [ 9695.],
         [10981.],
         [ 7676.],
         [ 8498.],
         [ 7990.],
         [11990.],
         [15298.],
         [23888.],
         [10999.],
         [37000.],
         [17250.],
         [10000.],
         [14691.],
         [20176.],
   

### 5.3.2 BatchNorm for Continuous Attributes

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step1.png)

In [21]:
import torch.nn as nn

# Take one training batch and compute, for each continuous attribute (axis 0 runs
# over the rows of the batch), its mean and its biased standard deviation
# (unbiased=False).
batch = next(iter(dataloaders['train']))
batch['cont_X'].mean(axis=0), batch['cont_X'].std(axis=0, unbiased=False)

(tensor([2.0170e+03, 2.5235e+04, 1.2258e+02, 5.3516e+01, 1.6953e+00]),
 tensor([1.9764e+00, 2.2630e+04, 6.1046e+01, 9.9440e+00, 5.8987e-01]))

In [22]:
# Batch-norm layer with one feature per continuous attribute.
bn_layer = nn.BatchNorm1d(num_features=len(cont_attr))

# Normalize the same batch and recompute the per-attribute statistics: the printed
# means are ~0 and the (biased) standard deviations are 1.
normalized_cont = bn_layer(batch['cont_X'])
normalized_cont.mean(axis=0), normalized_cont.std(axis=0, unbiased=False)

(tensor([-2.4028e-06,  5.2154e-08,  1.8626e-09, -1.0617e-07, -2.4214e-07],
        grad_fn=<MeanBackward1>),
 tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<StdBackward0>))

In [23]:
# Inspect the layer's parameters and buffers after that single forward pass.
# The printed running_mean is one tenth of the batch mean shown earlier, which is
# consistent with a momentum-0.1 update starting from zero - TODO confirm against
# the BatchNorm1d documentation.
bn_layer.state_dict()

OrderedDict([('weight', tensor([1., 1., 1., 1., 1.])),
             ('bias', tensor([0., 0., 0., 0., 0.])),
             ('running_mean',
              tensor([2.0170e+02, 2.5235e+03, 1.2258e+01, 5.3516e+00, 1.6953e-01])),
             ('running_var',
              tensor([1.2937e+00, 5.1613e+07, 3.7649e+02, 1.0866e+01, 9.3507e-01])),
             ('num_batches_tracked', tensor(1))])