<a href="https://colab.research.google.com/github/ibacaraujo/deep-learning-with-pytorch/blob/master/ch3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 3

In [8]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
import torch
import numpy as np
import csv

## 3.1 Tabular data

In [0]:
wine_path = "drive/My Drive/dl-pytorch/winequality-white.csv"
wineq_numpy = np.loadtxt(wine_path, dtype=np.float32, delimiter=";",
                         skiprows=1)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [0]:
col_list = next(csv.reader(open(wine_path), delimiter=";"))

In [0]:
wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [0]:
# convert the NumPy array to a PyTorch tensor
wineq = torch.from_numpy(wineq_numpy)

In [0]:
wineq.shape, wineq.type()

(torch.Size([4898, 12]), 'torch.FloatTensor')

In [0]:
data = wineq[:, :-1]

In [0]:
data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [0]:
target = wineq[:, -1].long()

In [0]:
target, target.shape

(tensor([6, 6, 6,  ..., 6, 7, 6]), torch.Size([4898]))

In [0]:
target_onehot = torch.zeros(target.shape[0], 10)
target_onehot.scatter_(1, target.unsqueeze(1), 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [0]:
target

tensor([6, 6, 6,  ..., 6, 7, 6])

In [0]:
target_unsqueezed = target.unsqueeze(1)
target_unsqueezed

tensor([[6],
        [6],
        [6],
        ...,
        [6],
        [7],
        [6]])

In [0]:
target_unsqueezed_in_zero = target.unsqueeze(0)
target_unsqueezed_in_zero

tensor([[6, 6, 6,  ..., 6, 7, 6]])

In [0]:
# dim=0 means that reduction is performed at dimension 0
data_mean = torch.mean(data, dim=0)
data_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])

In [0]:
data_var = torch.var(data, dim=0)

In [0]:
# normalize the data
data_normalized = (data - data_mean) / torch.sqrt(data_var)
data_normalized

tensor([[ 1.7209e-01, -8.1764e-02,  2.1325e-01,  ..., -1.2468e+00,
         -3.4914e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7991e-02,  ...,  7.3992e-01,
          1.3467e-03, -8.2418e-01],
        [ 1.4756e+00,  1.7448e-02,  5.4378e-01,  ...,  4.7502e-01,
         -4.3677e-01, -3.3662e-01],
        ...,
        [-4.2042e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3131e+00,
         -2.6152e-01, -9.0544e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0048e+00,
         -9.6250e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7502e-01,
         -1.4882e+00,  1.0448e+00]])

In [0]:
# to tell good and bad wines apart at a glance
bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [0]:
# using advanced indexing to use bad_indexes to get the bad wines examples
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [0]:
bad_data = data[torch.le(target, 3)]
mid_data = data[torch.gt(target, 3) & torch.lt(target, 7)]
good_data = data[torch.ge(target, 7)]

In [0]:
bad_mean = torch.mean(bad_data, dim=0)
mid_mean = torch.mean(mid_data, dim=0)
good_mean = torch.mean(good_data, dim=0)

In [0]:
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
  print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [0]:
# now get the indexes in which the total sulfur dioxide column 
# is below the mid-point
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]
# less than because less than this value are good ones, as observed in the table
predicted_indexes = torch.lt(total_sulfur_data, total_sulfur_threshold)

In [0]:
predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [0]:
# next to get the indices of the good wines
actual_indexes = torch.gt(target, 5)
actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [0]:
n_matches = torch.sum(actual_indexes & predicted_indexes)
print(n_matches)
n_matches = torch.sum(actual_indexes & predicted_indexes).item()
print(n_matches)

tensor(2018)
2018


In [0]:
n_matches = torch.sum(actual_indexes & predicted_indexes).item()
n_predicted = torch.sum(predicted_indexes).item()
n_actual = torch.sum(actual_indexes).item()

In [0]:
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)

## 3.2 Time series

In [0]:
bikes_numpy = np.loadtxt('drive/My Drive/dl-pytorch/bike-sharing-dataset/hour-fixed.csv',
                         dtype=np.float32,
                         delimiter=",",
                         skiprows=1,
                         converters={1: lambda x: float(x[8:10])})


In [0]:
bikes = torch.from_numpy(bikes_numpy)
bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

In [0]:
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [0]:
daily_bikes = bikes.view(-1, 24, bikes.shape[1])
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

In [0]:
daily_bikes = daily_bikes.transpose(1, 2)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

In [0]:
first_day = bikes[:24].long()
weather_onehot = torch.zeros(first_day.shape[0], 4)
first_day[:, 9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [0]:
weather_onehot.scatter(
    dim=1,
    index=first_day[:,9].unsqueeze(1) - 1,
    value=1.0)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [0]:
torch.cat((bikes[:24], weather_onehot), 1)[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  0.0000,  0.0000,  0.0000,  0.0000]])

In [0]:
daily_weather_onehot = torch.zeros(daily_bikes.shape[0], 4,
                                   daily_bikes.shape[2])
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [0]:
daily_weather_onehot.scatter(1, daily_bikes[:,9,:].long().unsqueeze(1)-1,
                             1.0)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [0]:
daily_bikes = torch.cat((daily_bikes, daily_weather_onehot), dim=1)

In [0]:
daily_bikes[:,9,:] = (daily_bikes[:, 9, :] - 1.0) / 3.0

In [0]:
temp = daily_bikes[:, 10, :]
temp_min = torch.min(temp)
temp_max = torch.max(temp)
daily_bikes[:, 10, :] = (daily_bikes[:, 10, :] - temp_min) / (temp_max - 
    temp_min)

In [0]:
temp = daily_bikes[:, 10, :]
daily_bikes[:, 10, :] = (daily_bikes[:, 10, :] - torch.mean(temp) / 
    torch.std(temp))

## 3.3 Text

In [10]:
!ls 'drive/My Drive/dl-pytorch/'

1342-0.txt	      ourpoints.hdf5  winequality-white.csv
bike-sharing-dataset  ourpoints.t


In [0]:
with open('drive/My Drive/dl-pytorch/1342-0.txt', encoding='utf-8') as f:
  text = f.read()

We need now to parse each character in the text and provide a one-hot encoding for each of them.

In [15]:
# First, split the text into a list of lines
lines = text.split('\n')
# pick arbitrary line to focus on
line = lines[200]
line

'      Michaelmas, and some of his servants are to be in the house by'

In [16]:
# Create tensor that can hold the total number of one-hot encoded
# characters for the whole line
letter_tensor = torch.zeros(len(line), 128)
letter_tensor.shape

torch.Size([68, 128])

In [0]:
# Set 1 in each row at the right position
# so that each row represents the right character
for i, letter in enumerate(line.lower().strip()):
  letter_index = ord(letter) if ord(letter) < 128 else 0
  letter_tensor[i][letter_index] = 1

In [0]:
def clean_words(input_str):
  punctuation = '.,;:"!?_-'
  word_list = input_str.lower().replace('\n', ' ').split()
  word_list = [word.strip(punctuation) for word in word_list]
  return word_list

In [20]:
words_in_line = clean_words(line)
line, words_in_line

('      Michaelmas, and some of his servants are to be in the house by',
 ['michaelmas',
  'and',
  'some',
  'of',
  'his',
  'servants',
  'are',
  'to',
  'be',
  'in',
  'the',
  'house',
  'by'])

In [21]:
# Build a mapping of words to indexes in your encoding
word_list = sorted(set(clean_words(text)))
word2index_dict = {word: i for (i, word) in enumerate(word_list)}

len(word2index_dict), word2index_dict['impossible']

(8497, 3814)

In [22]:
word_tensor = torch.zeros(len(words_in_line), len(word2index_dict))
for i, word in enumerate(words_in_line):
  word_index = word2index_dict[word]
  word_tensor[i][word_index] = 1
  print('{:2} {:4} {}'.format(i, word_index, word))

print(word_tensor.shape)

 0 4738 michaelmas
 1  464 and
 2 6854 some
 3 5134 of
 4 3624 his
 5 6617 servants
 6  576 are
 7 7422 to
 8  791 be
 9 3841 in
10 7318 the
11 3667 house
12 1084 by
torch.Size([13, 8497])
