# Chapter 4 - In Text Examples and Musings

#### 4.1

In [1]:
import imageio

In [2]:
img_arr = imageio.imread('/home/kamil/_LEARNING/dlwpt-code/data/p1ch4/image-dog/bobby.jpg')

In [3]:
img_arr.shape

(720, 1280, 3)

In [4]:
# we can change the image

In [5]:
import torch

In [7]:
img = torch.from_numpy(img_arr)

In [9]:
img.shape

torch.Size([720, 1280, 3])

In [10]:
out = img.permute(2,0,1)

In [12]:
out.shape

torch.Size([3, 720, 1280])

In [13]:
from pathlib import Path

In [14]:
data_dir = Path('/home/kamil/_LEARNING/dlwpt-code/data/p1ch4/image-cats/')

In [15]:
batch_size = 3

In [16]:
batch = torch.zeros(batch_size, 3, 256, 256, dtype=torch.uint8)

In [19]:
# Fill `batch` with one image per slot. The files are visited in sorted order:
# Path.glob makes no promise about ordering, so an unsorted loop could assign the
# images to different batch indices from one run (or machine) to the next.
for i, filename in enumerate(sorted(data_dir.glob('*.*'))):
    img_arr = imageio.imread(filename)
    img_t = torch.from_numpy(img_arr)
    img_t = img_t.permute(2, 0, 1)  # Height, Width, Channel -> Channel, Height, Width
    img_t = img_t[:3]  # keep only the first three channels, dropping extras such as alpha
    # NOTE(review): assumes every image matches the 3x256x256 slot and that the folder
    # holds at most `batch_size` files -- neither is checked here.
    batch[i] = img_t

In [20]:
batch = batch.float()

In [21]:
batch = batch / 255.0

In [25]:
# Standardize every channel independently (z-score), writing the result back into `batch`.
n_channels = batch.shape[1]  # dimension 1 of the batch is the channel axis
for c in range(n_channels):
    channel = batch[:, c]  # all images, all rows, all columns of channel c
    mean = channel.mean()  # one scalar over the whole channel
    std = channel.std()  # matching standard deviation
    # after this assignment the channel has zero mean and unit standard deviation
    batch[:, c] = (channel - mean) / std

In [26]:
# 4.2.1 volumetric data

In [27]:
dir_path = Path('/home/kamil/_LEARNING/dlwpt-code/data/p1ch4/volumetric-dicom/2-LUNG 3.0  B70f-04083/')

In [28]:
vol_arr = imageio.volread(dir_path, 'DICOM')

Reading DICOM (examining files): 99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 99/99  (100.0%)


In [29]:
vol_arr.shape

(99, 512, 512)

In [30]:
# we have 99 images, each 512x512; there is no color channel in this case because the data is greyscale, so that dimension is omitted

In [31]:
# pytorch is expecting a channel dimension so we will need to add that in via unsqueeze

In [32]:
# Wrap the NumPy volume as a tensor and cast it to float32 in a single chained expression.
vol = torch.from_numpy(vol_arr).float()
vol.shape

torch.Size([99, 512, 512])

In [33]:
# Insert a size-1 dimension at position 0 to act as the channel axis
# (the shape gains a leading 1 in front of the existing dimensions).
vol = vol.unsqueeze(0)
vol.shape

torch.Size([1, 99, 512, 512])

#### 4.3 Tabular Data

In [34]:
BASE = Path('/home/kamil/_LEARNING/dlwpt-code/data/p1ch4/')

In [35]:
import csv

In [36]:
wine_path = BASE / 'tabular-wine' / 'winequality-white.csv'

In [37]:
wine_path.exists()

True

In [39]:
import numpy as np

In [40]:
wineq_numpy = np.loadtxt(wine_path, dtype=np.float32, delimiter=';', skiprows=1) # cast to float32 so tensor is correct type, skip the header

In [41]:
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [42]:
# get columns

In [43]:
# Read only the header row of the CSV to recover the column names.
# The `with` block closes the file handle as soon as the row is read (a bare
# open() inside the expression would leave it open), and newline='' is the mode
# the csv module asks for when it is handed a file object.
with open(wine_path, newline='') as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=';'))

In [44]:
col_list

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [45]:
wineq_numpy.shape

(4898, 12)

In [46]:
# load to tensors
wineq = torch.from_numpy(wineq_numpy)

In [49]:
wineq.shape, wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [50]:
# get score and features
data = wineq[:,:-1] # select all rows, but not the last column
data.shape

torch.Size([4898, 11])

In [51]:
target = wineq[:,-1] # select all rows, ONLY the last column

In [52]:
target.shape

torch.Size([4898])

In [53]:
target

tensor([6., 6., 6.,  ..., 6., 7., 6.])

In [54]:
# we have 2 options: regress to get a score or classify to get a score
# let's one-hot encode

In [55]:
# convert to int
target = target.long()

In [56]:
target

tensor([6, 6, 6,  ..., 6, 7, 6])

In [57]:
# One row per sample and 10 columns. The integer scores in `target` are used directly
# as column indices by the scatter call further down, so the width has to exceed the
# largest score. NOTE(review): 10 assumes scores fall in 0-9 -- confirm against the dataset.
target_onehot = torch.zeros(target.shape[0], 10)

In [58]:
target_onehot

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [59]:
target_onehot.scatter(1, target.unsqueeze(1), 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [63]:
target

tensor([6, 6, 6,  ..., 6, 7, 6])

In [62]:
target.unsqueeze(1)

tensor([[6],
        [6],
        [6],
        ...,
        [6],
        [7],
        [6]])

In [64]:
target_onehot = target_onehot.scatter(1, target.unsqueeze(1), 1.0)
# or use scatter_

In [65]:
target_onehot

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [66]:
# normalize the data

In [74]:
data[:,0].shape

torch.Size([4898])

In [71]:
data_mean = data.mean(dim=0) # dim=0 makes us calculate for each column, without it we would get a single value
data_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])

In [70]:
data_std = data.std(dim=0)
data_std

tensor([8.4387e-01, 1.0079e-01, 1.2102e-01, 5.0721e+00, 2.1848e-02, 1.7007e+01,
        4.2498e+01, 2.9909e-03, 1.5100e-01, 1.1413e-01, 1.2306e+00])

In [75]:
data_normd = (data - data_mean)/data_std
data_normd

tensor([[ 1.7209e-01, -8.1764e-02,  2.1325e-01,  ..., -1.2468e+00,
         -3.4914e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7991e-02,  ...,  7.3992e-01,
          1.3467e-03, -8.2418e-01],
        [ 1.4756e+00,  1.7448e-02,  5.4378e-01,  ...,  4.7502e-01,
         -4.3677e-01, -3.3662e-01],
        ...,
        [-4.2042e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3131e+00,
         -2.6152e-01, -9.0544e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0048e+00,
         -9.6250e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7502e-01,
         -1.4882e+00,  1.0448e+00]])

In [None]:
# finding thresholds


#### 4.4 Time Series

In [84]:
# Load the hourly bike-sharing table as float32, skipping the header row.
# Column 1 cannot be parsed as a number directly, so its converter keeps only
# characters 8-9 of the field (presumably the day of month of a YYYY-MM-DD date -- confirm).
bikes_np = np.loadtxt(
    BASE / 'bike-sharing-dataset' / 'hour-fixed.csv',
    dtype=np.float32,
    delimiter=',',
    skiprows=1,
    converters={1: lambda field: float(field[8:10])},
)
bikes_np

array([[1.0000e+00, 1.0000e+00, 1.0000e+00, ..., 3.0000e+00, 1.3000e+01,
        1.6000e+01],
       [2.0000e+00, 1.0000e+00, 1.0000e+00, ..., 8.0000e+00, 3.2000e+01,
        4.0000e+01],
       [3.0000e+00, 1.0000e+00, 1.0000e+00, ..., 5.0000e+00, 2.7000e+01,
        3.2000e+01],
       ...,
       [1.7377e+04, 3.1000e+01, 1.0000e+00, ..., 7.0000e+00, 8.3000e+01,
        9.0000e+01],
       [1.7378e+04, 3.1000e+01, 1.0000e+00, ..., 1.3000e+01, 4.8000e+01,
        6.1000e+01],
       [1.7379e+04, 3.1000e+01, 1.0000e+00, ..., 1.2000e+01, 3.7000e+01,
        4.9000e+01]], dtype=float32)

In [85]:
bikes  = torch.from_numpy(bikes_np)
bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

In [87]:
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [88]:
# that is 17,520 hourly observations of 17 features

In [89]:
# let's reshape it so that we have 3-dimensions, batch, 24 hours, 17 features
daily_bikes = bikes.view(-1, 24, bikes.shape[1]) # -1 --> make it fit, 24 hours , bikes.shape[1] --> 17
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

In [90]:
# pausing for now. next step is to one-hot encode the weather data

In [93]:
first_day = bikes[:24].long() # slice off the first 24 rows (the first day of hourly records) and convert to ints
first_day.shape # 2-D result, (24, 17): slicing rows of `bikes` does not add a leading batch dimension

torch.Size([24, 17])

In [97]:
weather_onehot = torch.zeros(first_day.shape[0],4) # should be 24, 4 --> and the 4 are for our 4 weather classifications of 1,2,3,4 
weather_onehot.shape

torch.Size([24, 4])

In [98]:
first_day[:,9] # view a slice of all hours but only the index=9 (i.e. 10th) column, the weather column

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [103]:
# One-hot encode the first day's weather codes into `weather_onehot`.
# - the codes live in column 9 of `first_day`; unsqueeze(1) turns them into a
#   column vector [[1], [1], ..., [2], [2]] as scatter_ requires
# - subtracting 1 shifts the 1-based weather codes to 0-based column indices
# - scatter_ works in place along dim=1 and writes 1.0 at each indexed position
weather_index = first_day[:, 9].unsqueeze(1).long() - 1
weather_onehot = weather_onehot.scatter_(dim=1, index=weather_index, value=1.0)
weather_onehot

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [104]:
#add this result back to our data
torch.cat((bikes[:24],weather_onehot),1)[:1] # concatenate the first day with the one-hot encoded data, adding it along the column axis

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [105]:
# the last 4 entries are [1,0,0,0] which correspond to our one-hot encoded 1 value

In [106]:
## to do this for the entire dataset we could follow the same procedure

In [109]:
# build our daily weather slice
daily_weather_one_hot = torch.zeros(daily_bikes.shape[0], 4, daily_bikes.shape[1]) # all the days, 4 columns, 24-hours
daily_weather_one_hot.shape

torch.Size([730, 4, 24])

In [108]:
daily_bikes.shape

torch.Size([730, 24, 17])

In [121]:
daily_weather_one_hot = daily_weather_one_hot.scatter(
    dim=1,
    index=daily_bikes[:,:,9].unsqueeze(1).long()-1, # all days, all hours, (10th column) : give extra dimension, convert to int, subtract 1 for indexing
    value=1.0)
daily_weather_one_hot.shape

torch.Size([730, 4, 24])

In [125]:
daily_bikes.shape, daily_weather_one_hot.shape

(torch.Size([730, 24, 17]), torch.Size([730, 4, 24]))

In [131]:
# Rearrange the one-hot weather tensor from (day, class, hour) to (day, hour, class)
# so it lines up with `daily_bikes`, which is laid out as (day, hour, feature).
# This must be an axis swap: torch.reshape only re-reads the same memory under a new
# shape, which yields the right shape but scrambles which class/hour each value
# belongs to. transpose(1, 2) exchanges the two axes and keeps every one-hot row intact.
# The variable name is kept because the concatenation cell refers to it.
daily_weather_one_hot_reshape = daily_weather_one_hot.transpose(1, 2)
daily_weather_one_hot_reshape.shape

torch.Size([730, 24, 4])

In [132]:
# add the one hot encoded weather as a column
daily_bikes_one_hot = torch.cat((daily_bikes, daily_weather_one_hot_reshape),dim=2) # add one hot to daily bikes as a column

daily_bikes_one_hot.shape

torch.Size([730, 24, 21])

## 4.5 Natural Language Processing

In [154]:
with open(BASE/'jane-austen'/'1342-0.txt', encoding='utf8') as f:
    text = f.read()

In [155]:
lines = text.split('\n')
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [156]:
# use ascii, so 128 character limit
letter_t = torch.zeros(len(line),128)
letter_t.shape

torch.Size([70, 128])

In [157]:
# this is now a matrix that will hold one one-hot encoded letter per row
# we have 70 characters in the string, which will be our rows; then we encode
# those characters using our limited 128-character set

In [158]:
# One-hot encode the line character by character: row i describes the i-th character.
for i, letter in enumerate(line.lower().strip()):
    code_point = ord(letter)  # Unicode code point of the character
    # only the first 128 code points (the ASCII range) get their own column;
    # anything outside that range is mapped to column 0
    letter_index = code_point if code_point < 128 else 0
    letter_t[i, letter_index] = 1  # mark the column for this character, the rest of the row stays 0

In [159]:
ord?

Signature: ord(c, /)
Docstring: Return the Unicode code point for a one-character string.
Type:      builtin_function_or_method


In [165]:
def clean_words(input_str):
    """Split text into lowercase words with surrounding punctuation removed.

    The text is lowercased, newlines are turned into spaces, and the result is
    split on whitespace. Each token then has the characters in `punctuation`
    stripped from both ends; punctuation inside a token is left alone, and a
    token made up only of punctuation becomes an empty string.

    Args:
        input_str: the text to tokenize.

    Returns:
        A list of cleaned tokens in their original order.
    """
    punctuation = '.,;:"!?“”_-'
    normalized = input_str.lower().replace('\n', ' ')
    return [token.strip(punctuation) for token in normalized.split()]

In [166]:
words_in_line = clean_words(line)
words_in_line

['impossible',
 'mr',
 'bennet',
 'impossible',
 'when',
 'i',
 'am',
 'not',
 'acquainted',
 'with',
 'him']

In [170]:
word_list = sorted(set(clean_words(text)))

In [171]:
word2index_dict = {word: i for (i,word) in enumerate(word_list)}

In [172]:
len(word2index_dict), word2index_dict['impossible']

(7261, 3394)

In [173]:
# One-hot encode the words of the line against the vocabulary dictionary:
# one row per word in the line, one column per vocabulary entry.
vocab_size = len(word2index_dict)
word_t = torch.zeros(len(words_in_line), vocab_size)
for i, word in enumerate(words_in_line):  # position in the line selects the row
    word_index = word2index_dict[word]  # vocabulary index selects the column
    word_t[i, word_index] = 1
    print('{:2} {:4} {}'.format(i, word_index, word))

 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
