### 4.0.1 Installing packages

In [2]:
import torch
import numpy as np

## 4.1 Working with images

### 4.1.2 Loading an Image File

In [34]:
import imageio

img_arr= imageio.imread("C:/Users/Keitaro Ninomiya/Box/PyTorch Practice/bobby.jpg")
img_arr.shape

(720, 1280, 3)

In [35]:
img = torch.from_numpy(img_arr)
out = img.permute(2,0,1)

In [47]:
batch_size = 3
batch = torch.zeros(batch_size, 3, 256, 256, dtype = torch.uint8)

In [53]:
import os

data_dir = "https://github.com/deep-learning-with-pytorch/dlwpt-code/tree/master/data/p1ch4/image-cats"
filenames = [name for name in os.listdir(data_dir)
            if os.path.splittext(name)[-1]==".png"]
for i, filename in enumerate(filenames):
    img_arr = torch.from_numpy(img_arr)
    img_t = img_t.permute(2,0,1)
    img_t = img_t[:3]
    batch[i] = img_t

OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'https://github.com/deep-learning-with-pytorch/dlwpt-code/tree/master/data/p1ch4/image-cats'

### 4.0.2 Downloading CSV data

In [23]:
import csv

wine_path="C:/Users/Keitaro Ninomiya/Box/PyTorch Practice/Chapter 4/winequality-white.csv"
wineq_numpy=np.loadtxt(wine_path,dtype=np.float32, delimiter=",",
                      skiprows=1)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

## 4.3 Representing tabular data

### 4.3.1 Using a real-world dataset

We're gonna try converting real-life data to tensor format to facilitate data manipulation.

As example, lets use wine data.
The data is composed of different wines(5000 obs) and its characteristics(12 dims).

In [8]:
col_list = next(csv.reader(open(wine_path),delimiter=","))
wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [24]:
wineq = torch.from_numpy(wineq_numpy)
wineq.shape, wineq.dtype

(torch.Size([4898, 12]), torch.float32)

### 4.3.3 Representing scores

There are two types of representation of scores.
First one is the continuous type.
This is useful for data with cardinal information(wine quality).

The other one is the discrete type. This can be represented in one-hot encoding format.

In [26]:
data = wineq[:,:-1]
data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [27]:
target = wineq[:, -1]
target,target.shape

(tensor([6., 6., 6.,  ..., 6., 7., 6.]), torch.Size([4898]))

In [37]:
target=wineq[:,-1].long()
target

tensor([6, 6, 6,  ..., 6, 7, 6])

tensor([[6, 6, 6,  ..., 6, 7, 6]])

### 4.3.4 One-Hot Encoding

One-hot encoding requires outcome variables to be represented by multi-columns with 0/1 binary information.

E.G. {$y_1$=Hot,$y_2$=Cold} can be represented as {$y_1$=[1,0],$y_2$=[0,1]}

$scatter$ function does the convertion. But to perform scattering, the "dimensions" of the argument dataset and output dataset must match.

In [33]:
#Creating one-hot encoded  vars
target_onehot = torch.zeros(target.shape[0],10)
target_onehot.scatter_(1,target.unsqueeze(1),1.0)


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [27]:
target_unsqueezed = target.unsqueeze(1)
target_unsqueezed

tensor([[6],
        [6],
        [6],
        ...,
        [6],
        [7],
        [6]])

### 4.3.5 When to Categorize

In [29]:
data_mean = torch.mean(data, dim=0)
data_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])

In [31]:
data_var=torch.var(data, dim=0)
data_var

tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])

In [33]:
data_normalized = (data - data_mean)/torch.sqrt(data_var)
data_normalized

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

### 4.3.6 Finding thresholds
Now we are going to actually create a function that categorizes input data.

In [34]:
#Wine data, <=3 quality is defined as bad
bad_indexes = target <= 3
bad_indexes.shape ,bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [36]:
bad_data=data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [66]:
bad_data = data[target <= 3]
mid_data = data[(target > 3) & (target < 7)]
good_data = data[target >= 7]

bad_mean=torch.mean(bad_data,dim=0)
mid_mean=torch.mean(mid_data,dim=0)
good_mean= torch.mean(good_data,dim=0)

for i,args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i,*args))

NameError: name 'col_list' is not defined

Now we can check the precision of prediction.

In [48]:
total_sulfur_threshold = 141.83
total_sulfur_data = data[:,6]
predicted_indexes = torch.lt(total_sulfur_data, total_sulfur_threshold)

predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [46]:
actual_indexes = target >5

actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [49]:
torch.sum(actual_indexes & predicted_indexes)

tensor(2018)

In [43]:
n_matches = torch.sum(actual_indexes & predicted_indexes).item()
n_predicted = torch.sum(predicted_indexes).item()
n_actual = torch.sum(actual_indexes).item()

n_matches, n_matches/n_predicted, n_matches/n_actual

(2018, 0.74000733406674, 0.6193984039287906)

## 4.4 Working with time series
We can convert the time dimension of time series data with respect to time units (yrs, hours).

In [3]:
bikes_numpy = np.loadtxt(
    "C:/Users/Keitaro Ninomiya/Box/PyTorch Practice/Chapter 4/hour-fixed.csv",
    dtype=np.float32, 
    delimiter=",",
    skiprows=1,
    converters={1: lambda x: float(x[8:10])})
bikes = torch.from_numpy(bikes_numpy)
bikes

tensor([[1.0000e+00, 1.1000e+01, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.1000e+01, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.1000e+01, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 1.2000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 1.2000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 1.2000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

### 4.4.2 Shaping the data by time period

In [7]:
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [51]:
daily_bikes = bikes.view(-1, 24, bikes.shape[1])
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

### 4.4.3 Ready for training

In [18]:
first_day= bikes[:24].long() #long() cuts off decimal numbers.
weather_onehot = torch.zeros(first_day.shape[0], 4)
first_day[:,9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [12]:
weather_onehot.scatter_(dim=1,
                        index=first_day[:,9].unsqueeze(1).long()-1,
                        value=1.0)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [59]:
torch.cat((bikes[:24],weather_onehot),1)[:1]

tensor([[ 1.0000, 11.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  0.0000,  0.0000,  0.0000,  0.0000]])

Casting can be defined using different columns as reference.

In [15]:
daily_weather_onehot = torch.zeros(daily_bikes.shape[0], 4,
                                  daily_bikes.shape[2])
daily_weather_onehot.shape

torch.Size([730, 4, 17])

In [16]:
daily_bikes = torch.cat((daily_bikes, daily_weather_onehot), dim=1)

In [21]:
daily_bikes[:,9,:] = (daily_bikes[:,9,:]-1.0)/3.0

In [65]:
temp = daily_bikes[:, 10, :]
temp_min = torch.min(temp)
temp_max = torch.max(temp)
daily_bikes[:, 10, :] =((daily_bikes[:, 10, :]- temp_min)
                        /(temp_max - temp_min))

## 4.5 Representing Text

In [24]:
with open("C:/Users/Keitaro Ninomiya/Box/PyTorch Practice/Chapter 4/1342-0.txt",encoding="utf8") as f:
    text= f.read()

In [25]:
lines = text.split("\n")
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [26]:
letter_t = torch.zeros(len(line),128)
letter_t.shape

torch.Size([70, 128])

In [27]:
for i,letter in enumerate(line.lower().strip()):
    letter_index = ord(letter) if ord(letter) <128 else 0
    letter_t[i][letter_index] = 1

In [29]:
def clean_words(input_str):
    punctuation = '.,;:"!?”“_-'
    word_list = input_str.lower().replace("\n","").split()
    word_list = [word.strip(punctuation) for word in word_list]
    return word_list

words_in_line = clean_words(line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [31]:
word_list = sorted(set(clean_words(text)))
word2index_dict = {word: i for (i, word) in enumerate(word_list)}

len(word2index_dict), word2index_dict["impossible"]

(15514, 6925)