In [13]:
import csv
import numpy as np
import torch

In [2]:
# List the files available in the data directory.
!ls data/

ourpoints.hdf5        ourpoints.t           winequality-white.csv


In [6]:
# Relative path of the white-wine quality CSV used throughout the notebook.
wine_path = "data/winequality-white.csv"


In [10]:
# Load the semicolon-delimited CSV into a float32 NumPy array.
# The first row is skipped because it holds the column names.
wineq_numpy = np.loadtxt(
    wine_path,
    delimiter=";",
    skiprows=1,
    dtype=np.float32,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [9]:
# Read only the header row to obtain the column names.
# The context manager closes the file handle as soon as the row is read;
# newline='' is the mode the csv module expects for files it parses.
with open(wine_path, newline='') as wine_file:
    col_list = next(csv.reader(wine_file, delimiter=';'))
col_list

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [15]:
# Convert the NumPy array into a PyTorch tensor and check its shape and type.

wineq = torch.from_numpy(wineq_numpy)
wineq.shape, wineq.type()

# Note: the result is a torch.FloatTensor containing all columns, including the last one, which holds the quality score.

(torch.Size([4898, 12]), 'torch.FloatTensor')

In [17]:
# Features: all rows, every column except the last one.
data = wineq[:, :-1]
data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [18]:
# Target: all rows, last column only (the quality score), which yields a 1D tensor.
target = wineq[:, -1]
target, target.shape

(tensor([6., 6., 6.,  ..., 6., 7., 6.]), torch.Size([4898]))

In [20]:
# Allocate an all-zero matrix with one row per sample and 10 columns;
# the column matching each quality score is set to 1 in a later cell.
target_onehot = torch.zeros((target.size(0), 10))
target_onehot, target_onehot.shape

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 torch.Size([4898, 10]))

In [25]:
# Quality scores as an int64 column vector, ready to be used as scatter_ indices.
index = target.unsqueeze(1).long()
index

tensor([[6],
        [6],
        [6],
        ...,
        [6],
        [7],
        [6]])

In [26]:
# scatter_ requires the index tensor to hold int64 values.
# Arguments: dim=1 (write along the columns), index (one column position per row),
# and 1.0 (the value written at each of those positions). The update is in place.

target_onehot.scatter_(1, index, 1.0)
target_onehot, target_onehot.shape

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 torch.Size([4898, 10]))

In [28]:
# Display the quality scores again for reference.
target

tensor([6., 6., 6.,  ..., 6., 7., 6.])

In [31]:
print(target, target.shape)
target_unsqueezed = torch.unsqueeze(target, 1)
target_unsqueezed, target_unsqueezed.shape

# unsqueeze inserts a size-1 dimension: the 1D tensor of 4898 elements becomes a 2D tensor of shape (4898, 1) with the same contents.

tensor([6., 6., 6.,  ..., 6., 7., 6.]) torch.Size([4898])


(tensor([[6.],
         [6.],
         [6.],
         ...,
         [6.],
         [7.],
         [6.]]),
 torch.Size([4898, 1]))

In [35]:
# Per-column mean and variance of the features (reduction over the rows, dim 0).
data_mean = data.mean(dim=0)
print(data_mean, data_mean.shape)

data_var = data.var(dim=0)
data_var, data_var.shape


tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01]) torch.Size([11])


(tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
         1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00]),
 torch.Size([11]))

In [37]:

# Standardize each column: subtract its mean and divide by its standard deviation
# (the square root of the variance), which helps with the learning process.

data_normalized = (data - data_mean) / data_var.sqrt()
data_normalized

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

In [39]:
# From target, flag the entries that are less than or equal to 3 using torch.le().

bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

# bad_indexes has the same shape as target but holds booleans (True where the quality is <= 3).
# .sum() counts the True values, so the answer is 20.

(torch.Size([4898]), torch.bool, tensor(20))

In [40]:
# Boolean-mask indexing: keep only the rows of data where bad_indexes is True.
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [47]:
# Split the samples into three groups by quality score:
# bad (<= 3), mid (> 3 and < 7) and good (>= 7).
bad_data = data[torch.le(target, 3)]
mid_data = data[torch.gt(target, 3) & torch.lt(target, 7)]
good_data = data[torch.ge(target, 7)]

# Per-column mean of each group.
bad_mean = torch.mean(bad_data, dim=0)
mid_mean = torch.mean(mid_data, dim=0)
good_mean = torch.mean(good_data, dim=0)

# The header is a plain string with no placeholders, so it is printed as-is.
# It must not reference i or args: those names only exist once the loop
# below has started, so using them here fails on a fresh kernel.
print('Idx column_name         bad_mean mid_mean good_mean')
# One row per feature: its index, its name, and its mean in each group.
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

Idx column_name         bad_mean mid_mean good_mean
 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [43]:
# Use the mid-quality mean of total sulfur dioxide (column 6) as the cut-off:
# wines strictly below it are predicted to be high-quality.
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]
predicted_indexes = total_sulfur_data < total_sulfur_threshold
predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

# Note: Your threshold implies that slightly more than half of the wines are going to be high-quality

(torch.Size([4898]), torch.bool, tensor(2727))

In [45]:
# Ground truth: wines whose quality score is above 5 count as good.
actual_indexes = target > 5
actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()


(torch.Size([4898]), torch.bool, tensor(3258))

### Note: 3258 wines actually have a quality score above 5, but the total-sulfur-dioxide threshold (taken from the mid-quality mean) flags only 2727 wines as good.

In [49]:
# Gap between the number of actually good wines (3258) and the number predicted to be good (2727).
3258 - 2727

531

In [52]:
# Size of the overlap between prediction and ground truth, and of each set on its own.
n_matches = (actual_indexes & predicted_indexes).sum().item()
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

print('n_matches, n_predicted, n_actual, (n_matches / n_predicted), (n_matches / n_actual)')
# The two ratios are the share of predictions that were right and the share of good wines that were found.
n_matches, n_predicted, n_actual, n_matches / n_predicted, n_matches / n_actual

n_matches, n_predicted, n_actual, (n_matches / n_predicted), (n_matches / n_actual)


(2018, 2727, 3258, 0.74000733406674, 0.6193984039287906)

### Note: You got around 2,000 wines right! Because you predicted about 2,700 wines to be high-quality, there is a 74 percent chance that a wine you predict to be high-quality actually is. Unfortunately, there are about 3,200 good wines and you identified only about 62 percent of them.