In [None]:
import csv

import numpy as np
import torch

# Load the semicolon-separated wine table as a float32 NumPy array.
wine_path = '../data/p1ch4/tabular-wine/winequality-white.csv'
wine_numpy = np.loadtxt(
    wine_path,
    delimiter=';',
    skiprows=1,  # the first row holds the column names, not numeric data
    dtype=np.float32,
)
wine_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [None]:
# Read only the header row to recover the column names. The context manager
# guarantees the file handle is closed once the row has been read, and
# newline='' lets the csv module do its own newline handling.
with open(wine_path, newline='') as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=';'))

wine_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [None]:
# Convert the NumPy array into a PyTorch tensor; the float32 dtype carries over.
wineq = torch.from_numpy(wine_numpy)
(wineq.shape, wineq.dtype)

(torch.Size([4898, 12]), torch.float32)

In [None]:
# Every column except the last is a feature; the last column is the quality score.
data, target = wineq[:, :-1], wineq[:, -1]

(data.shape, target.shape)

(torch.Size([4898, 11]), torch.Size([4898]))

In [None]:
# Show the float scores next to their integer (int64) version.
(target, target.to(torch.long))

(tensor([6., 6., 6.,  ..., 6., 7., 6.]), tensor([6, 6, 6,  ..., 6, 7, 6]))

In [None]:
# One-hot encode the quality scores: each row gets a 1 in the column whose
# index equals that sample's score. The width of 10 is hard-coded.
# NOTE(review): assumes every score is an integer in [0, 9] - confirm.
target_index = target.long().unsqueeze(1)  # shape (N, 1), as scatter_ needs an index per row
target_onehot = torch.zeros(target.size(0), 10).scatter_(1, target_index, 1.)
target_onehot

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
# Per-column statistics: reducing over dim 0 collapses the sample axis.
data_mean = data.mean(dim=0)
data_std = data.std(dim=0)
(data_mean, data_std)

(tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
         1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01]),
 tensor([8.4387e-01, 1.0079e-01, 1.2102e-01, 5.0721e+00, 2.1848e-02, 1.7007e+01,
         4.2498e+01, 2.9909e-03, 1.5100e-01, 1.1413e-01, 1.2306e+00]))

In [None]:
# Standardise each column: subtract its mean, then divide by its standard deviation.
data_norm = data.sub(data_mean).div(data_std)

In [None]:
# Boolean mask that is True for samples whose quality score is 3 or lower.
bad_indexes = torch.le(target, 3)
(bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum())

(torch.Size([4898]), torch.bool, tensor(20))

In [None]:
# Keep only the rows flagged by the boolean mask; all feature columns are retained.
bad_wines = data[bad_indexes, :]
bad_wines.shape

torch.Size([20, 11])

In [None]:
# Split the samples into three quality bands using boolean masks on the score.
bad_mask = torch.le(target, 3)
mid_mask = torch.gt(target, 3) & torch.lt(target, 7)
good_mask = torch.ge(target, 7)

bad_quality = data[bad_mask]
mid_quality = data[mid_mask]
good_quality = data[good_mask]

# Column-wise mean of every feature within each band.
badq_mean = bad_quality.mean(dim=0)
midq_mean = mid_quality.mean(dim=0)
goodq_mean = good_quality.mean(dim=0)

# One row per feature: index, name, then the bad / mid / good means.
# zip stops at the shortest input, so the trailing 'quality' name in col_list
# (which has no matching mean) is not printed.
row_format = '{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'
for i, (name, bad, mid, good) in enumerate(zip(col_list, badq_mean, midq_mean, goodq_mean)):
    print(row_format.format(i, name, bad, mid, good))


 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [None]:
# Threshold: the mid-quality mean of total sulfur dioxide, copied from the
# printed table of per-band means.
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]  # column 6 is 'total sulfur dioxide'

# Flag a wine as predicted-good when its total sulfur dioxide is below the threshold.
predicted_indexes = total_sulfur_data < total_sulfur_threshold

(predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum())

(torch.Size([4898]), torch.bool, tensor(2727))

In [None]:
# Ground truth mask: True for wines whose quality score is above 5.
actual_indexes = torch.gt(target, 5)

(actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum())

(torch.Size([4898]), torch.bool, tensor(3258))

In [None]:
# Count the predicted positives, the actual positives, and their overlap.
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()
n_matches = (predicted_indexes & actual_indexes).sum().item()

# (hits, share of predictions that were right, share of actual positives found)
(n_matches, n_matches / n_predicted, n_matches / n_actual)

(2018, 0.74000733406674, 0.6193984039287906)