In [3]:
import numpy as np 
import pandas as pd 
import csv 
import os 

import torch 


# Keep printed tensors compact: 2 items at each edge of a truncated
# dimension, 2 digits of precision, and lines wrapped at 75 characters.
torch.set_printoptions(edgeitems=2, precision=2, linewidth=75)

In [4]:
# Locate the dataset relative to the parent of the current working directory.
path = os.getcwd()
parent_dir = "/".join(path.split(os.path.sep)[:-1])  # drop the last path component
DATA_PATH = parent_dir + "/data/p1ch4/"
CSV_PATH = DATA_PATH + "tabular-wine/winequality-white.csv"

In [5]:
# Parse the semicolon-delimited CSV into a float32 array; the first row
# (the column names) is skipped.
wineq_numpy = np.loadtxt(
    CSV_PATH,
    delimiter=';',
    skiprows=1,
    dtype=np.float32,
)

wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [6]:
# Read only the first row of the CSV to obtain the column names.
# The context manager closes the file handle once the header has been read
# (a bare open(...) passed straight to csv.reader is never closed), and
# newline='' lets the csv module handle line endings itself.
with open(CSV_PATH, newline='') as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=';'))

wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [7]:
# Turn the loaded NumPy array into a torch tensor for the steps below.
wineq = torch.from_numpy(wineq_numpy)

wineq

tensor([[ 7.00,  0.27,  ...,  8.80,  6.00],
        [ 6.30,  0.30,  ...,  9.50,  6.00],
        ...,
        [ 5.50,  0.29,  ..., 12.80,  7.00],
        [ 6.00,  0.21,  ..., 11.80,  6.00]])

In [10]:
# Keep every row and all columns except the last one; the last column
# ('quality' in col_list) is split off separately as the target.
data = wineq[:, :-1]
data, data.shape

(tensor([[ 7.00,  0.27,  ...,  0.45,  8.80],
         [ 6.30,  0.30,  ...,  0.49,  9.50],
         ...,
         [ 5.50,  0.29,  ...,  0.38, 12.80],
         [ 6.00,  0.21,  ...,  0.32, 11.80]]),
 torch.Size([4898, 11]))

In [11]:
# The last column holds the quality score; store it as 64-bit integers.
quality_column = wineq[:, -1]
target = quality_column.to(torch.long)  # quality

target, target.shape

(tensor([6, 6,  ..., 7, 6]), torch.Size([4898]))

# When to Categorize
<img src="https://github.com/hoangminhtoan/2020_Projects/blob/master/Python_DL/PytorchNLP/data/images/how_to_treat_data.png?raw=true" alt="How to treat data">


In [12]:
# One-hot encode the quality scores: start from an all-zero matrix with one
# row per sample and 10 columns, then write 1.0 into the column selected by
# each sample's score. scatter_ fills the tensor in place and returns it.
num_samples = target.shape[0]
score_index = target.unsqueeze(1)  # one index per row, shaped as a column
target_onehot = torch.zeros(num_samples, 10).scatter_(1, score_index, 1.0)

target_onehot, target_onehot.shape

(tensor([[0., 0.,  ..., 0., 0.],
         [0., 0.,  ..., 0., 0.],
         ...,
         [0., 0.,  ..., 0., 0.],
         [0., 0.,  ..., 0., 0.]]),
 torch.Size([4898, 10]))

In [17]:
# Mean of every column, reducing over the sample dimension (dim 0).
data_mean = data.mean(dim=0)

data_mean, data_mean.shape

(tensor([6.85e+00, 2.78e-01, 3.34e-01, 6.39e+00, 4.58e-02, 3.53e+01,
         1.38e+02, 9.94e-01, 3.19e+00, 4.90e-01, 1.05e+01]),
 torch.Size([11]))

In [18]:
# Variance of every column, reducing over the sample dimension (dim 0).
data_var = data.var(dim=0)

data_var, data_var.shape

(tensor([7.12e-01, 1.02e-02, 1.46e-02, 2.57e+01, 4.77e-04, 2.89e+02,
         1.81e+03, 8.95e-06, 2.28e-02, 1.30e-02, 1.51e+00]),
 torch.Size([11]))

In [19]:
# Standardize each column: subtract its mean and divide by its standard
# deviation (the square root of the column variance).
data_std = data_var.sqrt()
data_centered = data - data_mean
data_normalized = data_centered / data_std

data_normalized

tensor([[ 1.72e-01, -8.18e-02,  ..., -3.49e-01, -1.39e+00],
        [-6.57e-01,  2.16e-01,  ...,  1.35e-03, -8.24e-01],
        ...,
        [-1.61e+00,  1.17e-01,  ..., -9.63e-01,  1.86e+00],
        [-1.01e+00, -6.77e-01,  ..., -1.49e+00,  1.04e+00]])

In [22]:
# Boolean mask marking the samples whose quality score is at most 3
# (the quality threshold used for "bad" wines).
bad_indices = torch.le(target, 3)
bad_indices, bad_indices.shape, bad_indices.sum()

(tensor([False, False,  ..., False, False]), torch.Size([4898]), tensor(20))

`bad_indices.sum()` returns `tensor(20)`, which means that only 20 of the 4898 entries in `bad_indices` are set to `True`.

In [23]:
# Index with the boolean mask to keep only the rows flagged in bad_indices.
bad_data = data[bad_indices]
bad_data, bad_data.shape

(tensor([[8.50e+00, 2.60e-01, 2.10e-01, 1.62e+01, 7.40e-02, 4.10e+01,
          1.97e+02, 9.98e-01, 3.02e+00, 5.00e-01, 9.80e+00],
         [5.80e+00, 2.40e-01, 4.40e-01, 3.50e+00, 2.90e-02, 5.00e+00,
          1.09e+02, 9.91e-01, 3.53e+00, 4.30e-01, 1.17e+01],
         [9.10e+00, 5.90e-01, 3.80e-01, 1.60e+00, 6.60e-02, 3.40e+01,
          1.82e+02, 9.97e-01, 3.23e+00, 3.80e-01, 8.50e+00],
         [7.10e+00, 3.20e-01, 3.20e-01, 1.10e+01, 3.80e-02, 1.60e+01,
          6.60e+01, 9.94e-01, 3.24e+00, 4.00e-01, 1.15e+01],
         [6.90e+00, 3.90e-01, 4.00e-01, 4.60e+00, 2.20e-02, 5.00e+00,
          1.90e+01, 9.92e-01, 3.31e+00, 3.70e-01, 1.26e+01],
         [1.03e+01, 1.70e-01, 4.70e-01, 1.40e+00, 3.70e-02, 5.00e+00,
          3.30e+01, 9.94e-01, 2.89e+00, 2.80e-01, 9.60e+00],
         [7.90e+00, 6.40e-01, 4.60e-01, 1.06e+01, 2.44e-01, 3.30e+01,
          2.27e+02, 9.98e-01, 2.87e+00, 7.40e-01, 9.10e+00],
         [8.30e+00, 3.30e-01, 4.20e-01, 1.15e+00, 3.30e-02, 1.80e+01,
          9.6

In [25]:
# Separate the samples into three quality bands:
# bad (score <= 3), mid (3 < score < 7) and good (score >= 7).
is_bad = target <= 3
is_mid = (target > 3) & (target < 7)
is_good = target >= 7

bad_data, mid_data, good_data = data[is_bad], data[is_mid], data[is_good]

# Per-column mean within each band.
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

# One row per column: index, column name, then the bad / mid / good means.
row_format = "{:3} {:20} {:6.2f} {:6.2f} {:6.2f}"
rows = zip(col_list, bad_mean, mid_mean, good_mean)
for i, (name, bad, mid, good) in enumerate(rows):
    print(row_format.format(i, name, bad, mid, good))

0 fixed acidity          7.60   6.89   6.73
  1 volatile acidity       0.33   0.28   0.27
  2 citric acid            0.34   0.34   0.33
  3 residual sugar         6.39   6.71   5.26
  4 chlorides              0.05   0.05   0.04
  5 free sulfur dioxide   53.33  35.42  34.55
  6 total sulfur dioxide 170.60 141.83 125.25
  7 density                0.99   0.99   0.99
  8 pH                     3.19   3.18   3.22
  9 sulphates              0.47   0.49   0.50
 10 alcohol               10.34  10.26  11.42


* The bad wines appear to have higher total sulfur dioxide on average (170.60) than the mid (141.83) and good (125.25) wines.

In [26]:
# Threshold on total sulfur dioxide; 141.83 is the mid-quality mean printed
# for that column in the per-band summary.
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]  # column 6 is 'total sulfur dioxide' in col_list

# Flag the samples whose value is strictly below the threshold.
predicted_indices = total_sulfur_data < total_sulfur_threshold

predicted_indices.shape, predicted_indices.sum()

(torch.Size([4898]), tensor(2727))

In [27]:
# Compare the threshold-based predictions with the samples whose quality
# score is above 5.
actual_indices = target > 5

n_matches = (actual_indices & predicted_indices).sum().item()
n_predicted = predicted_indices.sum().item()
n_actual = actual_indices.sum().item()

# (number of matches, share of predictions that match, share of actuals matched)
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)