In [1]:
# joepareti54@gmail.com  May 6-13 2022
#
# classification of wine types 
# using a dataset built from wine.csv 
#
# scaling and train-test split are implemented using sklearn
# some issues with the datatypes required by pytorch were resolved here:
# https://stackoverflow.com/questions/60440292/runtimeerror-expected-scalar-type-long-but-found-float
#
# therefore the assignment in the training loop:
# LAB = LAB.type(torch.LongTensor)
#
# moreover, the following data conversions are required on the transformed data:
#
#X_TRAIN = X_TRAIN.astype(dtype=np.float32)
#y_train = y_train.astype(dtype=np.float32)
#
#The last layer of the nn is LogSoftmax
#This returns log-probabilities (not raw logits); to convert them into probabilities you need the exp function
#
#the last cell compares the top class of the prediction on test data vs. label

In [2]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#
import pandas as pd

In [3]:
# Load wine.csv through pandas and convert it to a NumPy array.
# header=None makes pandas treat every row as data, so the values come back
# as strings (see the printed sample); row 0 is then dropped
# (presumably the column-name row -- verify against the CSV).
data = np.asarray(pd.read_csv('./wine.csv', header=None))
X = data[1:]
print(X[:2])

[['1' '14.23' '1.71' '2.43' '15.6' '127' '2.8' '3.06' '0.28' '2.29'
  '5.64' '1.04' '3.92' '1065']
 ['1' '13.2' '1.78' '2.14' '11.2' '100' '2.65' '2.76' '0.26' '1.28'
  '4.38' '1.05' '3.4' '1050']]


In [4]:
# Split into train and test arrays.
# Column 0 of X is used as the class label; the remaining columns are the features.
XX = X[:, 1:]
y = X[:, 0]
# One seeded 80/20 split. The original ran the same split twice and bound the
# second call's training features to a mistyped, unused name (`_train`); with
# identical inputs and random_state both calls return the same arrays, so a
# single call is sufficient.
X_train, X_test, y_train, y_test = train_test_split(XX, y, test_size=0.2, random_state=42)

In [5]:
# Feature scaling: the scaler's statistics are learned from the training
# split only and then re-used, unchanged, on the test split.
scaler = StandardScaler()
X_TRAIN = scaler.fit_transform(X_train)
X_TEST = scaler.transform(X_test)

In [6]:
# Cast all four arrays (features and labels) to float32 NumPy arrays.
# The labels are converted to integer (long) tensors later, inside the
# training loop.
X_TEST, y_test, X_TRAIN, y_train = (
    arr.astype(dtype=np.float32)
    for arr in (X_TEST, y_test, X_TRAIN, y_train)
)


In [7]:
# Tensor-conversion pipeline. It serves as the default value of the
# `transform` argument of the Wine dataset class, which accepts it but does
# not apply it to the samples.
transform = transforms.Compose([transforms.ToTensor()])


In [8]:
class Wine(Dataset):
    """Map-style dataset exposing either the train or the test split.

    Parameters
    ----------
    file_flag : the string 'TEST' selects the test arrays; any other value
        selects the training arrays.
    X_TRAIN, X_TEST : feature arrays; must expose ``.shape[0]`` and indexing.
    y_train, y_test : label arrays, indexed in parallel with the features.
    transform : accepted for interface compatibility; it is not stored and
        not applied to the samples.
    """

    def __init__(self, file_flag,X_TRAIN, X_TEST, y_train,y_test, transform=transform):
        wants_test = file_flag == 'TEST'
        features, labels = (X_TEST, y_test) if wants_test else (X_TRAIN, y_train)
        self.x = features
        self.y = labels
        # Sample count is taken from the leading dimension of the features.
        self.num_samples = features.shape[0]

    def __len__(self):
        """Return the number of samples in the selected split."""
        return self.num_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]
#        

In [9]:
# Sanity check of the dataset class: build one dataset per split and report
# how many samples each one holds.
WineDS_TEST = Wine(
    file_flag='TEST',
    X_TRAIN=X_TRAIN, X_TEST=X_TEST,
    y_train=y_train, y_test=y_test,
)
WineDS_TRAIN = Wine(
    file_flag='TRAIN',
    X_TRAIN=X_TRAIN, X_TEST=X_TEST,
    y_train=y_train, y_test=y_test,
)
print('size of train ds ', len(WineDS_TRAIN))
print('size of test  ds ', len(WineDS_TEST))

size of train ds  142
size of test  ds  36


In [10]:
# Wrap the datasets in DataLoaders for mini-batch processing (batches of 4).
# Only the training loader shuffles; the test loader keeps the stored order.
train_dataloader = DataLoader(dataset=WineDS_TRAIN, batch_size=4, shuffle=True)
test_dataloader = DataLoader(dataset=WineDS_TEST, batch_size=4, shuffle=False)

# Pull a single training mini-batch and show its features and labels.
train_features, train_labels = next(iter(train_dataloader))
print(train_features, '----', train_labels, sep='\n')

tensor([[-0.3415,  1.0321, -0.0030,  0.5887,  0.3806, -0.9270, -0.8016, -1.5493,
         -1.3206, -0.0248, -0.7561, -1.8105, -0.4478],
        [ 1.7142, -0.4417,  0.0688, -2.1708,  0.1066,  1.5908,  1.6369, -0.6105,
          2.3246,  1.0515,  1.0443,  0.5659,  2.6957],
        [-0.3782, -1.2225, -0.4343, -0.4279, -0.0989, -0.1402, -0.0621, -0.5322,
         -0.2545, -1.0495,  1.1729,  0.7882, -0.9457],
        [ 1.0535,  1.5321,  0.0688,  0.0078, -0.7839, -0.7696, -1.1714,  0.8760,
         -0.0826,  1.7016, -1.6563, -1.3241, -0.8461]])
----
tensor([3., 1., 2., 3.])


In [11]:

# 
class ThreeClassClassification(nn.Module):
    """Small fully connected classifier: Linear -> sigmoid -> Linear -> log-softmax.

    Parameters
    ----------
    n_features : number of input features per sample (default 13).
    n_hidden : width of the hidden layer (default 6).
    n_outputs : number of output units (default 4). The notebook feeds the
        labels 1, 2 and 3 directly to NLLLoss as class indices, so four
        output units are needed even though only three classes occur; index 0
        is never a target.

    The defaults reproduce the original hard-coded 13 -> 6 -> 4 architecture.
    """

    def __init__(self, n_features=13, n_hidden=6, n_outputs=4):
        super().__init__()
        self.fc1 = nn.Linear(n_features, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_outputs)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, n_outputs)``.

        The input is flattened to ``(batch, -1)`` before the first layer.
        Apply ``torch.exp`` to the result to obtain probabilities.
        """
        x = x.view(x.shape[0], -1)
        # torch.sigmoid is used instead of the legacy F.sigmoid alias.
        x = torch.sigmoid(self.fc1(x))
        return F.log_softmax(self.fc2(x), dim=1)
        

In [12]:
# Training configuration.
epochs = 100
# NLLLoss pairs with the log-softmax output of the network.
loss = nn.NLLLoss()
model = ThreeClassClassification()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)


In [13]:
# Training loop: one pass over the training DataLoader per epoch.
model.train()  # make the training mode explicit
for i in range(epochs):
    running_loss = 0
    for DATA, LAB in train_dataloader:
        optimizer.zero_grad()
        # NLLLoss needs integer (long) class indices, but the DataLoader
        # yields the labels as floats.
        LAB = LAB.type(torch.LongTensor)
        log_ps = model(DATA)
        LOSS = loss(log_ps, LAB)

        LOSS.backward()
        optimizer.step()
        running_loss += LOSS.item()
    # Report the summed mini-batch loss every 50th epoch. The original used a
    # `for ... else` clause here; as the inner loop never breaks, the clause
    # always ran, so plain sequential code is equivalent and clearer.
    if i % 50 == 0:
        print(running_loss)
    

46.03226637840271




5.674673497676849


In [14]:
#test the model
# Switch to evaluation mode and disable gradient tracking: inference does not
# need an autograd graph.
model.eval()
with torch.no_grad():
    for DATA, LAB in test_dataloader:
        log_ps = model(DATA)
        # The network outputs log-probabilities; exp() turns them into probabilities.
        ps = torch.exp(log_ps)
        # Keep the single most probable class for each sample.
        top_p, top_class = ps.topk(1, dim=1)
        # Print predicted classes next to the true labels for visual comparison.
        print(top_class.view(1,-1), LAB)


tensor([[1, 1, 3, 1]]) tensor([1., 1., 3., 1.])
tensor([[2, 1, 2, 3]]) tensor([2., 1., 2., 3.])
tensor([[2, 3, 1, 3]]) tensor([2., 3., 1., 3.])
tensor([[1, 2, 1, 2]]) tensor([1., 2., 1., 2.])
tensor([[2, 2, 1, 2]]) tensor([2., 2., 1., 2.])
tensor([[1, 2, 2, 3]]) tensor([1., 2., 2., 3.])
tensor([[3, 3, 2, 2]]) tensor([3., 3., 2., 2.])
tensor([[2, 1, 1, 2]]) tensor([2., 1., 1., 2.])
tensor([[3, 1, 1, 1]]) tensor([3., 1., 1., 1.])


In [15]:
#conclusion:
# in the run shown above, all predictions on the test set match their labels
# (no random seed is set, so a re-run may give different results)