## Logistic Regression from scratch

This tutorial is intended to teach the basics of the PyTorch library, especially its data loaders, by coding every piece of a logistic (softmax) regression model by hand.

### Iris dataset as a CSV file to iterate over

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder
# Create iris.csv once; later runs reuse the file that is already on disk.
if os.path.isfile('iris.csv'):
    print('iris.csv exists')
else:
    iris = datasets.load_iris()
    # Features stay as they are; targets become a column vector so the
    # encoder treats them as a single categorical feature.
    X, y = iris.data, iris.target.reshape(-1, 1)

    # One-hot encode the class ids and densify the sparse result.
    enc = OneHotEncoder(handle_unknown='ignore')
    y = enc.fit_transform(y).toarray()

    # Final table layout: feature columns first, one-hot label columns last.
    Xy = np.concatenate((X, y), axis=1)
    headers = [
        'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
        'Iris-Setosa', 'Iris-Versicolor', 'Iris-Virginica',
    ]
    df = pd.DataFrame(Xy, columns=headers)
    df.to_csv('iris.csv', index=False, header=headers)

### PyTorch `Dataset` subclass to be fed into a PyTorch `DataLoader`

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

def get_iris_labels(labels):
    """Translate numeric class ids into iris species names.

    Args:
        labels: Iterable of values convertible with ``int()``; each one is
            used as an index into the species-name table.

    Returns:
        A list with one species name per entry of ``labels``.
    """
    species = ('setosa', 'versicolour', 'virginica')
    names = []
    for label in labels:
        names.append(species[int(label)])
    return names

class MyIrisDataset(Dataset):
    """Map-style dataset backed by a CSV file loaded fully into memory.

    Every column is cast to ``float32``. The last three columns of each row
    are returned as the label vector and all preceding columns as features.

    Args:
        csv_file_path: Path of the CSV file to read (defaults to 'iris.csv').
    """

    def __init__(self, csv_file_path='iris.csv'):
        super().__init__()
        self.data = pd.read_csv(csv_file_path).astype('float32')

    def __len__(self):
        """Return the number of rows in the loaded table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample(s) at ``idx`` as a dict of NumPy arrays.

        Args:
            idx: Integer position, or a tensor / list / slice of positions.

        Returns:
            ``{'X': features, 'y_hat': labels}``. Note that, despite its
            name, ``'y_hat'`` holds the label columns read from the file.
            For a single position the arrays are 1-D; for several positions
            they are 2-D with one row per requested sample.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        datapoint = self.data.iloc[idx].to_numpy()
        # Slice along the LAST axis: with several positions `datapoint` is
        # 2-D, and a plain `[:-3]` would drop rows instead of label columns.
        return {'X': datapoint[..., :-3], 'y_hat': datapoint[..., -3:]}
    
# Dataset instance over 'iris.csv'; the file must already exist at this point.
d = MyIrisDataset('iris.csv')

### PyTorch DataLoader to iterate over the PyTorch Dataset

In [None]:
batch_size = 8

# 80/20 train/test split. The test size is the remainder rather than a second
# int() truncation, so the two lengths always sum to len(d) as random_split
# requires, whatever the dataset size.
n_train = int(0.8 * len(d))
n_test = len(d) - n_train
train, test = random_split(d, [n_train, n_test])

# Reshuffle the training samples every epoch so mini-batch SGD does not see
# the batches in the same order each time.
train_loader = DataLoader(train, batch_size=batch_size,
                          shuffle=True, num_workers=4)

# Evaluation order does not matter, so the test loader stays unshuffled.
test_loader = DataLoader(test, batch_size=batch_size,
                         shuffle=False, num_workers=4)

# Sanity check: print the first test batch (features, then one-hot labels).
for i_batch, Xy in enumerate(test_loader):
    print(Xy['X'])
    print(Xy['y_hat'])
    break


In [None]:
# Model dimensions: input features per sample and number of output classes.
num_inputs, num_outputs = 4, 3

# Trainable parameters: weights drawn from N(0, 0.01), biases starting at 0.
# Both track gradients so that backward() fills in `.grad`.
W = torch.normal(
    mean=0.0,
    std=0.01,
    size=(num_inputs, num_outputs),
    requires_grad=True,
)
b = torch.zeros(num_outputs, requires_grad=True)

In [None]:
def accuracy(y_hat, preds):
    """Return the fraction of rows whose arg-max class agrees.

    Args:
        y_hat: 2-D tensor; the arg-max of each row is taken as its class.
        preds: 2-D tensor with the same number of rows as ``y_hat``.

    Returns:
        A scalar float tensor: the mean of the row-wise class matches.
    """
    classes_a = torch.argmax(y_hat, dim=1)
    classes_b = torch.argmax(preds, dim=1)
    matches = classes_a == classes_b
    return matches.float().mean()

def cross_entropy(y_hat, y, eps=1e-12):
    """Return the mean cross-entropy ``-sum(y_hat * log(y))`` over rows.

    Args:
        y_hat: 2-D tensor of target weights per class (for example one-hot
            labels); multiplied element-wise with the log-probabilities.
        y: 2-D tensor of predicted probabilities, same shape as ``y_hat``.
        eps: Lower bound applied to ``y`` before the logarithm.

    Returns:
        A scalar tensor: the per-row loss averaged over all rows.
    """
    # Clamp first: a probability of exactly 0 gives log(0) = -inf, and
    # 0 * -inf is NaN, which would poison both the loss and its gradients.
    log_probs = torch.log(torch.clamp(y, min=eps))
    return -torch.sum(y_hat * log_probs, 1).mean()
    
def softmax(X):
    """Apply a row-wise softmax to a 2-D tensor.

    Args:
        X: 2-D tensor of scores; each row is normalised independently.

    Returns:
        A tensor of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing to inf (inf / inf would yield NaN).
    shifted = X - torch.max(X, dim=1, keepdim=True).values
    X_exp = torch.exp(shifted)
    X_exp_sum = torch.sum(X_exp, 1).view(-1, 1)
    return X_exp / X_exp_sum

def model(X):
    """Linear part of the logistic (softmax) regression model.

    Computes ``X @ W + b`` with the module-level parameters ``W`` and ``b``.
    The result is not normalised; no softmax is applied here.
    """
    weighted = X @ W
    return weighted + b

In [None]:
# Hyper-parameters of the hand-written mini-batch SGD loop.
learning_rate = 1e-2
num_epochs = 200
log_every = 15

for epoch in range(1, num_epochs + 1):
    # Running totals so the logged metrics describe the whole epoch rather
    # than only its last mini-batch.
    loss_sum = 0.0
    acc_sum = 0.0
    n_seen = 0
    for i_batch, Xy in enumerate(train_loader):
        X = Xy['X']
        y_hat = Xy['y_hat']  # labels from the dataset, despite the name

        preds = softmax(model(X))
        loss = cross_entropy(y_hat, preds)
        loss.backward()
        with torch.no_grad():
            # Plain SGD step; the in-place update must not be tracked by
            # autograd, hence the no_grad() context.
            W -= W.grad * learning_rate
            b -= b.grad * learning_rate
            # Gradients accumulate across backward() calls, so reset them.
            W.grad.zero_()
            b.grad.zero_()

            # Weight each batch by its size; the last one may be smaller.
            n_batch = len(X)
            loss_sum += loss.item() * n_batch
            acc_sum += accuracy(y_hat, preds).item() * n_batch
            n_seen += n_batch
    if epoch % log_every == 0 and n_seen:
        train_l = loss_sum / n_seen
        acc = acc_sum / n_seen
        print('epoch %d, acc %f, loss %f' % (epoch, acc, train_l))

In [None]:
# Evaluate the trained parameters on the held-out test split.
loss_sum = 0.0
acc_sum = 0.0
n_seen = 0
with torch.no_grad():
    for i_batch, Xy in enumerate(test_loader):
        # Read the inputs from the CURRENT test batch; reusing variables left
        # over from training would score the last training batch instead.
        X = Xy['X']
        y_hat = Xy['y_hat']  # labels from the dataset, despite the name
        preds = softmax(model(X))

        # Weight each batch by its size; the last one may be smaller.
        n_batch = len(X)
        loss_sum += cross_entropy(y_hat, preds).item() * n_batch
        acc_sum += accuracy(y_hat, preds).item() * n_batch
        n_seen += n_batch

if n_seen:
    acc = acc_sum / n_seen
    loss = loss_sum / n_seen
    print('Test acc %f, loss %f' % (acc, loss))
else:
    print('Test set is empty; nothing to evaluate')