# Dataset Processing with PyTorch
---

### Perform standard imports

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Loading data

In [2]:
# Read the iris measurements from disk into a DataFrame, then preview the first rows
csv_path = '../Data/iris.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


### Plot the data

In [3]:

# NOTE(review): this plotting cell is disabled, so the section currently shows no plot.
# If it is re-enabled as written, the first line adds a string column 'target_label'
# to df in place; the later cells that build tensors from
# df.drop('target', axis=1).values would then pick up that non-numeric column.
# Consider plotting from a copy instead, e.g. sns.pairplot(df.assign(target_label=...), ...).
# df['target_label'] = df['target'].map({0: 'Setosa', 1: 'Virginica', 2: 'Versicolor'})

# # Pairplot with modified hue
# sns.pairplot(
#     df,
#     vars=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'],
#     hue='target_label',  # Use the new label column for hue
#     palette={'Setosa': "blue", 'Virginica': "red", 'Versicolor': "green"}
# )


#### Note:
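The `target` column takes three distinct values, which this notebook labels as:

- 0-Iris setosa 
- 1-Iris virginica 
- 2-Iris versicolor

(Caveat: scikit-learn's `load_iris` orders the classes setosa, versicolor, virginica, so confirm the mapping of 1 and 2 against the source of `iris.csv`.)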

### Classic Method for Train/Test split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Separate the feature matrix from the label vector, then hold out 20% for testing.
# random_state is fixed so the split is the same on every run.
features = df.drop('target', axis=1).values
targets = df['target'].values
train_X, test_X, train_y, test_y = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

In [6]:
# Convert the NumPy splits to tensors: float32 features and int32 labels.
# Labels are reshaped into column vectors of shape (n_samples, 1).
X_train = torch.tensor(train_X, dtype=torch.float)
X_test = torch.tensor(test_X, dtype=torch.float)
y_train = torch.tensor(train_y, dtype=torch.int).reshape(-1, 1)
# Fixed: this previously wrapped train_y, so the "test" labels were a copy of
# the training labels and did not line up with the rows of X_test.
y_test = torch.tensor(test_y, dtype=torch.int).reshape(-1, 1)

# Guard against features and labels drifting out of sync.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

<div class="alert alert-info"><strong>NOTE: </strong>
<br>
It's up to us to remember which columns correspond to which features.</div>

In [7]:
# Training features: one row per training sample, one column per feature
X_train.shape

torch.Size([120, 4])

In [8]:
# Training labels were reshaped with reshape(-1, 1), hence the trailing dimension of 1
y_train.shape

torch.Size([120, 1])

<div class="alert alert-success"><strong>NOTE: </strong>
<br>
When the classes are imbalanced, <strong>oversampling a less common class</strong> so that it matches the more common classes is usually preferred over discarding samples from the larger classes.</div>

## PyTorch Dataset and DataLoader classes
Compared with splitting and converting the arrays by hand as above, PyTorch's own data utilities are a far better alternative.
<br>
First, the `TensorDataset` class is used to construct the dataset. Then, a `DataLoader` wraps the dataset and provides a powerful sampler with single- or multi-process iterators over it.

In [9]:
from torch.utils.data import TensorDataset, DataLoader

In [10]:
# Build one tensor holding every feature row and one holding every label,
# then pair them up sample-by-sample in a TensorDataset.
feature_values = df.drop(columns='target').to_numpy()
label_values = df['target'].to_numpy()

data = torch.tensor(feature_values, dtype=torch.float)
labels = torch.tensor(label_values, dtype=torch.int)
iris = TensorDataset(data, labels)

In [11]:
# The dataset's repr shows only the object type and address, not its contents
iris

<torch.utils.data.dataset.TensorDataset at 0x7fa374531ac0>

In [12]:
# Indexing returns a (features, label) tuple for the sample at that position
iris[1]

(tensor([4.9000, 3.0000, 1.4000, 0.2000]), tensor(0, dtype=torch.int32))

In [13]:
# Number of samples held by the dataset
len(iris)

150

In [21]:
# iris_loader -> for each batch -> tensor_of_data + tensor_of_labels
# A dedicated, seeded generator drives the shuffle so that a fresh
# Restart & Run All produces the same sequence of batches every time.
shuffle_rng = torch.Generator().manual_seed(42)
iris_loader = DataLoader(dataset=iris, batch_size=50, shuffle=True, generator=shuffle_rng)
iris_loader

<torch.utils.data.dataloader.DataLoader at 0x7fa37476ba70>

In [22]:
# Pull a single batch from the loader: a list holding the data tensor and the label tensor
next(iter(iris_loader))

[tensor([[6.7000, 3.0000, 5.2000, 2.3000],
         [4.8000, 3.0000, 1.4000, 0.3000],
         [5.6000, 3.0000, 4.5000, 1.5000],
         [5.7000, 4.4000, 1.5000, 0.4000],
         [5.2000, 3.4000, 1.4000, 0.2000],
         [5.4000, 3.7000, 1.5000, 0.2000],
         [5.5000, 4.2000, 1.4000, 0.2000],
         [6.4000, 2.7000, 5.3000, 1.9000],
         [7.0000, 3.2000, 4.7000, 1.4000],
         [6.2000, 3.4000, 5.4000, 2.3000],
         [5.9000, 3.0000, 4.2000, 1.5000],
         [5.1000, 3.8000, 1.6000, 0.2000],
         [5.2000, 2.7000, 3.9000, 1.4000],
         [6.3000, 3.4000, 5.6000, 2.4000],
         [5.7000, 3.8000, 1.7000, 0.3000],
         [5.1000, 3.5000, 1.4000, 0.3000],
         [6.4000, 3.1000, 5.5000, 1.8000],
         [7.7000, 2.6000, 6.9000, 2.3000],
         [5.9000, 3.0000, 5.1000, 1.8000],
         [5.6000, 2.5000, 3.9000, 1.1000],
         [6.5000, 3.0000, 5.2000, 2.0000],
         [4.8000, 3.0000, 1.4000, 0.1000],
         [6.9000, 3.1000, 5.1000, 2.3000],
         [6

In [28]:
# Show the number of label occurrences in a batch.
# Only the first batch is needed, so fetch just that one rather than
# materialising every batch with list(...).
_, batch_labels = next(iter(iris_loader))
batch_labels.bincount()

tensor([19, 11, 20])