In [1]:
import pandas as pd
import collections
from collections import Counter
import string
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
df_all = pd.read_csv('frankenstein_with_splits.csv')
print("shape of the data: ", df_all.shape)
print('-'*60)
print(df_all.head())

shape of the data:  (90698, 3)
------------------------------------------------------------
                                  context        target  split
0                                , or the  frankenstein  train
1              frankenstein or the modern             ,  train
2    frankenstein , the modern prometheus            or  train
3  frankenstein , or modern prometheus by           the  train
4             , or the prometheus by mary        modern  train


# Define two relevant classes
### - Vocabulary ([see a walkthrough here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Frankenstein/class_Vocabulary.ipynb))
### - CBOWVectorizer ([see a walkthrough here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Frankenstein/class_Vectorizer.ipynb))

In [3]:
class Vocabulary(object):
    """Bidirectional mapping between tokens and integer indices.

    On construction the MASK token is registered first and, when
    ``add_unk`` is true, the UNK token is registered right after it.
    For a vocabulary created without a pre-existing mapping this gives
    ``mask_index == 0`` and ``unk_index == 1``.
    """
    def __init__(self, token_to_idx=None, 
                 mask_token="<MASK>", add_unk=True, 
                 unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices;
                it is copied, so the caller's dict is never modified
            mask_token (str): the MASK token to add into the Vocabulary; indicates
                a position that will not be used in updating the model's parameters
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        ### work on a private copy: add_token() writes into this dict, and
        ### the caller's mapping must not change behind their back
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self._add_unk   = add_unk
        self._unk_token = unk_token    
        self._mask_token = mask_token
        
        ### the mask_token, i.e, "<MASK>" is the first added token
        self.mask_index = self.add_token(self._mask_token)
        
        ### -999 is the sentinel meaning "no UNK token"; lookup_token() checks
        ### for a non-negative unk_index before falling back to it
        self.unk_index  = -999
        ### the unk_token, i.e, "<UNK>" is the second added token if add_unk=True
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Adding a token that is already present is a no-op that returns
        its existing index.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            ### new tokens get the next free position
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
   
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is unknown and no UNK token was added
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            ### .get(): return self.unk_index if the key "token" does not exist. 
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    
    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            ### format with {} rather than %d so that a non-integer index
            ### still produces the documented KeyError (not a TypeError)
            raise KeyError(f"the index ({index}) is not in the Vocabulary")
        return self._idx_to_token[index]
    
    def __len__(self):
        """Return the number of distinct tokens (including MASK/UNK)."""
        return len(self._token_to_idx)

    
class CBOWVectorizer(object):
    """Turns space-separated context strings into fixed-length index vectors
    using a wrapped vocabulary."""
    def __init__(self, cbow_vocab):
        """
        Args:
            cbow_vocab (Vocabulary): maps words to integers
        """
        self.cbow_vocab = cbow_vocab
         
    @classmethod
    def from_dataframe(cls, cbow_df):
        """Build a vectorizer whose vocabulary covers every token in a dataframe
        
        Every space-separated piece of each row's ``context`` and each
        row's ``target`` is registered, in row order.

        Args:
            cbow_df (pandas.DataFrame): the target dataset
        Returns:
            an instance of the CBOWVectorizer
        """
        vocab = Vocabulary()

        ### walk the rows once; context words first, then the target word
        for _, record in cbow_df.iterrows():
            for word in record.context.split(' '):
                vocab.add_token(word)
            vocab.add_token(record.target)
            
        return cls(vocab)

    ### Core entry point of the Vectorizer: string in, index vector out.
    def vectorize(self, context, vector_length=-1):
        """
        Args:
            context (str): the string of words separated by a space
            vector_length (int): an argument for forcing the length of index vector;
                a negative value means "use the number of tokens in context"
        Returns:
            numpy.ndarray (int64): token indices, right-padded with the
                vocabulary's mask index up to vector_length
        """
        vocab = self.cbow_vocab
        token_ids = [vocab.lookup_token(word) for word in context.split(' ')]
        n_tokens = len(token_ids)

        target_length = n_tokens if vector_length < 0 else vector_length

        ### leading slots hold the token indices, trailing slots the mask index
        padded = np.zeros(target_length, dtype=np.int64)
        padded[:n_tokens] = token_ids
        padded[n_tokens:] = vocab.mask_index

        return padded

# 1. CBOWDataset class
### - The Dataset class will characterize the key features of the dataset.
### - In the initialization function of the class, make the class inherit the properties of torch.utils.data.Dataset so that we can later leverage its functionalities.
### - In the \_\_init\_\_() function and the set_split() function, store important information such as labels and the features that we wish to generate at each pass.
### - Each call requests a sample index for which the upperbound is specified in the \_\_len\_\_() method.
### - When the sample corresponding to a given index is called, the generator executes the \_\_getitem\_\_() method to generate it.

In [4]:
class CBOWDataset(Dataset):
    """PyTorch Dataset over a CBOW dataframe with 'train'/'val'/'test' splits.

    The dataframe is expected to provide the columns ``context``,
    ``target`` and ``split``. One split is active at a time (see
    ``set_split``); ``len()`` and indexing refer to the active split.
    """
    def __init__(self,cbow_df,vectorizer):
        """
        Args:
            cbow_df (pandas.DataFrame): the dataset
            vectorizer (CBOWVectorizer): vectorizer instantiated from dataset
        """
        self.cbow_df     = cbow_df
        self._vectorizer = vectorizer
        
        ### CBOWVectorizer.vectorize() will be used with the parameter 
        ### vector_length = self._max_seq_length (the max number of tokens
        ### among all contexts, across every split), so that the vectors
        ### for different rows will have the same length.
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, cbow_df.context))
        
        self.train_df    = self.cbow_df[self.cbow_df.split=='train']
        self.train_size  = len(self.train_df)

        self.val_df      = self.cbow_df[self.cbow_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df     = self.cbow_df[self.cbow_df.split=='test']
        self.test_size   = len(self.test_df)
        
        ### split name -> (dataframe of that split, number of rows)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val'  : (self.val_df, self.validation_size),
                             'test' : (self.test_df, self.test_size)}
        self.set_split('train')
        
    @classmethod
    def load_csv_and_make_vectorizer(cls,cbow_csv):
        """Load dataset from a csv file and make a new vectorizer from scratch
        Args:
            cbow_csv (str): location of the dataset
        Returns:
            an instance of CBOWDataset
        """
        ### read the file, then share the dataframe-based construction path
        cbow_df = pd.read_csv(cbow_csv)
        return cls.load_df_and_make_vectorizer(cbow_df)
    
    @classmethod
    def load_df_and_make_vectorizer(cls,cbow_df):
        """Wrap an in-memory dataframe and make a new vectorizer from scratch
        Args:
            cbow_df (pandas.DataFrame): dataset
        Returns:
            an instance of CBOWDataset
        """
        ### make vectorizer using the training split only
        train_cbow_df  = cbow_df[cbow_df.split=='train']
        new_vectorizer = CBOWVectorizer.from_dataframe(train_cbow_df)
        return cls(cbow_df,new_vectorizer)
    
    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe 
        Args:
            split (str): one of "train", "val", or "test"
        Raises:
            KeyError: if split is not one of the three known names
        """
        self._target_split = split
        ### when split = 'train', _target_df means the training set
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        """Number of rows in the currently selected split."""
        ### _target_size is defined in set_split() 
        return self._target_size        
        
    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the positional index of the data point within
                the currently selected split
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        
        row = self._target_df.iloc[index]

        ### every context is padded to the same length so rows can be batched
        context_vector = \
            self._vectorizer.vectorize(row.context, self._max_seq_length)

        target_index   = self._vectorizer.cbow_vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer
    
    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of full batches in the
        currently selected split (a trailing partial batch is not counted)
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size  

# 2. Instantiate a CBOWDataset from the training data
### Two classmethods can be used to instantiate a CBOWDataset: load_csv_and_make_vectorizer() and load_df_and_make_vectorizer(). The difference is whether the input data comes from a csv file or from an in-memory pd.DataFrame.

### First draw a (static, fixed random seed) sample from the entire dataset

In [5]:
df_sample = df_all.sample(100,random_state=100)

In [6]:
df_sample.head()

Unnamed: 0,context,target,split
5877,", as mine been .",has,train
65522,", this sudden of life rushed",certainty,val
49249,to be is indeed to,friendless,train
42861,at once drew of sorrow and,tears,train
24701,i soon felt rain coming slowly,the,train


### Create a CBOWDataset.

In [7]:
dataset_sample = CBOWDataset.load_df_and_make_vectorizer(df_sample)

## 2.1 - Attributes of a CBOWDataset

### .cbow_df: the input dataframe

In [8]:
dataset_sample.cbow_df

Unnamed: 0,context,target,split
5877,", as mine been .",has,train
65522,", this sudden of life rushed",certainty,val
49249,to be is indeed to,friendless,train
42861,at once drew of sorrow and,tears,train
24701,i soon felt rain coming slowly,the,train
...,...,...,...
59428,"soothed me , i could thus",and,train
2764,"of the common of men ,",pathways,train
59364,but blight had come,a,train
56103,from taking the step in an,first,train


In [9]:
dataset_sample.cbow_df.equals(df_sample)

True

### ._max_seq_length: max number of tokens in a context

In [10]:
dataset_sample._max_seq_length

6

### ._vectorizer

In [11]:
### Note that the vectorizer is derived from the training split. 
v = dataset_sample._vectorizer

In [12]:
example_text = "the sun is shining and it is a beautiful day"
vector       = v.vectorize(example_text)

In [13]:
print(f'CBOW Vocabulary')
print('-'*100)
print("_idx_to_token: ", v.cbow_vocab._idx_to_token)
print('-'*100)
print(example_text)
print('vector representation:', vector)

CBOW Vocabulary
----------------------------------------------------------------------------------------------------
_idx_to_token:  {0: '<MASK>', 1: '<UNK>', 2: ',', 3: 'as', 4: 'mine', 5: 'been', 6: '.', 7: '', 8: 'has', 9: 'to', 10: 'be', 11: 'is', 12: 'indeed', 13: 'friendless', 14: 'at', 15: 'once', 16: 'drew', 17: 'of', 18: 'sorrow', 19: 'and', 20: 'tears', 21: 'i', 22: 'soon', 23: 'felt', 24: 'rain', 25: 'coming', 26: 'slowly', 27: 'the', 28: 'my', 29: 'aunt', 30: nan, 31: 'contrast', 32: 'perpetually', 33: 'presented', 34: 'eyes', 35: 'had', 36: 'are', 37: 'canvassed', 38: 'many', 39: 'lights', 40: 'thrown', 41: 'so', 42: 'on', 43: 'you', 44: 'only', 45: 'any', 46: 'claim', 47: 'prospect', 48: 'such', 49: 'shells', 50: 'beside', 51: 'unexplored', 52: 'ocean', 53: 'great', 54: 'said', 55: '!', 56: 'calm', 57: 'joyous', 58: 'faces', 59: 'back', 60: 'despair', 61: 'brought', 62: 'for', 63: 'figure', 64: 'saw', 65: 'shores', 66: 'como', 67: 'lake', 68: 'an', 69: 'insurmountable', 7

### ._target_df, _target_size
**Defined by method set_split()**

In [14]:
dataset_sample._target_df

Unnamed: 0,context,target,split
5877,", as mine been .",has,train
49249,to be is indeed to,friendless,train
42861,at once drew of sorrow and,tears,train
24701,i soon felt rain coming slowly,the,train
20535,my aunt .,,train
...,...,...,...
59428,"soothed me , i could thus",and,train
2764,"of the common of men ,",pathways,train
59364,but blight had come,a,train
56103,from taking the step in an,first,train


In [15]:
dataset_sample._target_size

67

### ._lookup_dict - will be used in the method set_split()

In [16]:
dataset_sample._lookup_dict

{'train': (                                context      target  split
  5877                  , as mine been .          has  train
  49249                to be is indeed to  friendless  train
  42861        at once drew of sorrow and       tears  train
  24701    i soon felt rain coming slowly         the  train
  20535                         my aunt .         NaN  train
  ...                                 ...         ...    ...
  59428         soothed me , i could thus         and  train
  2764             of the common of men ,    pathways  train
  59364               but blight had come           a  train
  56103        from taking the step in an       first  train
  14410  great proficiency in study and i        that  train
  
  [67 rows x 3 columns],
  67),
 'val': (                                  context      target split
  65522        , this sudden of life rushed   certainty   val
  65585           my course towards land .          the   val
  75275    head upon her and a 

In [17]:
### Each value of the dictionary is a tuple which contains a df and a scalar
dataset_sample._lookup_dict['train']

(                                context      target  split
 5877                  , as mine been .          has  train
 49249                to be is indeed to  friendless  train
 42861        at once drew of sorrow and       tears  train
 24701    i soon felt rain coming slowly         the  train
 20535                         my aunt .         NaN  train
 ...                                 ...         ...    ...
 59428         soothed me , i could thus         and  train
 2764             of the common of men ,    pathways  train
 59364               but blight had come           a  train
 56103        from taking the step in an       first  train
 14410  great proficiency in study and i        that  train
 
 [67 rows x 3 columns],
 67)

In [18]:
### the dataframe
dataset_sample._lookup_dict['train'][0]

Unnamed: 0,context,target,split
5877,", as mine been .",has,train
49249,to be is indeed to,friendless,train
42861,at once drew of sorrow and,tears,train
24701,i soon felt rain coming slowly,the,train
20535,my aunt .,,train
...,...,...,...
59428,"soothed me , i could thus",and,train
2764,"of the common of men ,",pathways,train
59364,but blight had come,a,train
56103,from taking the step in an,first,train


In [19]:
### the sample size
dataset_sample._lookup_dict['train'][1]

67

## 2.2 - Methods of a CBOWDataset

### \_\_len\_\_()

In [20]:
len(dataset_sample)

67

### \_\_getitem\_\_()

In [21]:
### The 4th element in the "train" split
### In the __init__ function, self.set_split('train') defines ._target_df
dataset_sample[3]

{'x_data': array([21, 22, 23, 24, 25, 26]), 'y_target': 27}

In [22]:
df_sample.head(5)

Unnamed: 0,context,target,split
5877,", as mine been .",has,train
65522,", this sudden of life rushed",certainty,val
49249,to be is indeed to,friendless,train
42861,at once drew of sorrow and,tears,train
24701,i soon felt rain coming slowly,the,train


In [23]:
for i in range(21,28):
    print(dataset_sample._vectorizer.cbow_vocab._idx_to_token[i])

i
soon
felt
rain
coming
slowly
the


### set_split()

In [24]:
dataset_sample = CBOWDataset.load_df_and_make_vectorizer(df_sample)

In [25]:
### Now the split for ._target_df and _target_size is 'train'
dataset_sample._target_df

Unnamed: 0,context,target,split
5877,", as mine been .",has,train
49249,to be is indeed to,friendless,train
42861,at once drew of sorrow and,tears,train
24701,i soon felt rain coming slowly,the,train
20535,my aunt .,,train
...,...,...,...
59428,"soothed me , i could thus",and,train
2764,"of the common of men ,",pathways,train
59364,but blight had come,a,train
56103,from taking the step in an,first,train


In [26]:
len(dataset_sample)

67

In [27]:
### The 4th element in the "train" split
dataset_sample[3]

{'x_data': array([21, 22, 23, 24, 25, 26]), 'y_target': 27}

In [28]:
### run set_split, switch the split to 'val'
dataset_sample.set_split('val')
# or 
# CBOWDataset.set_split(dataset_sample,'val')

In [29]:
### Now the split for ._target_df and _target_size is 'val'
dataset_sample._target_df

Unnamed: 0,context,target,split
65522,", this sudden of life rushed",certainty,val
65585,my course towards land .,the,val
75275,head upon her and a handkerchief,arm,val
74545,to me forever,.,val
69493,the season of assizes approached .,the,val
68934,my !,father,val
67606,"i remember , i thus awoke",when,val
72247,the daemon employ art to destroy,every,val
75157,where it is hated .,most,val
65437,some hours thus but by,passed,val


In [30]:
len(dataset_sample)

16

In [31]:
### The 4th element in the "val" split
dataset_sample[3]

{'x_data': array([  9,  72, 212,   7,   0,   0]), 'y_target': 6}

### get_vectorizer()

In [32]:
dataset_sample.get_vectorizer()

<__main__.CBOWVectorizer at 0x7ff8afd2f4f0>

In [33]:
### Equivalently
dataset_sample._vectorizer

<__main__.CBOWVectorizer at 0x7ff8afd2f4f0>

### get_num_batches()

In [34]:
dataset_sample = CBOWDataset.load_df_and_make_vectorizer(df_sample)
### Switch the split to 'train'
dataset_sample.set_split('train')

In [35]:
dataset_sample.get_num_batches(10)

6

In [36]:
len(dataset_sample._target_df)/10

6.7

In [37]:
len(dataset_sample._target_df)//10

6

In [38]:
### Switch the split to 'val'
dataset_sample.set_split('val')

In [39]:
dataset_sample.get_num_batches(10)

1

In [40]:
len(dataset_sample._target_df)/10

1.6

In [41]:
len(dataset_sample)/10

1.6

# 3. Define a batch generator
### - Wrap the DataLoader
### - Switch the data between the CPU and the GPU.

In [42]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device='cpu'):
    """
    Generator that wraps a PyTorch DataLoader and yields one dict per batch,
      with every tensor in the dict moved to the requested device.

    Args:
        dataset: the dataset handed to the DataLoader; each item is
            expected to be a dict so that batches are dicts of tensors
        batch_size (int): number of samples per batch
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
        device: target of the ``.to(device)`` call on each tensor
    Yields:
        dict with the same keys as the batch, values placed on ``device``
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {key: value.to(device) for key, value in batch.items()}

## 3.1 Dataset Class
### - The Dataset class characterizes the key features of the dataset you want to generate.
### - The class uses \_\_init\_\_(), \_\_len\_\_(), and \_\_getitem\_\_() to store important information, and generate samples. 
### - The Dataset class is an important argument of the DataLoader class.

In [43]:
data = {'x1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'x2': [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
        'y': [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0]}
data
df = pd.DataFrame(data)
print("data:" ,data)
print("-"*60)
print("df:",df)

data: {'x1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'x2': [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 'y': [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0]}
------------------------------------------------------------
df:     x1  x2  y
0    1  13  0
1    2  14  1
2    3  15  0
3    4  16  1
4    5  17  1
5    6  18  0
6    7  19  0
7    8  20  1
8    9  21  1
9   10  22  0
10  11  23  1
11  12  24  0


In [44]:
##### Define Dataset class
class CustomDataset(Dataset):
    """Minimal Dataset over a dataframe: every column except the last is a
    feature, the last column is the target."""

    def __init__(self, dataframe):
        """
        Args:
            dataframe (pandas.DataFrame): features followed by the target column
        """
        self.data = dataframe

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (features, target) pair of float32 tensors for one row.

        Args:
            index (int): positional row index
        """
        feature_values = self.data.iloc[index, :-1].values
        target_value = self.data.iloc[index, -1]
        features = torch.tensor(feature_values, dtype=torch.float32)
        label = torch.tensor(target_value, dtype=torch.float32)
        return features, label

##### Instantiate the Dataset class
custom_dataset = CustomDataset(df)

##### Instantiate the DataLoader class
batch_size  = 3
data_loader = DataLoader(dataset=custom_dataset, batch_size=batch_size, shuffle=False)

##### Obtain the batch
i = 0
for batch in data_loader:
    print('Batch '+str(i))
    i+=1
    print(batch)
    print('-' * 60)

Batch 0
[tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.]]), tensor([0., 1., 0.])]
------------------------------------------------------------
Batch 1
[tensor([[ 4., 16.],
        [ 5., 17.],
        [ 6., 18.]]), tensor([1., 1., 0.])]
------------------------------------------------------------
Batch 2
[tensor([[ 7., 19.],
        [ 8., 20.],
        [ 9., 21.]]), tensor([0., 1., 1.])]
------------------------------------------------------------
Batch 3
[tensor([[10., 22.],
        [11., 23.],
        [12., 24.]]), tensor([0., 1., 0.])]
------------------------------------------------------------


### An alternative is to use TensorDataset() directly

In [45]:
from torch.utils.data import TensorDataset

In [46]:
x1 = torch.from_numpy(df['x1'].values).float()
x2 = torch.from_numpy(df['x2'].values).float()
y  = torch.from_numpy(df['y'].values).float()
print("x1:", x1)
print("x2:", x2)
print("y:", y)

x1: tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.])
x2: tensor([13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.])
y: tensor([0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0.])


In [47]:
features = torch.stack([x1, x2], dim=1)
features

tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.],
        [ 4., 16.],
        [ 5., 17.],
        [ 6., 18.],
        [ 7., 19.],
        [ 8., 20.],
        [ 9., 21.],
        [10., 22.],
        [11., 23.],
        [12., 24.]])

In [48]:
##### Create Tensor dataset
dataset     = TensorDataset(features, y)
batch_size  = 3

##### Instantiate the DataLoader class
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

##### Obtain the batch
i = 0
for batch in data_loader:
    print('Batch '+str(i))
    i+=1
    print(batch)
    print('-' * 60)

Batch 0
[tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.]]), tensor([0., 1., 0.])]
------------------------------------------------------------
Batch 1
[tensor([[ 4., 16.],
        [ 5., 17.],
        [ 6., 18.]]), tensor([1., 1., 0.])]
------------------------------------------------------------
Batch 2
[tensor([[ 7., 19.],
        [ 8., 20.],
        [ 9., 21.]]), tensor([0., 1., 1.])]
------------------------------------------------------------
Batch 3
[tensor([[10., 22.],
        [11., 23.],
        [12., 24.]]), tensor([0., 1., 0.])]
------------------------------------------------------------


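### The two methods below produce the same values; note that the first yields a float tensor (because of .float()) while the second keeps the integer dtype of the dataframe columns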

In [49]:
x1 = torch.from_numpy(df['x1'].values).float()
x2 = torch.from_numpy(df['x2'].values).float()
torch.stack([x1, x2], dim=1)

tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.],
        [ 4., 16.],
        [ 5., 17.],
        [ 6., 18.],
        [ 7., 19.],
        [ 8., 20.],
        [ 9., 21.],
        [10., 22.],
        [11., 23.],
        [12., 24.]])

In [50]:
numpy_array = df[['x1', 'x2']].to_numpy()
torch.from_numpy(numpy_array)

tensor([[ 1, 13],
        [ 2, 14],
        [ 3, 15],
        [ 4, 16],
        [ 5, 17],
        [ 6, 18],
        [ 7, 19],
        [ 8, 20],
        [ 9, 21],
        [10, 22],
        [11, 23],
        [12, 24]])

## 3.2 DataLoader
### - batch_size: denotes the number of samples contained in each generated batch.
### - shuffle: if set to True, we will get a new order of exploration at each pass (or just keep a linear exploration scheme otherwise). Shuffling the order in which examples are fed to the classifier is helpful so that batches between epochs do not look alike. Doing so will eventually make our model more robust.
### - drop_last: set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. (default: False)

In [51]:
type(dataset_sample[0]['x_data'])

numpy.ndarray

In [52]:
dataset_sample = CBOWDataset.load_df_and_make_vectorizer(df_sample)
batch_size     = 10
shuffle        = True
drop_last      = True
dataloader     = DataLoader(dataset=dataset_sample, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

In [53]:
one_batch = next(iter(dataloader))
print('x in one batch')
print(one_batch['x_data'])
print('size of x_data:', one_batch['x_data'].shape)
print('-' * 60)
print('y in one batch')
print(one_batch['y_target'])
print('size of y_data:', one_batch['y_target'].shape)

x in one batch
tensor([[  2,  19, 110,   8, 111,   5],
        [ 17,  96,  97,   7,   0,   0],
        [139,  16, 179,   0,   0,   0],
        [162, 163, 164,   2, 165,   9],
        [125, 126, 127,  21,  74, 128],
        [ 21, 166, 167, 168, 169, 165],
        [  9, 184,  83,  17,  27, 185],
        [ 53, 240,  76, 241,  19,  21],
        [138, 139, 114, 140,   2,  21],
        [  2,   3,   4,   5,   6,   7]])
size of x_data: torch.Size([10, 6])
------------------------------------------------------------
y in one batch
tensor([112,   6,   3,  76,   2,  83, 186, 242, 141,   8])
size of y_data: torch.Size([10])


### In this example, dataloader utilizes the return from the \_\_getitem\_\_() method, which extracts related rows from the _target_df of dataset, with _target_size=67. Also, batch_size=10, and drop_last=True so there are 6 batches created (the remaining 7 rows are dropped)

In [54]:
print('number of rows in the target_df: ', len(dataset_sample._target_df))
print('number of rows in the target_df: ', dataset_sample._target_size)
print("The number of batches is:",dataset_sample.get_num_batches(batch_size = 10))

number of rows in the target_df:  67
number of rows in the target_df:  67
The number of batches is: 6


In [55]:
i = 0
for data_dict in dataloader:
    print('Batch '+str(i))
    i+=1
    print(data_dict)
    print(data_dict['x_data'].shape)
    print('-' * 60)

Batch 0
{'x_data': tensor([[ 28,  74,  12,  27,   0,   0],
        [ 21, 166, 167, 168, 169, 165],
        [ 49,  50,  27,  19,  51,  52],
        [194,  19, 195, 196,   2, 197],
        [ 53, 240,  76, 241,  19,  21],
        [ 36,  37,  19,  38,  39,  40],
        [165, 234,  35, 235,   0,   0],
        [156, 157,   6,   0,   0,   0],
        [138, 224, 225,   7,   0,   0],
        [203,   2, 118,  27, 204, 114]]), 'y_target': tensor([ 75,  83,  53,  76, 242,  41,  83,  30,   6, 205])}
torch.Size([10, 6])
------------------------------------------------------------
Batch 1
{'x_data': tensor([[ 64,  68,  69,  70,  71,  72],
        [ 42,  43,  44,  21,  45,  46],
        [  2,  19, 110,   8, 111,   5],
        [  2,  21,  74, 113, 114, 115],
        [142, 143,   6,   0,   0,   0],
        [ 74,   9, 153, 154, 155,  17],
        [ 17,  96,  97,   7,   0,   0],
        [  2,   3,   4,   5,   6,   7],
        [ 76,  27,  77,  78,  79,  80],
        [139,  16, 179,   0,   0,   0]]), 'y_ta

### This is equivalent to defining and using the generator function generate_batches().

In [56]:
i = 0
for data_dict in dataloader:
    print('Batch '+str(i))
    i+=1
    print(data_dict)
    print(data_dict['x_data'].shape)
    print('-' * 60)

Batch 0
{'x_data': tensor([[213, 214, 215, 165,  83, 216],
        [ 14,  15,  16,  17,  18,  19],
        [ 98,  99, 100, 101,   9, 102],
        [156, 157,   6,   0,   0,   0],
        [  2,  21,  74, 113, 114, 115],
        [ 21,   2, 107, 108, 109,  76],
        [  2,  62,  21,  27,  63,  17],
        [ 28,  74,  12,  27,   0,   0],
        [  2, 206,  27,   0,   0,   0],
        [ 21, 104, 105, 102,   9,  10]]), 'y_target': tensor([ 74,  20, 103,  30, 116,  14,  64,  75, 207, 106])}
torch.Size([10, 6])
------------------------------------------------------------
Batch 1
{'x_data': tensor([[  2,  19, 180, 104, 181, 182],
        [125, 126, 127,  21,  74, 128],
        [ 54,   2,  10,  55,   7,   0],
        [  2,  19, 110,   8, 111,   5],
        [176,  17,  27, 177,  19,  14],
        [139,  16, 179,   0,   0,   0],
        [162, 163, 164,   2, 165,   9],
        [ 21,  74, 226, 114, 139,   6],
        [ 74,   9, 153, 154, 155,  17],
        [ 28, 121,   6,   0,   0,   0]]), 'y_ta

## 3.3 Generator
### - Generator functions declare a function that behaves like an iterator, i.e. it can be used in a for loop.
### - A generator function is defined just like a normal function, but whenever it needs to generate a value, it does so with the yield keyword rather than return. 
### - Yield is used in Python generators. If the body of a def contains yield, the function automatically becomes a generator function. 
### - *return* sends a specified value back to its caller whereas *yield* can produce a sequence of values. We should use *yield* when we want to iterate over a sequence, but don’t want to store the entire sequence in memory.

### Consider a task to calculate the sum of the first n integers

In [57]:
##### The function below builds the full list in memory
def first_n(n):
    """Return a list holding 0, 1, ..., up to (but excluding) n.

    The whole list is materialized in memory before it is returned.
    """
    collected = []
    current = 0
    while current < n:
        collected.append(current)
        current += 1
    return collected
sum(first_n(100))

4950

In [58]:
##### The following implements generator as an iterable object.
class first_n(object):
    """Iterator object that produces 0, 1, ..., up to (but excluding) n.

    The instance is its own iterator, so it can be consumed only once.
    """

    def __init__(self, n):
        """
        Args:
            n: exclusive upper bound of the produced values
        """
        self.n = n
        self.num = 0

    def __iter__(self):
        """An iterator returns itself from __iter__."""
        return self

    # Python 3 looks for __next__; forward it to next()
    def __next__(self):
        return self.next()

    def next(self):
        """Return the current value and advance, or signal exhaustion."""
        if not (self.num < self.n):
            raise StopIteration
        current = self.num
        self.num = current + 1
        return current
        
a = first_n(10)
print('vars(a):', vars(a))
print('sum(a):', sum(a))

vars(a): {'n': 10, 'num': 0}
sum(a): 45


In [59]:
##### a generator that yields items instead of returning a list

def first_n(n):
    """Generator that yields 0, 1, ..., up to (but excluding) n, one value
    at a time instead of building a list."""
    counter = 0
    while True:
        if not (counter < n):
            return
        yield counter
        counter += 1

a = first_n(10)

print('next(a):', next(a))
print('sum(a):', sum(a))
##### In Python, built-in functions such as sum(a), max(a) and list(a) iterate through every element
##### of 'a' to compute their result. This means sum(a) consumes all remaining elements of the
##### iterator 'a' until the iteration is completed. Once a generator has produced all its values,
##### calling next() on it again raises a StopIteration exception, indicating that the generator
##### has been exhausted. Use next(generator, default) to get a default value back instead of
##### the exception.

print('next(a):', next(a,None))

next(a): 0
sum(a): 45
next(a): None


In [60]:
##### The generator 'a' is now exhausted, so the loop body never runs and nothing is printed
for i in a:
    print (i)

In [61]:
##### using a new generator
a = first_n(10)
for i in a:
    print (i)

0
1
2
3
4
5
6
7
8
9


In [62]:
##### A bare next(a) would raise a StopIteration exception here, since max() has
##### already consumed all items; passing a default returns that default instead
a = first_n(10)
print(max(a))
next(a,'StopIteration')

9


'StopIteration'

In [63]:
##### A bare next(a) would raise a StopIteration exception here, since list() has
##### already consumed all items; passing a default returns that default instead
a = first_n(10)
print(list(a))
next(a,'StopIteration')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


'StopIteration'

In [64]:
##### A bare next(a) would raise a StopIteration exception here, since sorted() has
##### already consumed all items; passing a default returns that default instead
a = first_n(10)
print(sorted(a))
next(a,'StopIteration')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


'StopIteration'