In [1]:
import pandas as pd
import collections
from collections import Counter
import string
import numpy as np
from argparse import Namespace
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
### Load the full news dataset from CSV into `df_all`; later cells sample from it.
df_all = pd.read_csv('news_with_splits.csv')
### Quick sanity check: print the (rows, columns) shape, a separator, and the first rows.
print("shape of the data: ", df_all.shape)
print('-'*60)
print(df_all.head())

shape of the data:  (120000, 3)
------------------------------------------------------------
   category  split                                 title
0  Business  train    Jobs, tax cuts key issues for Bush
1  Business  train  Jarden Buying Mr. Coffee #39;s Maker
2  Business  train     Retail sales show festive fervour
3  Business  train   Intervoice's Customers Come Calling
4  Business  train     Boeing Expects Air Force Contract


In [3]:
### Notebook-wide settings.
### - cutoff: read as args.cutoff by the NewsDataset constructors and passed to
###   NewsVectorizer.from_dataframe, which keeps a word only if its count > cutoff.
### - device: not read by any class defined in this notebook; presumably meant to be
###   passed as the `device` argument of generate_batches() -- TODO confirm.
args = Namespace(
    cutoff = 1,
    device = 'cpu'
)

# Define three relevant classes
### - Vocabulary ([see a walkthrough here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/AGNews/class_Vocabulary.ipynb))
### - SequenceVocabulary ([see a walkthrough here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/AGNews/class_SequenceVocabulary.ipynb))
### - NewsVectorizer ([see a walkthrough here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/AGNews/class_Vectorizer.ipynb))

In [4]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Tokens are stored in ``_token_to_idx`` and the reverse mapping in
    ``_idx_to_token``; both are kept in sync by ``add_token``.
    """

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices.
                The dict is stored as-is (not copied), so tokens added later
                are also written into the dict that was passed in.
        """
        if token_to_idx is None:
            token_to_idx = {}

        self._token_to_idx = token_to_idx

        ### Reverse mapping, derived once from the initial token -> index map
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
                (the existing index if the token is already known)
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            ### A new token gets the next free position: the current size
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is not in the Vocabulary
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            ### Format with %s (not %d) so that a non-numeric index still produces
            ### the documented KeyError instead of a TypeError from the formatting.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Number of tokens currently stored."""
        return len(self._token_to_idx)
    
class SequenceVocabulary(Vocabulary):
    """Vocabulary that also registers four special sequence tokens.

    On construction the mask, unknown, begin-of-sequence and end-of-sequence
    tokens are added (in that order) and their indices are exposed as
    ``mask_index``, ``unk_index``, ``begin_seq_index`` and ``end_seq_index``.
    """

    def __init__(self,
                 token_to_idx    = None,
                 unk_token       = "<UNK>",
                 mask_token      = "<MASK>",
                 begin_seq_token = "<BEGIN>",
                 end_seq_token   = "<END>"):
        ### The parent constructor creates ._token_to_idx and ._idx_to_token
        super().__init__(token_to_idx)

        ### Remember the literal special tokens
        self._mask_token      = mask_token
        self._unk_token       = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token   = end_seq_token

        ### Register them; starting from an empty vocabulary the returned
        ### indices are 0 (mask), 1 (unk), 2 (begin) and 3 (end)
        self.mask_index       = self.add_token(mask_token)
        self.unk_index        = self.add_token(unk_token)
        self.begin_seq_index  = self.add_token(begin_seq_token)
        self.end_seq_index    = self.add_token(end_seq_token)

    ### Overrides Vocabulary.lookup_token()
    def lookup_token(self, token):
        """Return the index of ``token``, falling back to the UNK index.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the token's index, or ``unk_index`` when the token
                is not in the vocabulary
        Notes:
            The fallback is only used while ``unk_index`` is >= 0; otherwise
            an unknown token raises KeyError, as in the parent class.
        """
        if self.unk_index < 0:
            return self._token_to_idx[token]
        return self._token_to_idx.get(token, self.unk_index)

        
class NewsVectorizer(object):
    """Turns a title string into a fixed-length vector of vocabulary indices.

    Holds two vocabularies: ``title_vocab`` for the words of a title and
    ``category_vocab`` for the category labels.
    """

    def __init__(self, title_vocab, category_vocab):
        self.title_vocab    = title_vocab
        self.category_vocab = category_vocab

    @classmethod
    def from_dataframe(cls, news_df, cutoff):
        """Build both vocabularies from a dataframe and return a vectorizer.

        Args:
            news_df (pandas.DataFrame): the news dataset; its ``category`` and
                ``title`` columns are read
            cutoff (int): a word enters the title vocabulary only if it
                occurs more than ``cutoff`` times
        Returns:
            an instance of the NewsVectorizer
        """
        ### Category labels, added in sorted order so the indices are deterministic
        category_vocab = Vocabulary()
        for label in sorted(set(news_df.category)):
            category_vocab.add_token(label)

        ### Count every space-separated word over all titles, skipping words for
        ### which `word not in string.punctuation` is False
        word_counts = Counter(
            word
            for title in news_df.title
            for word in title.split(" ")
            if word not in string.punctuation
        )

        ### Keep only the words seen more than `cutoff` times
        title_vocab = SequenceVocabulary()
        for word, occurrences in word_counts.items():
            if occurrences > cutoff:
                title_vocab.add_token(word)

        return cls(title_vocab, category_vocab)

    def vectorize(self, title, vector_length=-1):
        """Convert a title into an int64 index vector.

        The vector is: begin-of-sequence index, one index per space-separated
        token of ``title``, end-of-sequence index, then mask-index padding.

        Args:
            title (str): the string of words separated by a space
            vector_length (int): length to force on the output; a negative
                value means "exactly as long as the sequence"
        Returns:
            numpy.ndarray: 1-D array of dtype int64 and length ``vector_length``
        """
        vocab = self.title_vocab

        sequence = [vocab.begin_seq_index]
        for token in title.split(" "):
            sequence.append(vocab.lookup_token(token))
        sequence.append(vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(sequence)

        ### Start from an all-padding vector, then write the sequence at the front
        out_vector = np.full(vector_length, vocab.mask_index, dtype=np.int64)
        out_vector[:len(sequence)] = sequence
        return out_vector

# 1. NewsDataset class
### - The Dataset class will characterize the key features of the dataset.
### - In the initialization function of the class, make the class inherit the properties of torch.utils.data.Dataset so that we can later leverage its functionalities.
### - In the \_\_init\_\_() function and the set_split() function, store important information such as labels and the features that we wish to generate at each pass.
### - Each call requests a sample index for which the upperbound is specified in the \_\_len\_\_() method.
### - When the sample corresponding to a given index is called, the generator executes the \_\_getitem\_\_() method to generate it.

In [5]:
class NewsDataset(Dataset):
    """PyTorch ``Dataset`` serving vectorized news titles and category labels.

    The dataframe's ``title``, ``category`` and ``split`` columns are read.
    Rows are partitioned by ``split`` into 'train', 'val' and 'test';
    ``set_split`` selects which partition ``len()`` and indexing operate on.
    """

    def __init__(self,news_df,vectorizer):
        """
        Args:
            news_df (pandas.DataFrame): the dataset
            vectorizer (NewsVectorizer): vectorizer instantiated from the dataset
        """
        self.news_df     = news_df
        self._vectorizer = vectorizer

        ### NewsVectorizer.vectorize() will be called with
        ### vector_length = self._max_seq_length (based on the longest title in news_df),
        ### so that the vectors for different rows have the same length.
        measure_len = lambda text: len(text.split(" "))
        ### +1 if only using begin_seq, +2 if using both begin and end seq tokens
        self._max_seq_length = max(map(measure_len, news_df.title)) + 2

        self.train_df    = self.news_df[self.news_df.split=='train']
        self.train_size  = len(self.train_df)

        self.val_df      = self.news_df[self.news_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df     = self.news_df[self.news_df.split=='test']
        self.test_size   = len(self.test_df)

        ### split name -> (dataframe of that split, number of rows); used by set_split()
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val'  : (self.val_df, self.validation_size),
                             'test' : (self.test_df, self.test_size)}
        self.set_split('train')

        ### Class weights: inverse frequency of each category over the whole news_df,
        ### ordered by the category's index in the vectorizer's category_vocab
        class_counts = news_df.category.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.category_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_csv_and_make_vectorizer(cls,news_csv,cutoff=None):
        """Load dataset from a CSV file and make a new vectorizer from scratch
        Args:
            news_csv (str): location of the dataset (passed to pandas.read_csv)
            cutoff (int, optional): frequency cutoff for the title vocabulary;
                when None, the notebook-level ``args.cutoff`` is used
        Returns:
            an instance of NewsDataset
        """
        ### Bind the loaded frame to `news_df`: the original code assigned it to
        ### `news_csv` and then read an undefined `news_df`, raising NameError.
        news_df = pd.read_csv(news_csv)
        return cls.load_df_and_make_vectorizer(news_df, cutoff)

    @classmethod
    def load_df_and_make_vectorizer(cls,news_df,cutoff=None):
        """Make a new vectorizer from scratch for an already-loaded dataframe
        Args:
            news_df (pandas.DataFrame): the dataset
            cutoff (int, optional): frequency cutoff for the title vocabulary;
                when None, the notebook-level ``args.cutoff`` is used
        Returns:
            an instance of NewsDataset
        """
        if cutoff is None:
            cutoff = args.cutoff
        ### make vectorizer using the training split only
        train_news_df  = news_df[news_df.split=='train']
        vectorizer = NewsVectorizer.from_dataframe(train_news_df,cutoff)
        return cls(news_df,vectorizer)

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe
        Args:
            split (str): one of "train", "val", or "test"
        """
        self._target_split = split
        ### when split = 'train', _target_df means the training set
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        ### _target_size is defined in set_split()
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point (positional, within the active split)
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """

        row = self._target_df.iloc[index]

        ### Every title is padded to the same length, self._max_seq_length
        title_vector = \
            self._vectorizer.vectorize(row.title, self._max_seq_length)

        category_index   = self._vectorizer.category_vocab.lookup_token(row.category)

        return {'x_data': title_vector,
                'y_target': category_index}

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        Args:
            batch_size (int)
        Returns:
            number of full batches in the active split (any remainder is not counted)
        """
        return len(self) // batch_size

# 2. Instantiate a NewsDataset from the training data
### Two classmethods can be used to instantiate a NewsDataset: load_csv_and_make_vectorizer() and load_df_and_make_vectorizer(). The difference is whether the input data comes from a csv file or from a pd.DataFrame.

In [6]:
### Draw 200 random rows from df_all; random_state=1 fixes the seed so the sample
### is the same on every run. The small sample keeps the examples below fast.
df_sample = df_all.sample(200, random_state=1)
df_sample.head()

Unnamed: 0,category,split,title
76467,Sports,train,Jets remain unbeaten
27991,Business,test,AstraZeneca in drive to slash costs
62646,Sports,train,R. Williams Could Be Eligible to Start Next Se...
28374,Business,test,So much for the #39;soft patch #39;
98305,World,train,Bush urges N. Ireland leaders to accept Anglo-...


### Create a NewsDataset.

In [7]:
dataset_sample = NewsDataset.load_df_and_make_vectorizer(df_sample)

## 2.1 - Attributes of a NewsDataset

### .news_df: the input dataframe

In [8]:
dataset_sample.news_df

Unnamed: 0,category,split,title
76467,Sports,train,Jets remain unbeaten
27991,Business,test,AstraZeneca in drive to slash costs
62646,Sports,train,R. Williams Could Be Eligible to Start Next Se...
28374,Business,test,So much for the #39;soft patch #39;
98305,World,train,Bush urges N. Ireland leaders to accept Anglo-...
...,...,...,...
36523,Sci/Tech,train,Apple iMac G5
66196,Sports,train,Petacchi sprints to third victory
51981,Sci/Tech,val,Bare cupboards force space station crew to diet
85956,Sports,test,Strong to Coach Gators in the Peach Bowl (AP)


In [9]:
dataset_sample.news_df.equals(df_sample)

True

### ._max_seq_length: max number of tokens in a title, plus 2 for the begin and end tokens

In [10]:
dataset_sample._max_seq_length

18

### ._vectorizer

In [11]:
### Note that the vectorizer is derived from the training split. 
v = dataset_sample._vectorizer

In [12]:
example_text = "the sun is shining and it is a beautiful day"
vector       = v.vectorize(example_text)

In [13]:
### Show the size of the title vocabulary, its contents, and the vector built above.
print(f'Title Vocabulary: number of tokens: {len(v.title_vocab)}')
print('-'*100)
### NOTE(review): the label below says '_idx_to_token', but the pairs printed are the
### (token, index) items of _token_to_idx.
print(f"The {len(v.title_vocab)} items in '_idx_to_token': ", list(v.title_vocab._token_to_idx.items()))
print('-'*100)
print("Example text:", example_text)
print('vector representation:', vector)

Title Vocabulary: number of tokens: 100
----------------------------------------------------------------------------------------------------
The 100 items in '_idx_to_token':  [('<MASK>', 0), ('<UNK>', 1), ('<BEGIN>', 2), ('<END>', 3), ('to', 4), ('Start', 5), ('N.', 6), ('(AFP)', 7), ('German', 8), ('nuclear', 9), ('be', 10), ('a', 11), ('#39;', 12), ('Mac', 13), ('start', 14), ('Out', 15), ('New', 16), ('of', 17), ('#39;s', 18), ('fall', 19), ('Deal', 20), ('Open', 21), ('in', 22), ('Up', 23), ('as', 24), ('beat', 25), ('Say', 26), ('Star', 27), ('(Reuters)', 28), ('Since', 29), ('up', 30), ('with', 31), ('the', 32), ('door', 33), ('saves', 34), ('by', 35), ('Oil', 36), ('EU', 37), ('on', 38), ('Stocks', 39), ('Data', 40), ('Is', 41), ('for', 42), ('IBM', 43), ('China', 44), ('over', 45), ('Microsoft', 46), ('first', 47), ('day', 48), ('Down', 49), ('lead', 50), ('Six', 51), ('vow', 52), ('Boeing', 53), ('Percent', 54), ('Powell', 55), ('no', 56), ('U.S.', 57), ('supporters', 58), ('

In [14]:
print("The first index: ('<BEGIN>', 2)")
print("The last index: ('<END>', 3)")

The first index: ('<BEGIN>', 2)
The last index: ('<END>', 3)


In [15]:
print("The indeces in the middle corresponed to the mapping stored in title_vocab.")
print("Example - the second last index in the vector representation:")
print(f"The index of 'day': {v.title_vocab._token_to_idx['day']}")
print(f"The token of '48': {v.title_vocab._idx_to_token[48]}")

The indeces in the middle corresponed to the mapping stored in title_vocab.
Example - the second last index in the vector representation:
The index of 'day': 48
The token of '48': day


### ._target_df, _target_size
**Defined by method set_split()**

In [16]:
dataset_sample._target_df

Unnamed: 0,category,split,title
76467,Sports,train,Jets remain unbeaten
62646,Sports,train,R. Williams Could Be Eligible to Start Next Se...
98305,World,train,Bush urges N. Ireland leaders to accept Anglo-...
97960,World,train,German FM: Iranian nuclear arms buildup would ...
12114,Business,train,UPDATE 1-Freddie Mac to start payments to form...
...,...,...,...
67695,Sports,train,Spring change slows Cup cars a bit
48961,Sci/Tech,train,BlackBerry in sync with Mac OS X
36523,Sci/Tech,train,Apple iMac G5
66196,Sports,train,Petacchi sprints to third victory


In [17]:
dataset_sample._target_size

150

### ._lookup_dict - will be used in the method set_split()

In [18]:
dataset_sample._lookup_dict

{'train': (       category  split                                              title
  76467    Sports  train                               Jets remain unbeaten
  62646    Sports  train  R. Williams Could Be Eligible to Start Next Se...
  98305     World  train  Bush urges N. Ireland leaders to accept Anglo-...
  97960     World  train  German FM: Iranian nuclear arms buildup would ...
  12114  Business  train  UPDATE 1-Freddie Mac to start payments to form...
  ...         ...    ...                                                ...
  67695    Sports  train                 Spring change slows Cup cars a bit
  48961  Sci/Tech  train                   BlackBerry in sync with Mac OS X
  36523  Sci/Tech  train                                      Apple iMac G5
  66196    Sports  train                  Petacchi sprints to third victory
  18176  Business  train               Nokia plans 40 new models in  #39;05
  
  [150 rows x 3 columns],
  150),
 'val': (        category split           

In [19]:
### A dictionary which contains a df and a scalar
dataset_sample._lookup_dict['train']

(       category  split                                              title
 76467    Sports  train                               Jets remain unbeaten
 62646    Sports  train  R. Williams Could Be Eligible to Start Next Se...
 98305     World  train  Bush urges N. Ireland leaders to accept Anglo-...
 97960     World  train  German FM: Iranian nuclear arms buildup would ...
 12114  Business  train  UPDATE 1-Freddie Mac to start payments to form...
 ...         ...    ...                                                ...
 67695    Sports  train                 Spring change slows Cup cars a bit
 48961  Sci/Tech  train                   BlackBerry in sync with Mac OS X
 36523  Sci/Tech  train                                      Apple iMac G5
 66196    Sports  train                  Petacchi sprints to third victory
 18176  Business  train               Nokia plans 40 new models in  #39;05
 
 [150 rows x 3 columns],
 150)

In [20]:
### the dataframe
dataset_sample._lookup_dict['train'][0]

Unnamed: 0,category,split,title
76467,Sports,train,Jets remain unbeaten
62646,Sports,train,R. Williams Could Be Eligible to Start Next Se...
98305,World,train,Bush urges N. Ireland leaders to accept Anglo-...
97960,World,train,German FM: Iranian nuclear arms buildup would ...
12114,Business,train,UPDATE 1-Freddie Mac to start payments to form...
...,...,...,...
67695,Sports,train,Spring change slows Cup cars a bit
48961,Sci/Tech,train,BlackBerry in sync with Mac OS X
36523,Sci/Tech,train,Apple iMac G5
66196,Sports,train,Petacchi sprints to third victory


In [21]:
### the sample size
dataset_sample._lookup_dict['train'][1]

150

## 2.2 - Methods of a NewsDataset

### \_\_len\_\_()

In [22]:
len(dataset_sample)

150

### \_\_getitem\_\_()

In [23]:
### The 4th element in the "train" split
### In the __init__ function, self.set_split('train') defines ._target_df
dataset_sample[3]

{'x_data': array([ 2,  8,  1,  1,  9,  1,  1,  1, 10, 11,  1,  1, 12,  3,  0,  0,  0,
         0]),
 'y_target': 3}

In [24]:
df_sample.loc[df_sample['split']=='train',].head(4)

Unnamed: 0,category,split,title
76467,Sports,train,Jets remain unbeaten
62646,Sports,train,R. Williams Could Be Eligible to Start Next Se...
98305,World,train,Bush urges N. Ireland leaders to accept Anglo-...
97960,World,train,German FM: Iranian nuclear arms buildup would ...


In [25]:
df_sample.loc[df_sample['split']=='train',].iloc[3,2]

'German FM: Iranian nuclear arms buildup would be a  #39;nightmare #39;'

In [26]:
for i in range(7,14):
    print(dataset_sample._vectorizer.title_vocab._idx_to_token[i])

(AFP)
German
nuclear
be
a
#39;
Mac


### set_split()

In [27]:
dataset_sample = NewsDataset.load_df_and_make_vectorizer(df_sample)

In [28]:
### Now the split for ._target_df and _target_size is 'train'
dataset_sample._target_df

Unnamed: 0,category,split,title
76467,Sports,train,Jets remain unbeaten
62646,Sports,train,R. Williams Could Be Eligible to Start Next Se...
98305,World,train,Bush urges N. Ireland leaders to accept Anglo-...
97960,World,train,German FM: Iranian nuclear arms buildup would ...
12114,Business,train,UPDATE 1-Freddie Mac to start payments to form...
...,...,...,...
67695,Sports,train,Spring change slows Cup cars a bit
48961,Sci/Tech,train,BlackBerry in sync with Mac OS X
36523,Sci/Tech,train,Apple iMac G5
66196,Sports,train,Petacchi sprints to third victory


In [29]:
len(dataset_sample)

150

In [30]:
### The 4th element in the "train" split
dataset_sample[3]

{'x_data': array([ 2,  8,  1,  1,  9,  1,  1,  1, 10, 11,  1,  1, 12,  3,  0,  0,  0,
         0]),
 'y_target': 3}

In [31]:
### Run set_split() to switch the active split to 'val'
dataset_sample.set_split('val')
# or, equivalently, call the method through the class:
# NewsDataset.set_split(dataset_sample,'val')

In [32]:
### Now the split for ._target_df and _target_size is 'val'
dataset_sample._target_df

Unnamed: 0,category,split,title
23632,Business,val,Marsh to Make Payments More Transparent
83309,Sports,val,Rams QB Leaves Game With Shoulder Injury (AP)
114704,World,val,Farooqi key link between Pakistan and al Qaeda...
115317,World,val,China's Land Grabs Raise Specter of Unrest
24902,Business,val,Select Comfort down; lowers sales expectations
54089,Sci/Tech,val,Australia Says It's Starting to Win Its Locust...
81400,Sports,val,MRFIXIT: EASTER PARADE
112105,World,val,"Mortars hit Baghdad safe zone, killing one"
114698,World,val,Japanese hostage in Iraq reportedly killed
51546,Sci/Tech,val,Tiny memory card for mobiles launched


In [33]:
len(dataset_sample)

28

In [34]:
### The 4th element in the "val" split
dataset_sample[3]

{'x_data': array([ 2,  1,  1,  1,  1,  1, 17,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,
         0]),
 'y_target': 3}

### get_vectorizer()

In [35]:
dataset_sample.get_vectorizer()

<__main__.NewsVectorizer at 0x7fc708b50190>

In [36]:
### Equivalently
dataset_sample._vectorizer

<__main__.NewsVectorizer at 0x7fc708b50190>

### get_num_batches()

In [37]:
dataset_sample = NewsDataset.load_df_and_make_vectorizer(df_sample)
### Switch the split to 'train'
dataset_sample.set_split('train')

In [38]:
dataset_sample.get_num_batches(10)

15

In [39]:
len(dataset_sample._target_df)/10

15.0

In [40]:
len(dataset_sample._target_df)//10

15

In [41]:
### Switch the split to 'val'
dataset_sample.set_split('val')

In [42]:
dataset_sample.get_num_batches(10)

2

In [43]:
len(dataset_sample._target_df)/10

2.8

In [44]:
len(dataset_sample)/10

2.8

# 3. Define a batch generator
### - Wrap the DataLoader
### - Switch the data between the CPU and the GPU.

In [45]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device='cpu'):
    """
    A generator function which wraps the PyTorch DataLoader. It moves every
      tensor of each batch to the requested device before yielding the batch.

    Args:
        dataset: the dataset handed to the DataLoader; each batch it produces
            must be a dict mapping names to tensors
        batch_size (int): number of samples per batch
        shuffle (bool): passed through to the DataLoader
        drop_last (bool): passed through to the DataLoader
        device: target passed to ``tensor.to()`` for every tensor in a batch
    Yields:
        dict: same keys as the DataLoader batch, with each tensor on ``device``
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {name: tensor.to(device) for name, tensor in batch.items()}

## 3.1 Dataset Class
### - The Dataset class characterizes the key features of the dataset you want to generate.
### - The class uses \_\_init\_\_(), \_\_len\_\_(), and \_\_getitem\_\_() to store important information, and generate samples. 
### - The Dataset class is an important argument of the DataLoader class.

In [46]:
### Toy data: two feature columns (x1 = 1..12, x2 = 13..24) and a binary label y
data = {
    'x1': list(range(1, 13)),
    'x2': list(range(13, 25)),
    'y': [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0],
}
df = pd.DataFrame(data)

### Show the raw dict and the dataframe built from it
print("data:", data)
print("-" * 60)
print("df:", df)

data: {'x1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'x2': [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 'y': [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0]}
------------------------------------------------------------
df:     x1  x2  y
0    1  13  0
1    2  14  1
2    3  15  0
3    4  16  1
4    5  17  1
5    6  18  0
6    7  19  0
7    8  20  1
8    9  21  1
9   10  22  0
10  11  23  1
11  12  24  0


In [47]:
##### Define Dataset class
class CustomDataset(Dataset):
    """Minimal map-style dataset over a dataframe.

    Each item is a ``(sample, target)`` pair of float32 tensors: ``sample``
    holds every column of the row except the last, ``target`` the last column.
    """

    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        ### Features: all columns but the last one of the row at `index`
        row_features = self.data.iloc[index, :-1].values
        sample = torch.tensor(row_features, dtype=torch.float32)
        ### Label: the last column of the same row
        row_label = self.data.iloc[index, -1]
        target = torch.tensor(row_label, dtype=torch.float32)
        return sample, target

##### Instantiate the Dataset class on the toy dataframe `df`
custom_dataset = CustomDataset(df)

##### Instantiate the DataLoader class; shuffle=False keeps the rows in dataframe order
batch_size  = 3
data_loader = DataLoader(dataset=custom_dataset, batch_size=batch_size, shuffle=False)

##### Iterate over the loader and print every batch with a running batch number
i = 0
for batch in data_loader:
    print('Batch '+str(i))
    i+=1
    print(batch)
    print('-' * 60)

Batch 0
[tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.]]), tensor([0., 1., 0.])]
------------------------------------------------------------
Batch 1
[tensor([[ 4., 16.],
        [ 5., 17.],
        [ 6., 18.]]), tensor([1., 1., 0.])]
------------------------------------------------------------
Batch 2
[tensor([[ 7., 19.],
        [ 8., 20.],
        [ 9., 21.]]), tensor([0., 1., 1.])]
------------------------------------------------------------
Batch 3
[tensor([[10., 22.],
        [11., 23.],
        [12., 24.]]), tensor([0., 1., 0.])]
------------------------------------------------------------


### An alternative is to use TensorDataset() directly

In [48]:
from torch.utils.data import TensorDataset

In [49]:
x1 = torch.from_numpy(df['x1'].values).float()
x2 = torch.from_numpy(df['x2'].values).float()
y  = torch.from_numpy(df['y'].values).float()
print("x1:", x1)
print("x2:", x2)
print("y:", y)

x1: tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.])
x2: tensor([13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.])
y: tensor([0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0.])


In [50]:
features = torch.stack([x1, x2], dim=1)
features

tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.],
        [ 4., 16.],
        [ 5., 17.],
        [ 6., 18.],
        [ 7., 19.],
        [ 8., 20.],
        [ 9., 21.],
        [10., 22.],
        [11., 23.],
        [12., 24.]])

In [51]:
##### Create Tensor dataset
dataset     = TensorDataset(features, y)
batch_size  = 3

##### Instantiate the DataLoader class
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

##### Obtain the batch
i = 0
for batch in data_loader:
    print('Batch '+str(i))
    i+=1
    print(batch)
    print('-' * 60)

Batch 0
[tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.]]), tensor([0., 1., 0.])]
------------------------------------------------------------
Batch 1
[tensor([[ 4., 16.],
        [ 5., 17.],
        [ 6., 18.]]), tensor([1., 1., 0.])]
------------------------------------------------------------
Batch 2
[tensor([[ 7., 19.],
        [ 8., 20.],
        [ 9., 21.]]), tensor([0., 1., 1.])]
------------------------------------------------------------
Batch 3
[tensor([[10., 22.],
        [11., 23.],
        [12., 24.]]), tensor([0., 1., 0.])]
------------------------------------------------------------


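### The two methods below build the same feature matrix (the first casts the values to float, the second keeps the integer dtype of the dataframe)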

In [52]:
x1 = torch.from_numpy(df['x1'].values).float()
x2 = torch.from_numpy(df['x2'].values).float()
torch.stack([x1, x2], dim=1)

tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.],
        [ 4., 16.],
        [ 5., 17.],
        [ 6., 18.],
        [ 7., 19.],
        [ 8., 20.],
        [ 9., 21.],
        [10., 22.],
        [11., 23.],
        [12., 24.]])

In [53]:
numpy_array = df[['x1', 'x2']].to_numpy()
torch.from_numpy(numpy_array)

tensor([[ 1, 13],
        [ 2, 14],
        [ 3, 15],
        [ 4, 16],
        [ 5, 17],
        [ 6, 18],
        [ 7, 19],
        [ 8, 20],
        [ 9, 21],
        [10, 22],
        [11, 23],
        [12, 24]])

## 3.2 DataLoader
### - batch_size: denotes the number of samples contained in each generated batch.
### - shuffle: if set to True, we will get a new order of exploration at each pass (or just keep a linear exploration scheme otherwise). Shuffling the order in which examples are fed to the classifier is helpful so that batches between epochs do not look alike. Doing so will eventually make our model more robust.
### - drop_last: set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. (default: False)

In [54]:
type(dataset_sample[0]['x_data'])

numpy.ndarray

In [55]:
### Rebuild the dataset (its constructor leaves the active split at 'train') and wrap it
### in a DataLoader. batch_size, shuffle and drop_last are kept as notebook-level
### variables because later cells reuse them.
dataset_sample = NewsDataset.load_df_and_make_vectorizer(df_sample)
batch_size     = 30
shuffle        = True
drop_last      = True
dataloader     = DataLoader(dataset=dataset_sample, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

In [56]:
one_batch = next(iter(dataloader))
print('x in one batch')
print(one_batch['x_data'])
print('size of x_data:', one_batch['x_data'].shape)
print('-' * 60)
print('y in one batch')
print(one_batch['y_target'])
print('size of y_data:', one_batch['y_target'].shape)

x in one batch
tensor([[ 2,  1,  1, 34,  1, 35,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1, 41,  1,  1,  1, 31, 78,  1,  3,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  1, 99,  1, 22,  1,  1,  1,  7,  3,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  1, 38, 11,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2, 98,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1, 34, 48, 42,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1, 50,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1, 42, 32,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1, 57,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1, 58,  1,  1,  1, 22,  1,  1,  1,  1,  4,  1, 59, 60,  3,  0,  0],
 

### In this example, the dataloader uses the return value of the \_\_getitem\_\_() method, which extracts the requested rows from the _target_df of the dataset, with _target_size=150. Also, batch_size=30 and drop_last=True, so 5 batches are created (150 is divisible by 30, so no rows are dropped).

In [57]:
print('number of rows in the target_df: ', len(dataset_sample._target_df))
print('number of rows in the target_df: ', dataset_sample._target_size)
print("The number of batches is:",dataset_sample.get_num_batches(batch_size = batch_size))

number of rows in the target_df:  150
number of rows in the target_df:  150
The number of batches is: 5


In [58]:
i = 0
for data_dict in dataloader:
    print('Batch '+str(i))
    i+=1
    print(data_dict)
    print(data_dict['x_data'].shape)
    print('-' * 60)

Batch 0
{'x_data': tensor([[ 2,  1, 94,  1,  1, 38,  1, 24,  1,  1,  4,  1,  3,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  1, 79, 42,  1, 68,  3,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1, 23,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1, 39,  1,  1, 38, 40,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2, 83,  1,  4,  1,  1,  1,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1, 57,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1, 24,  1, 25,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  1,  1, 93,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2, 61, 18,  1,  1, 24, 62,  1, 63,  1,  3,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  4,  1,  1, 21,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1, 80,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0

### This is equivalent to defining and using the generator function generate_batches().

In [59]:
### Same loop as in the previous cell, but the batches now come from generate_batches(),
### which builds the DataLoader internally and moves every tensor to args.device.
i = 0
for data_dict in generate_batches(dataset_sample, batch_size=batch_size,
                                  shuffle=shuffle, drop_last=drop_last,
                                  device=args.device):
    print('Batch '+str(i))
    i+=1
    print(data_dict)
    print(data_dict['x_data'].shape)
    print('-' * 60)

Batch 0
{'x_data': tensor([[ 2,  1, 23, 84, 49,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1, 65,  1,  1,  1,  1, 93,  1, 45,  1,  1,  3,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  1,  1, 93,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1, 17,  1, 42,  1,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1, 78,  1,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1, 41,  1, 22,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1, 27,  1, 23,  1,  1,  1,  1, 12,  1,  3,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  1, 20,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1, 31,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1,  1,  1,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  1,  1,  1, 50,  1,  1,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0

## 3.3 Generator
### - Generator functions declare a function that behaves like an iterator, i.e. it can be used in a for loop.
### - A generator function is defined just like a normal function, but whenever it needs to generate a value, it does so with the yield keyword rather than return. 
### - Yield is used in Python generators. If the body of a def contains yield, the function automatically becomes a generator function. 
### - *return* sends a specified value back to its caller whereas *yield* can produce a sequence of values. We should use *yield* when we want to iterate over a sequence, but don’t want to store the entire sequence in memory.

### Consider a task to calculate the sum of the first n integers

In [60]:
##### Eager version: the complete list is materialised in memory before it is returned
def first_n(n):
    """Return a list holding 0, 1, 2, ... for every value smaller than n.

    Args:
        n: exclusive upper bound of the sequence
    Returns:
        list: all generated numbers, stored in memory at once
    """
    collected = []
    # The next number to store always equals how many numbers are stored so far.
    while len(collected) < n:
        collected.append(len(collected))
    return collected
sum(first_n(100))

4950

In [61]:
##### The same sequence written by hand as an iterator class (the generator pattern).
class first_n(object):
    """Iterator that hands out 0, 1, 2, ... while the value is smaller than n.

    Attributes:
        n: exclusive upper bound of the sequence
        num: the value that the next call will return
    """

    def __init__(self, n):
        self.n = n
        self.num = 0

    def __iter__(self):
        """The object is its own iterator."""
        return self

    # Python 3 compatibility
    def __next__(self):
        return self.next()

    def next(self):
        """Return the current value and advance; raise StopIteration when done."""
        if not (self.num < self.n):
            raise StopIteration
        value = self.num
        self.num = value + 1
        return value

a = first_n(10)
# vars() shows the iterator's state before anything has been consumed.
print('vars(a):', vars(a))
# sum() drives the iterator until StopIteration is raised.
print('sum(a):', sum(a))

vars(a): {'n': 10, 'num': 0}
sum(a): 45


In [62]:
##### A generator: values are yielded one at a time instead of being collected in a list

def first_n(n):
    """Lazily yield 0, 1, 2, ... for as long as the value is smaller than n.

    Args:
        n: exclusive upper bound of the sequence
    Yields:
        the next number of the sequence
    """
    current = 0
    while True:
        if not (current < n):
            return
        yield current
        current += 1

a = first_n(10)

print('next(a):', next(a))
print('sum(a):', sum(a))
##### Built-in functions such as sum(a), max(a) and list(a) iterate over every remaining
##### element of 'a'. sum(a) therefore consumes the generator until it is finished.
##### Once a generator has produced all of its values it is exhausted, and calling
##### next() on it raises a StopIteration exception. Use next(generator, default)
##### to get a default value back instead of an exception.

print('next(a):', next(a,None))

next(a): 0
sum(a): 45
next(a): None


In [63]:
##### The generator 'a' was already exhausted by sum(a) in the previous cell,
##### so this loop has nothing to iterate over and prints nothing.
for i in a:
    print (i)

In [64]:
##### Creating a new generator starts the sequence again from the beginning,
##### so the loop prints each value it yields.
a = first_n(10)
for i in a:
    print (i)

0
1
2
3
4
5
6
7
8
9


In [65]:
##### max() iterates over all items, so the generator is exhausted afterwards.
##### A bare next(a) would now raise a StopIteration exception; because a default
##### value ('StopIteration') is passed, next() returns that default instead.
a = first_n(10)
print(max(a))
next(a,'StopIteration')

9


'StopIteration'

In [66]:
##### list() iterates over all items, so the generator is exhausted afterwards.
##### A bare next(a) would now raise a StopIteration exception; because a default
##### value ('StopIteration') is passed, next() returns that default instead.
a = first_n(10)
print(list(a))
next(a,'StopIteration')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


'StopIteration'

In [67]:
##### sorted() iterates over all items, so the generator is exhausted afterwards.
##### A bare next(a) would now raise a StopIteration exception; because a default
##### value ('StopIteration') is passed, next() returns that default instead.
a = first_n(10)
print(sorted(a))
next(a,'StopIteration')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


'StopIteration'