## Load Data to Memory

`train` and `test`: lists in which each element is a `(rating, review_text)` tuple.

In [1]:
# Read the raw review file into memory, one string per line.
# The context manager closes the file handle once every row has been read.
with open('amazon_total_review_and_rate.csv') as file_data:
    # Iterating a text file yields each line with its trailing newline kept.
    list2 = list(file_data)

## Data pre-processing module


In [30]:
# Turn every raw line into a (rating, review_text) pair: the rating is the
# second-to-last character of the line and the text is everything before the
# last three characters (presumably ",<rating>\n" -- confirm against the CSV).
list3 = [(row[-2], row[:-3]) for row in list2]
    

In [31]:
# The first 80% of the pairs form the training split, the remainder the test split.
lenth = len(list3)
split_at = int(lenth*0.8)
train, test = list3[:split_at], list3[split_at:]
print(len(test), len(train))
# Display one held-out sample as a sanity check.
test[1003000]

1035937 4143748


('5',
 "I wanted a way to store my daughter's toys without just throwing them all in a toybox. This was EXACTLY right for us! She loves taking toys out of one bucket and putting them in another. It was simple to put together and is light enough to carry around from room to room.")

In [32]:
# Distinct rating labels, in order of first appearance
# (dict keys keep insertion order).
l = list(dict.fromkeys(rate for rate, _ in list3))
l

['5', '4', '3', '1', '2']

## Data pre-processing module

This is very similar to what I did in Project 2.

In [79]:
'''
The following link is useful for understanding sampler, batching, and sequence padding work.
    https://www.scottcondron.com/jupyter/visualisation/audio/2020/12/02/dataloaders-samplers-collate.html#Custom-Sampler
'''

import numpy as np

import torch
from torch.utils.data.sampler import BatchSampler, Sampler
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import random
# Run on the CUDA device with index 1 when CUDA is available, else on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:1')
else:
    device = torch.device('cpu')

# Seed NumPy's and Python's random number generators with the same value.
SEED = 4321
np.random.seed(SEED)
random.seed(SEED)

class Corpora():
    """
    Holds the encoded training and test corpora and their shared vocabulary.

    Attributes:
        word_index: maps every word to an integer index. Index 0 belongs to
            the placeholder entry "N0O0N", so real words are numbered from 1
            in the order they are first seen.
        training_reviews: list of (word_index_list, rate) tuples.
        test_reviews: list of (word_index_list, rate) tuples.
        max_len: number of words in the longest review encoded so far.
    """

    def __init__(self):
        """
        Create an empty corpus holder whose vocabulary contains only the
        placeholder entry.
        """
        # word to index mapping; 0 is taken by the placeholder
        self.word_index = {"N0O0N": 0}
        # list of review tuples, each of which is (word_index_list, rate)
        self.training_reviews = []
        # same format as training_reviews
        self.test_reviews = []

        self.max_len = 0

    def insert_fields(self, input):
        """
        Encode one tokenised review, adding unseen words to the vocabulary.

        input: iterable of words belonging to a single review.
        return: list holding the index of every word, in the same order.

        Side effects: grows `word_index` and raises `max_len` when this
        review is longer than every review seen before.
        """
        word_indexes = []
        for word in input:
            # setdefault gives an unseen word the next free index; no +1 is
            # needed because index 0 is already occupied by the placeholder.
            word_indexes.append(
                self.word_index.setdefault(word, len(self.word_index)))
        self.max_len = max(self.max_len, len(word_indexes))
        return word_indexes

    def read_corpus(self, is_training, reviews=None):
        """
        Encode a corpus and append it to the training or the test reviews.

        is_training: truthy selects the training corpus, falsy the test
            corpus. The same truthiness check picks both the data source and
            the destination list, so they can never disagree.
        reviews: optional iterable of (rate, text) pairs. When omitted, the
            module-level `train` (or `test`) list is read.
        """
        is_training = bool(is_training)
        if reviews is None:
            reviews = train if is_training else test
        destination = self.training_reviews if is_training else self.test_reviews
        for rate, text in reviews:
            # Reviews are tokenised by splitting on single spaces.
            destination.append((self.insert_fields(text.split(" ")), rate))
                    
                
# Inherits from Dataset; converts the index list and the rating to tensors and moves them to `device`.
class ReviewRateDataset(Dataset):
    """
    Dataset view over a list of (word_index_list, rate) pairs.

    Indexing turns the selected pair into two tensors placed on `device`.
    """

    def __init__(self, review_rate_pairs):
        # e.g. corpora.training_reviews: a list of (sentence_list, rate)
        self.review_rate_pairs = review_rate_pairs

    def __len__(self):
        pairs = self.review_rate_pairs
        return len(pairs)

    def __getitem__(self, idx):
        tokens, label = self.review_rate_pairs[idx]
        token_tensor = torch.tensor(tokens)
        # The rate is converted with int() before it becomes a tensor.
        label_tensor = torch.tensor(int(label))
        return token_tensor.to(device), label_tensor.to(device)

# NB! This class will be in DataLoader function as a parameter for batch_sampler
class SortedBatchSampler(Sampler):
    """
        Groups reviews of similar length into mini-batches.

        Every sequence in a mini-batch must have the same length, while the
        reviews are of various lengths. Padding fixes that, but mixing short
        and long reviews in one batch wastes a lot of padding. The reviews
        are therefore sorted by length (in descending order) and cut into
        consecutive batches, so each batch holds reviews of similar length.

        NB: iterating this object yields the already padded
        (inputs, rates) tensor pairs produced by padding_collate_func,
        not lists of indices.
    """
    def __init__(self, dataset, batch_size, drop_last=True):
        """
            dataset: a ReviewRateDataset-like object exposing
                `review_rate_pairs`; that attribute is replaced by its
                length-sorted version.
            batch_size: the number of sequences to put in a mini-batch
            drop_last: when True (default, the historical behaviour) a
                trailing batch with fewer than batch_size reviews is
                discarded; when False it is kept as a smaller final batch.
        """
        self.batch_size = batch_size
        self.drop_last = drop_last
        # Sort the dataset on review length, longest first.
        dataset.review_rate_pairs = sorted(
            dataset.review_rate_pairs, key=lambda x: len(x[0]), reverse=True)
        # Despite its name this is the number of reviews in the dataset.
        self.sorted_lengths = len(dataset)
        # All batches are padded eagerly here, one slice of the sorted
        # reviews per batch.
        self.index_batches = [
            padding_collate_func(
                ReviewRateDataset(dataset.review_rate_pairs[start:start + batch_size]))
            for start in range(0, len(self) * batch_size, batch_size)
        ]

    def __iter__(self):
        """
            return a Python iterator over the prepared mini-batches
        """
        return iter(self.index_batches)

    def __len__(self):
        """
            return the number of mini-batches, honouring `drop_last`
        """
        full_batches, remainder = divmod(self.sorted_lengths, self.batch_size)
        if remainder and not self.drop_last:
            return full_batches + 1
        return full_batches

# NB! This function will be in DataLoader function as a parameter for collate_fn
def padding_collate_func(batch):
    """
        Pad the reviews of one mini-batch to a common length and gather
        their rates.

        batch: either an object exposing `review_rate_pairs` (such as a
            ReviewRateDataset) or a plain iterable of pairs. Every pair is
            (word_index_sequence, rate); the sequence may be a list or a
            tensor, the rate anything int() accepts.

        return: two tensors on `device`
            inputs: one row per review, right-padded with zeros to the
                    length of the longest review in the batch
            rates:  one integer rate per review

        Example: pairs  = [([1,2,3], '5'), ([4], '2')]
                 inputs = [[1,2,3], [4,0,0]]
                 rates  = [5, 2]
    """
    # Fall back to the argument itself when it is already a sequence of pairs.
    pairs = getattr(batch, "review_rate_pairs", batch)

    sequences = []
    rates = []
    for sequence, rate in pairs:
        # as_tensor reuses an existing tensor instead of copying it again.
        sequences.append(torch.as_tensor(sequence))
        rates.append(int(rate))

    # Pad once and move each result to the device once, instead of
    # re-wrapping the padded rows and copying them row by row.
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded.to(device), torch.tensor(rates, dtype=torch.long).to(device)


In [36]:
# Build the shared vocabulary while encoding the training split, then the test split.
a = Corpora()
a.read_corpus(True)
a.read_corpus(False)

In [51]:
# Summary of the encoded corpora: test-set size, vocabulary size
# (placeholder entry included) and the longest review in words.
print("numbers of test reviews", len(a.test_reviews))
print("numbers of Unique words", len(a.word_index.keys()))
print(f'Maximal sentence length = {a.max_len}')

numbers of test reviews 1035937
numbers of Unique words 1130764
Maximal sentence length = 5013


In [59]:
import os
# Print the ID of the current process (presumably to monitor it from outside -- confirm).
print(os.getpid())

21031


In [70]:
# Wrap the encoded test reviews so each one can be fetched as a pair of tensors.
test_dataset = ReviewRateDataset(a.test_reviews)

In [71]:
# Peek at the first test sample: (word-index tensor, rating tensor).
test_dataset[0]

(tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         19, 20,  3, 21, 22,  8, 23, 24,  3, 25, 26, 27, 28, 29, 30, 31, 22, 32,
         33, 11, 34, 19, 20,  3, 35, 16, 36, 16, 33, 37, 38, 39, 40, 41,  2, 42,
         43, 41,  2, 44, 45, 46, 47, 48, 20, 49, 50, 13, 29, 51, 22, 12, 52, 53,
         44, 54, 16, 55, 31, 56, 57, 58, 59, 60, 61, 62, 63, 16, 22, 64, 42, 24,
         14, 65, 24, 66, 67, 68, 69, 11, 70, 71], device='cuda:1'),
 tensor(5, device='cuda:1'))

In [80]:
# Sort the test reviews by length and build padded mini-batches of 256 reviews.
# Note: this replaces test_dataset.review_rate_pairs with its sorted version.
test_sampler = SortedBatchSampler(test_dataset, batch_size=256)



In [83]:
# Take the first mini-batch, an (inputs, rates) pair of tensors, for inspection.
try_some = list(test_sampler)[0]

In [84]:
# Padded word-index matrix of the first mini-batch.
print(try_some[0])

tensor([[231654,  31260,   9952,  ...,    265,     20,  17081],
        [231654,  31260,   9952,  ...,      0,      0,      0],
        [115126,     22,    124,  ...,      0,      0,      0],
        ...,
        [  2326,   3638,     16,  ...,      0,      0,      0],
        [  6946,    177,    144,  ...,      0,      0,      0],
        [ 96443,    906,  23059,  ...,      0,      0,      0]],
       device='cuda:1')


In [85]:
# Shape of the padded inputs: (reviews in the batch, padded length).
print(try_some[0].shape)

torch.Size([256, 5013])


In [86]:
# Ratings belonging to the reviews of the first mini-batch.
print(try_some[1])

tensor([5, 5, 5, 4, 2, 5, 3, 5, 4, 5, 4, 4, 4, 5, 4, 4, 4, 5, 5, 5, 5, 4, 2, 4,
        4, 5, 5, 5, 3, 5, 3, 1, 5, 5, 5, 5, 3, 5, 5, 4, 5, 5, 5, 5, 5, 1, 4, 5,
        5, 5, 5, 5, 3, 4, 4, 5, 4, 1, 5, 5, 5, 5, 1, 5, 5, 4, 5, 4, 4, 4, 5, 2,
        5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 5, 4, 3, 4, 5,
        5, 1, 2, 5, 3, 5, 5, 4, 4, 5, 5, 3, 4, 3, 4, 5, 5, 5, 3, 3, 5, 4, 5, 5,
        5, 4, 2, 3, 5, 5, 2, 5, 5, 4, 4, 5, 3, 2, 5, 2, 5, 4, 5, 1, 3, 5, 5, 5,
        5, 5, 5, 5, 1, 2, 4, 4, 5, 2, 5, 5, 5, 4, 5, 5, 2, 5, 3, 5, 5, 3, 4, 5,
        3, 4, 5, 5, 5, 4, 5, 4, 4, 1, 4, 5, 3, 5, 3, 5, 5, 5, 4, 4, 4, 2, 4, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 4, 3, 1, 1, 4, 3, 5, 5, 5, 5, 5, 4, 4, 1, 5, 5,
        5, 5, 5, 5, 5, 1, 5, 3, 5, 4, 3, 5, 5, 4, 5, 5, 5, 1, 5, 4, 5, 4, 5, 5,
        5, 1, 5, 3, 4, 5, 5, 5, 5, 2, 3, 5, 4, 5, 5, 5], device='cuda:1')


In [87]:
# One rating per review in the batch.
print(try_some[1].shape)

torch.Size([256])
