# In [1]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

# In [2]:
from typing import *

# In [3]:
class ReviewVectorizer:
    """Placeholder vectorizer type; no behaviour is defined on it here."""

    def __init__(self):
        """Create an instance without setting any attributes."""

# In [4]:
class ReviewDataset(Dataset):
    """Split-aware dataset that turns review rows into model inputs.

    The wrapped frame must provide ``split``, ``review`` and ``rating``
    columns. Rows are partitioned once, by their ``split`` value
    (``"train"``, ``"val"`` or ``"test"``); ``set_split`` selects which
    partition ``len()`` and indexing operate on. The training partition is
    active right after construction.
    """

    def __init__(self, dataset: pd.DataFrame, vectorizer: "ReviewVectorizer"):
        """Store the frame and vectorizer and pre-compute the partitions.

        Args:
            dataset: frame holding every row, with a ``split`` column.
            vectorizer: object used by ``__getitem__``; it must offer
                ``vectorize(review)`` and ``rating_vocab.lookup_token(rating)``.
        """
        self.dataset = dataset
        self._vectorizer = vectorizer

        # Bracket access cannot be shadowed by a DataFrame attribute of the
        # same name, unlike ``dataset.split``.
        split_column = dataset["split"]
        train = dataset[split_column == "train"]
        validation = dataset[split_column == "val"]
        test = dataset[split_column == "test"]

        validation_entry = (validation, len(validation))
        self._lookup_dict = {
            "train": (train, len(train)),
            "validation": validation_entry,
            # Alias so the label used in the data itself ("val") is accepted
            # by set_split as well as the longer "validation" key.
            "val": validation_entry,
            "test": (test, len(test)),
        }

        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, dataset_path: str, vectorizer_path: str):
        """Build a dataset from a CSV file and a saved vectorizer.

        Args:
            dataset_path: path of the CSV file read with ``pd.read_csv``.
            vectorizer_path: path of the JSON file passed to
                ``load_vectorizer``.

        Returns:
            A new instance of ``cls`` wrapping the loaded frame and vectorizer.
        """
        dataset = pd.read_csv(dataset_path)
        # Must go through ``cls``: there is no ``self`` inside a classmethod.
        vectorizer = cls.load_vectorizer(vectorizer_path)
        return cls(dataset, vectorizer)

    @staticmethod
    def load_vectorizer(vectorizer_path: str):
        """Rebuild a vectorizer from the JSON file at ``vectorizer_path``.

        The parsed JSON is handed to ``ReviewVectorizer.from_serializable``,
        which ``ReviewVectorizer`` is expected to provide.
        """
        with open(vectorizer_path) as f:
            return ReviewVectorizer.from_serializable(json.load(f))

    def save_vectorizer(self, vectorizer_path: str):
        """Write the vectorizer's serializable form to ``vectorizer_path`` as JSON.

        Relies on the vectorizer providing ``to_serializable()``.
        """
        # Write mode is required; the default read mode makes json.dump fail.
        with open(vectorizer_path, "w") as f:
            json.dump(self._vectorizer.to_serializable(), f)

    def get_vectorizer(self):
        """Return the vectorizer given at construction."""
        return self._vectorizer

    def set_split(self, split: str = "train"):
        """Select the active partition.

        Args:
            split: one of ``"train"``, ``"validation"``, ``"val"`` or ``"test"``.

        Raises:
            KeyError: if ``split`` is not a known partition name.
        """
        self._target_split = split
        self._target_data, self._target_size = self._lookup_dict[split]

    def __len__(self):
        """Return the number of rows in the active partition."""
        return self._target_size

    def __getitem__(self, index: int):
        """Return the item at position ``index`` of the active partition.

        Returns:
            A dict with ``"x"`` set to ``vectorize(review)`` and ``"y"`` set
            to ``rating_vocab.lookup_token(rating)`` for that row.
        """
        row = self._target_data.iloc[index]
        vector_review = self._vectorizer.vectorize(row["review"])
        index_rating = self._vectorizer.rating_vocab.lookup_token(row["rating"])
        return {"x": vector_review, "y": index_rating}

    def get_number_batches(self, batch_size: int):
        """Return how many full batches of ``batch_size`` fit in the active partition."""
        # Floor division: a trailing incomplete batch is not counted.
        return len(self) // batch_size
    
def generate_batches(dataset: Dataset, batch_size: int, shuffle: bool = True, drop_last: bool = True, device: str = "cpu"):
    """Yield batches from ``dataset`` with every tensor moved to ``device``.

    Wraps a ``DataLoader`` built over ``dataset`` and re-emits each collated
    batch as a new dict whose values have been moved with ``.to(device)``.

    Args:
        dataset: the dataset handed to ``DataLoader``; its items are expected
            to be dicts so that each collated batch is a dict of tensors.
        batch_size: number of items per batch.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``; when true a trailing
            incomplete batch is skipped.
        device: target passed to ``Tensor.to`` for every batch entry.

    Yields:
        A dict with the same keys as the collated batch and the moved tensors
        as values.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        # Use the value already bound by items() instead of re-indexing the dict.
        yield {name: tensor.to(device) for name, tensor in data_dict.items()}