In [28]:
from argparse import Namespace
import collections
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

from vocabulary import Vocabulary

%matplotlib inline

# Plot styling applied to every figure drawn in this notebook.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 6)

# Sentinel tokens handed to Vocabulary as start_token / end_token when the
# vectorizer is fit.
START_TOKEN = "^"
END_TOKEN = "_"

In [29]:
# Seed passed to np.random.seed before rows are shuffled into splits.
SEED = 0
# Fractions of each sentiment group assigned to the train / val / test splits.
TRAIN_PROP = 0.7
VAL_PROP = 0.15
TEST_PROP = 0.15

In [30]:
# Load the raw reviews for a quick look. The file is read with header=None,
# so the three column names are assigned by hand afterwards.
df = pd.read_csv("../data/amazon_train_small.csv", header=None)
df.columns = ['label', 'title', 'body']

In [31]:
# Preview up to the first 20 rows of the raw frame.
df.head(n=20)

Unnamed: 0,label,title,body
0,2,Right on the money,We are using the this book to get 100+ certifi...
1,2,Serves its Purpose!,Couldn't go without it. My 3 1/2 year still we...
2,2,Trailer Park Bwoys!!!,we get to see it on paramount in ol' LND UK an...
3,1,buyer beware,There are companies selling Bosch knock-offs o...
4,2,Great for those cold winters,If you are looking to keep your water liquifie...
5,1,keeps breaking!,I own a Nomad II 64 MP3 player and it has brok...
6,1,Not Happy,Thought this was in English but it is in Germa...
7,1,mount doesn't stay put,I saw quite a few very positive reviews for th...
8,2,"Finally , Some Common Sense!","I was afraid this book would just bash media, ..."
9,2,"Good value, time saver",My wife is a lifelong weightwatcher. She has b...


In [50]:
class AmazonReviewsDataset(object):
    """Amazon reviews read from CSV, tagged with a data split, and cleaned.

    The CSV columns are named ``sentiment``, ``title`` and ``review``. On
    construction every row is given a ``split`` label ('train', 'val' or
    'test') and the review text is normalized; see ``preprocess``.

    Args:
        data_path (str): Path to the Amazon reviews CSV file.

    Attributes:
        data (pandas.DataFrame): The preprocessed reviews, including the
            added ``split`` column.
    """

    def __init__(self, data_path):
        data = pd.read_csv(data_path, names=['sentiment', 'title', 'review'])
        self.data = self.preprocess(data)

    def preprocess(self, review_df):
        """Assign train/val/test splits per sentiment and normalize the text.

        Rows are grouped by their ``sentiment`` value and each group is
        shuffled and split on its own, so every split keeps the label balance
        of the input. Split sizes come from the module-level TRAIN_PROP,
        VAL_PROP and TEST_PROP constants.

        Args:
            review_df (pandas.DataFrame): Frame with at least ``sentiment``
                and ``review`` columns.

        Returns:
            pandas.DataFrame: One row per input row, ordered by sentiment
            group, with a ``split`` column added and ``review`` lower-cased,
            with . , ! ? set off by spaces and every other non-letter run
            collapsed to a single space.
        """
        def _preprocess_func(text):
            text = text.lower()
            # Surround sentence punctuation with spaces so it becomes its own token.
            text = re.sub(r"([.,!?])", r" \1 ", text)
            # Collapse anything that is not a letter or kept punctuation into one space.
            text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
            return text

        # Group the rows by sentiment so each label is split independently.
        by_sentiment = collections.defaultdict(list)
        for _, row in review_df.iterrows():
            by_sentiment[row.sentiment].append(row.to_dict())

        final_list = []
        # Seeds NumPy's global RNG so the shuffles below are repeatable.
        np.random.seed(SEED)

        # When the proportions are meant to cover the whole dataset, the test
        # split takes whatever the int() truncation of the other two left
        # over. Otherwise those rows would end up with no split at all.
        proportions_cover_all = np.isclose(TRAIN_PROP + VAL_PROP + TEST_PROP, 1.0)

        for _, item_list in sorted(by_sentiment.items()):

            np.random.shuffle(item_list)

            n_total = len(item_list)
            n_train = int(TRAIN_PROP * n_total)
            n_val = int(VAL_PROP * n_total)
            if proportions_cover_all:
                n_test = n_total - n_train - n_val
            else:
                # Proportions deliberately leave rows unused; those rows get
                # no 'split' key and show up as NaN in the output frame.
                n_test = int(TEST_PROP * n_total)

            # Give data point a split attribute
            for item in item_list[:n_train]:
                item['split'] = 'train'

            for item in item_list[n_train:n_train+n_val]:
                item['split'] = 'val'

            for item in item_list[n_train+n_val:n_train+n_val+n_test]:
                item['split'] = 'test'

            # Add to final list
            final_list.extend(item_list)

        output_df = pd.DataFrame(final_list)
        output_df['review'] = output_df.review.apply(_preprocess_func)
        return output_df

    def get_data(self):
        """Return the preprocessed DataFrame built in ``__init__``."""
        return self.data

class AmazonReviewsVectorizer(object):
    """Converts space-tokenized reviews into fixed-width integer index arrays.

    Args:
        word_vocab: Vocabulary object used to map tokens to integer indices.
        max_seq_length (int): Number of columns in every vectorized row.
    """

    def __init__(self, word_vocab, max_seq_length):
        self.max_seq_length = max_seq_length
        self.word_vocab = word_vocab

    @classmethod
    def fit(cls, review_df):
        """Create a vectorizer from the ``review`` column of ``review_df``.

        Each review is split on single spaces and every resulting token is
        added to a fresh Vocabulary. The sequence width is the largest token
        count seen plus two (presumably room for the start and end tokens
        added in ``transform`` -- confirm against Vocabulary.map).

        Returns:
            AmazonReviewsVectorizer: Instance holding the new vocabulary and
            the computed width.
        """
        word_vocab = Vocabulary(use_unks=False,
                                use_start_end=True,
                                use_mask=True,
                                start_token=START_TOKEN,
                                end_token=END_TOKEN)
        longest = 0
        for text in review_df['review'].values:
            tokens = text.split(" ")
            for token in tokens:
                word_vocab.add(token)
            longest = max(longest, len(tokens))
        return cls(word_vocab, longest + 2)

    def transform(self, review_df, split='train'):
        """Vectorize the rows of ``review_df`` that belong to ``split``.

        Args:
            review_df (pandas.DataFrame): Frame with ``review``,
                ``sentiment`` and ``split`` columns.
            split (str): Value of the ``split`` column to keep.

        Returns:
            VectorizedAmazonReviews: Wraps an int64 array of token indices,
            zero-filled on the right up to ``max_seq_length``, and an int64
            array holding the matching ``sentiment`` values.
        """
        subset = review_df[review_df.split == split].reset_index()
        texts = subset['review'].tolist()
        labels = subset['sentiment'].tolist()

        y_sentiment = np.zeros(len(subset), dtype=np.int64)
        x_words = np.zeros((len(subset), self.max_seq_length), dtype=np.int64)

        for position, (text, label) in enumerate(zip(texts, labels)):
            token_ids = list(self.word_vocab.map(text.split(' '), include_start_end=True))
            # Columns past the end of the sequence keep their initial zeros.
            x_words[position, :len(token_ids)] = token_ids
            y_sentiment[position] = label

        return VectorizedAmazonReviews(x_words, y_sentiment)

class VectorizedAmazonReviews(Dataset):
    """Dataset pairing vectorized review rows with their targets.

    Args:
        x_input: Indexable collection of index rows, one per review.
        y_target: Indexable collection of targets aligned with ``x_input``.
    """

    def __init__(self, x_input, y_target):
        self.y_target = y_target
        self.x_input = x_input

    def __len__(self):
        """Number of examples, taken from ``x_input``."""
        return len(self.x_input)

    def __getitem__(self, index):
        """Return one example as a dict.

        Keys are ``x_input`` (the index row), ``y_target`` (its target) and
        ``x_lengths`` (how many entries of the row are non-zero; this equals
        the sequence length only if index 0 is reserved for padding --
        TODO confirm against the Vocabulary mask index).
        """
        token_ids = self.x_input[index]
        nonzero_positions = token_ids.nonzero()[0]
        sample = {'x_input': token_ids}
        sample['y_target'] = self.y_target[index]
        sample['x_lengths'] = len(nonzero_positions)
        return sample
    

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Yield batches from ``dataset`` with every tensor moved to ``device``.

    A thin generator around the PyTorch DataLoader: each batch it produces
    is a dict, and each value in that dict is sent to the requested device
    before the batch is yielded.

    Args:
        dataset: Dataset whose items are dicts.
        batch_size (int): Number of examples per batch.
        shuffle (bool): Forwarded to DataLoader.
        drop_last (bool): Forwarded to DataLoader.
        device: Target passed to each tensor's ``.to``.

    Yields:
        dict: Same keys as the DataLoader batch, values on ``device``.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {key: value.to(device) for key, value in batch.items()}

In [51]:
# Read the CSV, assign train/val/test splits, and normalize the review text.
dataset = AmazonReviewsDataset("../data/amazon_train_small.csv")

In [52]:
# Build the vocabulary and the sequence width from every review in the frame
# (all splits, not only the training rows).
vectorizer = AmazonReviewsVectorizer.fit(dataset.get_data())

In [53]:
# Vectorize the rows whose split is 'train'.
train_dataset = vectorizer.transform(dataset.get_data(), split='train')

In [54]:
# Vectorize the rows whose split is 'val'.
val_dataset = vectorizer.transform(dataset.get_data(), split='val')

In [56]:
# Pull a single batch of 8 training examples (shuffled and on the CPU, per the
# generate_batches defaults) to experiment with.
batch_dict = next(generate_batches(train_dataset, 8))

## TASK

1. Create an embedding layer and get it to work with the `batch_dict` above.
2. Use one of:
    1. a deep averaging network
    2. a convnet
    3. an RNN