In [1]:
from  torch.utils.data import Dataset
import pandas as pd
import torch
from torch.utils.data import DataLoader
import torch.nn as nn 
import torch.nn.functional as F
from argparse import Namespace
import torch.optim as optim
from collections import Counter
import string
import numpy as np

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [3]:
class Vocabulary:
    """Two-way mapping between tokens and consecutive integer indices.

    When ``add_unk`` is true an unknown-token entry is registered at
    construction time and ``lookup_token`` returns its index for any token
    that has not been added.
    """

    def __init__(self,token_to_idx=None,add_unk=True,unk_token="<UNK>"):
        """Build the vocabulary.

        Args:
            token_to_idx: optional existing token -> index mapping. It is
                copied, so the caller's dict is never modified.
            add_unk: whether to register ``unk_token`` in the vocabulary.
            unk_token: the token used to represent unseen tokens.
        """
        # Copy so that add_token() never mutates the mapping passed in.
        token_to_idx = {} if token_to_idx is None else dict(token_to_idx)

        self.token_to_idx= token_to_idx
        self.idx_to_token={idx:token for token,idx in token_to_idx.items()}
        self.add_unk=add_unk
        self.unk_token=unk_token
        # -1 means "no unknown token registered".
        self.unk_index=-1
        if add_unk:
            self.unk_index= self.add_token(unk_token)

    def to_serailizable(self):
        """Return a dict of constructor arguments describing this vocabulary."""
        return {
            "token_to_idx": self.token_to_idx,
            "add_unk":self.add_unk,
            "unk_token":self.unk_token
        }

    @classmethod
    def from_serailizable(cls,contents):
        """Rebuild a vocabulary from the dict produced by ``to_serailizable``."""
        return cls(**contents)

    def add_token(self,word):
        """Register ``word`` if it is new and return its index."""
        if word not in self.token_to_idx:
            # New tokens receive the next free consecutive index.
            index=len(self.token_to_idx)
            self.token_to_idx[word]=index
            self.idx_to_token[index]=word
        return self.token_to_idx[word]

    def lookup_token(self,token):
        """Return the integer index of ``token``.

        Unseen tokens map to ``unk_index`` when the vocabulary was built with
        ``add_unk=True``.

        Raises:
            KeyError: if ``token`` is unseen and no unknown token is registered.
        """
        if self.add_unk:
            return self.token_to_idx.get(token,self.unk_index)
        return self.token_to_idx[token]

    def lookup_idx(self,idx):
        """Return the token stored at ``idx``.

        Raises:
            KeyError: if ``idx`` is not in the vocabulary.
        """
        if idx not in self.idx_to_token:
            # %s keeps the message identical for ints and still works for
            # non-integer keys, so the caller always sees a KeyError.
            raise KeyError("the index (%s) is not in the Vocabulary" % (idx,))
        return self.idx_to_token[idx]

    def __len__(self):
        return len(self.token_to_idx)

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

In [4]:
class Vectorizer:
    """Turns review text and ratings into one-hot numpy vectors."""

    def __init__(self,text_vocab,rating_vocab):
        """
        Args:
            text_vocab: vocabulary mapping review words to indices.
            rating_vocab: vocabulary mapping rating values to indices.
        """
        self.text_vocab=text_vocab
        self.rating_vocab=rating_vocab

    def vectorize(self,text):
        """Return a float32 vector of length ``len(text_vocab)`` with a 1 at
        the index of every whitespace-separated word of ``text`` that is in
        the text vocabulary. Punctuation and out-of-vocabulary words are
        skipped.
        """
        one_hot=np.zeros(len(self.text_vocab),dtype=np.float32)

        for word in text.split():
            if word in string.punctuation:
                continue
            if word in self.text_vocab.token_to_idx:
                one_hot[self.text_vocab.lookup_token(word)]=1

        return one_hot

    def vectorize_y(self,rating):
        """Return a float32 one-hot vector of length ``len(rating_vocab)``
        with a 1 at the index of ``rating``.
        """
        one_hot=np.zeros(len(self.rating_vocab),dtype=np.float32)
        one_hot[self.rating_vocab.lookup_token(rating)]=1
        return one_hot

    @classmethod
    def from_dataframe(cls,review_df,cutoff=25):
        """Build a vectorizer from a dataframe with ``stars`` and ``text`` columns.

        Args:
            review_df: dataframe providing ``stars`` and ``text``.
            cutoff: a word is kept only if it occurs more than ``cutoff`` times.
        """
        text_vocab=Vocabulary()
        rating_vocab=Vocabulary(add_unk=False)

        for rating in review_df.stars:
            rating_vocab.add_token(rating)

        words=Counter()
        for texts in review_df.text:
            for word in texts.split():
                if word not in string.punctuation:
                    words[word] +=1

        for word,counts in words.items():
            if counts > cutoff:
                text_vocab.add_token(word)

        return cls(text_vocab,rating_vocab)

    @classmethod
    def from_serailizable(cls,contents):
        """Rebuild a vectorizer from the dict produced by ``to_serailizable``.

        The text vocabulary is read from ``"text_vocab"``, the key that
        ``to_serailizable`` writes. ``"review_vocab"`` is accepted as a
        fallback because that is the key this method used to read.
        """
        text_key="text_vocab" if "text_vocab" in contents else "review_vocab"
        text_vocab=Vocabulary.from_serailizable(contents[text_key])
        rating_vocab=Vocabulary.from_serailizable(contents["rating_vocab"])

        return cls(text_vocab=text_vocab,rating_vocab=rating_vocab)

    def to_serailizable(self):
        """Return a dict holding the serialized form of both vocabularies."""
        return {
                "text_vocab": self.text_vocab.to_serailizable(),
                "rating_vocab":self.rating_vocab.to_serailizable()
                }

In [5]:
class Yelp(Dataset):
    """Dataset over a review dataframe; each item is a dict of vectorized
    text (``x_data``) and vectorized rating (``y_target``).
    """

    def __init__(self,target_df,vectorizer,trainable):
        """
        Args:
            target_df: dataframe whose rows provide ``text`` and ``stars``.
            vectorizer: object exposing ``vectorize`` and ``vectorize_y``;
                may be None and attached later with ``set_vectorizer``.
            trainable: flag stored on the instance as given.
        """
        self.target_df=target_df
        self.vectorizer=vectorizer
        self.trainable=trainable

    @classmethod
    def creat_new(cls,path,trainable):
        """Read the CSV at ``path``; fit a new vectorizer on it only when
        ``trainable`` is truthy, otherwise leave the vectorizer as None.
        """
        frame=pd.read_csv(path)
        fitted=Vectorizer.from_dataframe(frame) if trainable else None
        return cls(frame,fitted,trainable)

    def get_vectorizer(self):
        """Return the vectorizer currently attached to this dataset."""
        return self.vectorizer

    def set_vectorizer(self,vectorizer):
        """Attach ``vectorizer`` to this dataset."""
        self.vectorizer=vectorizer

    def get_num_batches(self,batch_size):
        """Number of full batches of ``batch_size`` in the dataset."""
        full_batches,_=divmod(len(self),batch_size)
        return full_batches

    def __len__(self):
        return len(self.target_df)

    def __getitem__(self, index):
        """Vectorize the row at positional ``index``."""
        record=self.target_df.iloc[index]
        features=self.vectorizer.vectorize(record.text)
        label=self.vectorizer.vectorize_y(record.stars)
        return {"x_data": features, "y_target": label}

In [6]:
def generate_batches(dataset,batch_size,shuffle=True,drop_last=True,device="cpu"):
    """Yield batches from ``dataset`` with every tensor moved to ``device``.

    Wraps a ``DataLoader`` built with the given ``batch_size``, ``shuffle``
    and ``drop_last``; each yielded item is a new dict with the same keys as
    the collated batch.
    """
    loader=DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

    for batch in loader:
        yield {key: value.to(device) for key,value in batch.items()}



In [7]:
class ReviewClassifier(nn.Module):
    """Single linear layer mapping a feature vector to per-class scores."""

    def __init__(self,num_features,num_classes=5):
        """
        Args:
            num_features: size of each input feature vector.
            num_classes: number of output scores per sample. Defaults to 5,
                the width that was previously hard-coded.
        """
        super().__init__()
        self.fc1=nn.Linear(in_features=num_features,out_features=num_classes)

    def forward(self,x_in,apply_sigmoid=False):
        """Compute class scores for ``x_in``.

        Args:
            x_in: input tensor whose last dimension is ``num_features``.
            apply_sigmoid: when True, squash the scores with a sigmoid.
                Leave False when the output feeds ``BCEWithLogitsLoss``,
                which applies the sigmoid itself.

        Returns:
            Tensor whose last dimension is ``num_classes``.
        """
        y_out=self.fc1(x_in)
        if apply_sigmoid:
            y_out=F.sigmoid(y_out)

        return y_out

In [8]:
# Experiment configuration, collected in one place.
args = Namespace(
    # Vocabulary
    frequency_cutoff=25,
    # Files and directories
    model_state_file='model.pth',
    review_csv='data/yelp/reviews_with_splits_lite.csv',
    save_dir='model_storage/ch3/yelp/',
    vectorizer_file='vectorizer.json',
    # Training hyper-parameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    # Random seed
    seed=1337,
)

In [9]:
def make_train_state(arge):
    """Return a fresh bookkeeping dict for one training run.

    ``arge`` is accepted but not read. Per-epoch train/val metrics start as
    empty lists; the test metrics start at the placeholder value -1.
    """
    state={}
    state["epoch_index"]=0
    # Per-epoch histories: a new list for every key.
    state["train_loss"]=[]
    state["train_acc"]=[]
    state["val_loss"]=[]
    state["val_acc"]=[]
    # Final test metrics, -1 until filled in.
    state["test_loss"]=-1
    state["test_acc"]=-1
    return state

In [None]:
train_state=make_train_state(args)

args.device=device

# Seed the RNGs before anything random happens so that weight initialisation
# and DataLoader shuffling are reproducible under Restart & Run All.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# NOTE(review): the CSV paths below are hard-coded; args.review_csv and
# args.frequency_cutoff are not consulted by this cell.
# Only the training split fits a vectorizer; the other splits reuse it so all
# three share the same vocabulary.
train_dataset=Yelp.creat_new("train_dataset.csv",trainable=True)
test_dataset=Yelp.creat_new("test_dataset.csv",trainable=False)
valid_dataset=Yelp.creat_new("valid_dataset.csv",trainable=False)

vectorizer=train_dataset.get_vectorizer()
test_dataset.set_vectorizer(vectorizer)
valid_dataset.set_vectorizer(vectorizer)

# The input width of the classifier is the size of the text vocabulary.
classifier= ReviewClassifier(num_features=len(vectorizer.text_vocab))
classifier=classifier.to(args.device)

# BCEWithLogitsLoss applies the sigmoid itself, so the classifier is called
# without apply_sigmoid during training.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)


In [None]:
def compute_accuracy(y_pred, y_target):
    """Return the percentage of samples whose predicted class is correct.

    The predicted class of a sample is the argmax over its scores.

    Args:
        y_pred: tensor of per-class scores, one row per sample.
        y_target: either one-hot rows matching ``y_pred`` or a 1-D tensor of
            class indices.

    Returns:
        Accuracy as a Python float in [0, 100]; 0.0 for an empty batch.
    """
    y_pred = y_pred.detach()
    n_samples = len(y_pred)
    if n_samples == 0:
        # Avoid dividing by zero (and argmax over an empty dimension).
        return 0.0

    predicted = y_pred.argmax(dim=1)
    # One-hot targets are reduced to class indices; index targets are used as is.
    expected = y_target.argmax(dim=1) if y_target.dim() > 1 else y_target.long()

    n_correct = (predicted == expected).sum().item()
    return n_correct / n_samples * 100

In [None]:
# Train for args.num_epochs epochs, running a validation pass after each one.
# Per-epoch mean loss and accuracy are appended to train_state.
for epoch_index in range(args.num_epochs):
    train_state["epoch_index"]=epoch_index

    # ---- training pass ----
    batch_generator=generate_batches(train_dataset,batch_size=args.batch_size,device=args.device)
    running_loss=0.0
    running_acc=0.0
    classifier.train()
    for batch_index,batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        y_pred=classifier(x_in=batch_dict["x_data"].float())

        loss=loss_func(y_pred,batch_dict["y_target"].float())
        loss_batch=loss.item()
        # Incremental mean: after k batches running_* is the mean of all k.
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        loss.backward()
        optimizer.step()
        # float() keeps plain numbers (not tensors) in train_state.
        acc_batch=float(compute_accuracy(y_pred,batch_dict["y_target"]))
        running_acc += (acc_batch-running_acc)/(batch_index+1)
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)

    # ---- validation pass ----
    batch_generator=generate_batches(valid_dataset,batch_size=args.batch_size,device=args.device)
    running_loss=0.0
    running_acc=0.0
    classifier.eval()
    # No parameter updates happen here, so skip building the autograd graph.
    with torch.no_grad():
        for batch_index,batch_dict in enumerate(batch_generator):
            # The keyword must match forward()'s parameter name, x_in.
            y_pred=classifier(x_in=batch_dict["x_data"].float())
            loss=loss_func(y_pred,batch_dict["y_target"].float())
            loss_batch=loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            acc_batch=float(compute_accuracy(y_pred,batch_dict["y_target"]))
            running_acc += (acc_batch-running_acc)/(batch_index+1)

    # Keys must match those created by make_train_state ("val_*").
    train_state["val_loss"].append(running_loss)
    train_state["val_acc"].append(running_acc)

        
        