In [4]:
import collections
import json
import os
import re
import string
from argparse import Namespace
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader

### Create Vocabulary class

In [64]:
class Vocabulary(object):
    """
    Bidirectional mapping between tokens and integer indices.

    It implements the following functions:

    - add_token:
        add one token to both mappings and return its index
    - add_many:
        add a list of tokens and return their indices (list)
    - lookup_token:
        return the index of a token (or the unknown-token index when enabled)
    - look_index / lookup_idx:
        return the token stored at an index
    - to_serializable / from_serializable:
        convert to / rebuild from a plain dictionary
    - __len__:
        return the size of the vocabulary
    """
    def __init__(self,token_to_idx = None,add_unk = True,unk_token = '<unk>'):
        """
        Args:
            token_to_idx (dict): an existing token -> index mapping, or None
                to start with an empty vocabulary
            add_unk (bool): whether to register the unknown token
            unk_token (str): the token used for out-of-vocabulary lookups
        """
        if token_to_idx is None:
            token_to_idx = {}

        # Copy so that later add_token calls never mutate the caller's dict.
        self._token_to_idx = dict(token_to_idx)
        # items() yields (token, idx) pairs; invert them for the reverse map.
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token
        # -1 means "no unknown token registered".
        self._unk_idx = -1

        if self._add_unk:
            self._unk_idx = self.add_token(self._unk_token)

    def add_token(self,token):
        """
        Add a token to the vocabulary if it is not already present.

        Args:
            token: the token that is going to be added
        Output:
            the index of the token
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]

        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def add_many(self,tokens):
        """
        Add a list of tokens to the vocabulary.

        Args:
            tokens (list): the tokens that are going to be added
        Output:
            indices (list): the indices corresponding to tokens, in order
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self,token):
        """
        Retrieve the index of a token.

        When an unknown token was registered, tokens that are not in the
        vocabulary map to its index; otherwise a missing token raises KeyError.

        Output:
            the index of the token
        """
        # The unknown token can legitimately sit at index 0 (it is the first
        # token added to an empty vocabulary), so compare with >= 0.
        if self._unk_idx >= 0:
            return self._token_to_idx.get(token,self._unk_idx)
        return self._token_to_idx[token]

    def look_index(self,index):
        """
        Return the token stored at an index.

        Args:
            index (int): the index to look up
        Output:
            the associated token
        Raises:
            KeyError: if the index is not in the vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the vocabulary" % index)
        return self._idx_to_token[index]

    # The class summary advertises this name; keep both spellings usable.
    lookup_idx = look_index

    def __len__(self):
        return len(self._token_to_idx)

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

In [7]:
class Vectorizer(object):
    """
    Turns review text into fixed-length vectors using two vocabularies:
    one for review tokens and one for labels.
    """
    def __init__(self, review_vocab, label_vocab):
        """
        Args:
            review_vocab (Vocabulary): maps review tokens to integers
            label_vocab (Vocabulary): maps labels to integers
        """
        # Every other method reads `review_vocab`, so store it under that name.
        self.review_vocab = review_vocab
        self.label_vocab = label_vocab

    @classmethod
    def from_dataframe(cls,review_data,cutoff = 20):
        """
        Build both vocabularies from a dataframe with `review` and `label`
        columns.

        Args:
            review_data (pandas.DataFrame): the dataset
            cutoff (int): only tokens seen at least this many times are added
                to the review vocabulary
        Output:
            the instance of Vectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        label_vocab = Vocabulary(add_unk=False)

        # The label vocabulary has no unknown token, so every label must be
        # registered up front. tolist() converts numpy scalars to plain Python
        # values, and sorting keeps the index assignment deterministic.
        for label in sorted(review_data.label.unique().tolist()):
            label_vocab.add_token(label)

        # Count whitespace-separated tokens, skipping anything that is a
        # substring of string.punctuation (this includes the empty string).
        word_counter = Counter(
            word
            for review in review_data.review
            for word in review.split(" ")
            if word not in string.punctuation
        )

        for word,count in word_counter.items():
            if count >= cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab,label_vocab)

    def vectorizeOnehot(self,review):
        """
        Encode a review as a binary bag-of-words vector.

        Args:
            review (str): the text
        Output:
            numpy array with one slot per vocabulary entry; a slot is 1 when
            the corresponding token occurs in the review, 0 otherwise
        """
        one_hot = np.zeros(len(self.review_vocab))

        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_serializable(cls, contents):
        """Instantiate a Vectorizer from a serializable dictionary

        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the Vectorizer class
        """
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        label_vocab =  Vocabulary.from_serializable(contents['label_vocab'])

        return cls(review_vocab=review_vocab, label_vocab=label_vocab)

    def to_serializable(self):
        """Create the serializable dictionary for caching

        Returns:
            contents (dict): the serializable dictionary
        """
        return {'review_vocab': self.review_vocab.to_serializable(),
                'label_vocab': self.label_vocab.to_serializable()}

In [63]:
class ReviewDateset(Dataset):
    """
    Torch Dataset over a review dataframe that carries a `split` column with
    the values "train", "val" and "test". One split is active at a time.

    The class name keeps its original spelling so existing callers still work.
    """
    def __init__(self,review_df,vectorizer):
        """
        Args:
            review_df (pandas.DataFrame): the reviews; must have `review`,
                `label` and `split` columns
            vectorizer: the instance of Vectorizer
        """
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split == 'val']
        self.val_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split == 'test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {
                            "train": (self.train_df,self.train_size),
                            "test": (self.test_df,self.test_size),
                            "val": (self.val_df,self.val_size)
                            }

        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv, cutoff = 20):
        """
        Read the CSV and build a vectorizer from its training rows only.

        Args:
            review_csv (str): location of the dataset
            cutoff (int): minimum token frequency forwarded to
                Vectorizer.from_dataframe (defaults to that method's default)
        Output:
            an instance of the dataset
        """
        review_df = pd.read_csv(review_csv)
        train_review_df = review_df[review_df.split == "train"]

        return cls(review_df, Vectorizer.from_dataframe(train_review_df, cutoff))

    def get_vectorizer(self):
        """ returns the vectorizer used by this dataset """
        return self._vectorizer

    def set_split(self,split = "train"):
        """
        Select the data whose split attribute is "split"

        Args:
            split (str): one of "train", "test", "val"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self,index):
        """
        Args:
            index (int): position of the row inside the active split
        Return:
            a dictionary with the features ("X_data") and the label ("Y_label")
        """
        row = self._target_df.iloc[index]

        review_vec = self._vectorizer.vectorizeOnehot(row.review)

        label_index = self._vectorizer.label_vocab.lookup_token(row.label)

        return {"X_data":review_vec,
                "Y_label": label_index}

    def get_num_batches(self,batch_size):
        """
        Args:
            batch_size (int): the batch size
        Output:
            the number of full batches in the active split
        """
        return len(self) // batch_size

    def save_vectorizer(self, vectorizer_filepath):
        """saves the vectorizer to disk using json

        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)
        
def generate_batches(dataset,batch_size, device = 'cpu',
                     shuffle = True, drop_last = True):
    """
    Generator that wraps a DataLoader and moves every tensor of each batch
    to the requested device.

    Args:
        dataset: the dataset handed to the DataLoader
        batch_size (int): number of items per batch
        device: target device for the batch tensors
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
    Yields:
        a dictionary with the same keys as the DataLoader batch, whose values
        live on `device`
    """
    loader = DataLoader(dataset = dataset,
                        batch_size = batch_size,
                        shuffle = shuffle,
                        drop_last = drop_last)

    for batch in loader:
        yield {key: value.to(device) for key, value in batch.items()}
    

In [9]:
def make_train_state(args):
    """Create the dictionary that tracks training progress.

    :param args: namespace providing `learning_rate` and `model_state_file`
    :returns: a fresh train_state dictionary with empty metric histories
    """
    state = dict(stop_early=False,
                 early_stopping_step=0,
                 early_stopping_best_val=1e8,
                 learning_rate=args.learning_rate,
                 epoch_index=0)

    # Per-epoch histories; each key gets its own list.
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history_key] = []

    state.update(test_loss=-1,
                 test_acc=-1,
                 model_filename=args.model_state_file)
    return state

def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments; `early_stopping_criteria` is the number of
        consecutive non-improving epochs tolerated before stopping
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        the updated train_state (the same dictionary, modified in place)
    """
    val_losses = train_state['val_loss']

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False
        # The checkpoint just written corresponds to this loss, so it is the
        # baseline later epochs have to beat.
        if val_losses:
            train_state['early_stopping_best_val'] = val_losses[-1]

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_t = val_losses[-1]

        if loss_t >= train_state['early_stopping_best_val']:
            # No improvement over the best checkpoint so far.
            train_state['early_stopping_step'] += 1
        else:
            # New best: checkpoint it and remember the loss, otherwise the
            # comparison above would always be made against the initial 1e8.
            torch.save(model.state_dict(), train_state['model_filename'])
            train_state['early_stopping_best_val'] = loss_t
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    """Percentage of predictions that match the targets.

    `y_pred` holds raw scores; a score counts as class 1 when its sigmoid
    is above 0.5, otherwise as class 0.
    """
    targets = y_target.cpu()
    predicted = (torch.sigmoid(y_pred) > 0.5).cpu().long()
    hits = (predicted == targets).sum().item()
    return hits / len(predicted) * 100

### Classifier Model

In [10]:
class ReviewClassifier(nn.Module):
    """ a single linear layer that maps a feature vector to one score """
    def __init__(self, num_features):
        """
        Args:
            num_features (int): the size of the input feature vector
        """
        super().__init__()
        self.fc1 = nn.Linear(num_features, 1)

    def forward(self, x_in, apply_sigmoid=False):
        """The forward pass of the classifier

        Args:
            x_in (torch.Tensor): an input data tensor.
                x_in.shape should be (batch, num_features)
            apply_sigmoid (bool): when True the scores are passed through a
                sigmoid; leave it False when the loss applies the sigmoid
                itself
        Returns:
            the resulting tensor with all size-1 dimensions squeezed out
        """
        scores = self.fc1(x_in).squeeze()
        return torch.sigmoid(scores) if apply_sigmoid else scores

### Set random seed and make path

In [11]:
def set_seed_everywhere(seed, cuda):
    """Seed NumPy and PyTorch, plus every CUDA device when `cuda` is truthy."""
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not cuda:
        return
    torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    """Create `dirpath` (including parents) unless the path already exists."""
    if os.path.exists(dirpath):
        return
    os.makedirs(dirpath)

In [12]:
# Experiment configuration: data locations, training hyper parameters and
# runtime switches, collected in one namespace.
args = Namespace(
    # Data and Path information
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv= 'D:/Study/Jupyter/nlp begineer/task1/processed_data.csv',
    save_dir='model_storage/',
    vectorizer_file='vectorizer.json',
    # No Model hyper parameters
    # Training hyper parameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

# Optionally place the vectorizer and model files inside save_dir.
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

    print("Expanded filepaths: ")
    print(f"\t{args.vectorizer_file}")
    print(f"\t{args.model_state_file}")

# Fall back to the CPU when CUDA was requested but is not available.
args.cuda = args.cuda and torch.cuda.is_available()
print(f"Using CUDA: {args.cuda}")

args.device = torch.device("cuda" if args.cuda else "cpu")

# Seed the random number generators and make sure the output directory exists.
set_seed_everywhere(args.seed, args.cuda)
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/vectorizer.json
	model_storage/model.pth
Using CUDA: False


### The initialization

In [37]:
# Load the processed CSV and show its column names.
tem_data = pd.read_csv(args.review_csv)
print(tem_data.columns)

Index(['Phrase', 'Sentiment', 'split'], dtype='object')


In [62]:
# Build the dataset together with its vectorizer, persist the vectorizer,
# then size the classifier input to the review vocabulary.
dataset = ReviewDateset.load_dataset_and_make_vectorizer(review_csv=args.review_csv)
dataset.save_vectorizer(vectorizer_filepath=args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))

AttributeError: 'ReviewDateset' object has no attribute 'save_vectorizer'

In [None]:
# Move the model to the selected device before the optimizer is created.
classifier = classifier.to(args.device)

# NOTE(review): BCEWithLogitsLoss is a binary loss; the Sentiment values shown
# elsewhere in this notebook range from 0 to 4 - confirm the labels are
# binarised before training.
loss_func = nn.BCEWithLogitsLoss()
# `optim` is never imported in this notebook; reach it through `torch`.
optimizer = torch.optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate when the monitored value stops decreasing.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                       mode='min', factor=0.5,
                                                       patience=1)

train_state = make_train_state(args)

# NOTE: `tqdm_notebook` is not imported anywhere in the visible notebook; it
# has to be imported (it is provided by the tqdm package) before this cell runs.
epoch_bar = tqdm_notebook(desc='training routine',
                          total=args.num_epochs,
                          position=0)

# The bar totals are the number of full batches in each split.
dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size),
                          position=1,
                          leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size),
                        position=1,
                        leave=True)

# Main training routine: one training pass and one validation pass per epoch,
# followed by checkpointing / early stopping and a learning-rate update.
try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            # (the dataset emits its batches under "X_data" / "Y_label")
            y_pred = classifier(x_in=batch_dict['X_data'].float())

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['Y_label'].float())
            loss_t = loss.item()
            # incremental mean over the batches seen so far
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['Y_label'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss,
                                  acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        # No gradients are needed for validation.
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):

                # compute the output
                y_pred = classifier(x_in=batch_dict['X_data'].float())

                # compute the loss
                loss = loss_func(y_pred, batch_dict['Y_label'].float())
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                # compute the accuracy
                acc_t = compute_accuracy(y_pred, batch_dict['Y_label'])
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                val_bar.set_postfix(loss=running_loss,
                                    acc=running_acc,
                                    epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        # Reset the per-epoch bars and advance the epoch bar exactly once.
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        if train_state['stop_early']:
            break
except KeyboardInterrupt:
    print("Exiting loop")

### Read datasets

In [15]:
# The raw-data paths are defined on `data_args` (the "Create split data"
# cell), not on `args`; that cell has to be run before this one.
raw_train = pd.read_csv(data_args.raw_train_csv,sep = '\t')
raw_test = pd.read_csv(data_args.raw_test_csv,sep = '\t')

In [16]:
# Number of rows in the raw train and test files.
print(len(raw_train),len(raw_test))

156060 66292


In [17]:
# Show the raw columns. The last expression is the set of per-class row
# counts: value_counts() counts rows per Sentiment value and set() keeps only
# those counts, not which sentiment each one belongs to.
print(raw_train.columns)
set(raw_train.Sentiment.value_counts())

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')


{7072, 9206, 27273, 32927, 79582}

### Split the subset by sentiments to create new_train and val splits
This step builds the train, validation, and test splits separately for each sentiment class, so that every split keeps the same sentiment distribution.

In [18]:
# Keep only the columns used downstream; the test frame is reduced to its
# Phrase column.
raw_train_data = raw_train[["Sentiment","Phrase"]]
raw_test_data = raw_test[["Phrase"]]
print(raw_train_data.columns)

Index(['Sentiment', 'Phrase'], dtype='object')


In [21]:
# Group the training rows by sentiment: each Sentiment value maps to a list
# of row dictionaries with the keys 'Sentiment' and 'Phrase'.
by_sentiment = collections.defaultdict(list)
for _,row in raw_train_data.iterrows():
    by_sentiment[row.Sentiment].append(row.to_dict())

In [22]:
# Show the first three rows stored under sentiment 0.
print(by_sentiment[0][0:3])

[{'Sentiment': 0, 'Phrase': 'would have a hard time sitting through this one'}, {'Sentiment': 0, 'Phrase': 'have a hard time sitting through this one'}, {'Sentiment': 0, 'Phrase': 'Aggressive self-glorification and a manipulative whitewash'}]


### Create split data
`by_sentiment` has the form `{2: [{'rat': 1, 'bcd': 2}], 1: [{'rat': 3, 'bcd': 4}]}`: each sentiment label maps to a list of row dictionaries.

In [25]:
# Settings for the data-preparation cells: raw input files, split ratios,
# output location and the shuffle seed.
data_args = Namespace(**{
    "raw_train_csv": 'D:/Study/Jupyter/nlp begineer/task1/train.tsv',
    "raw_test_csv": 'D:/Study/Jupyter/nlp begineer/task1/test.tsv',
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    "output_csv": 'D:/Study/Jupyter/nlp begineer/task1/processed_data.csv',
    "seed": 131,
})

In [41]:
# Stratified split: every sentiment class is shuffled and divided on its own,
# so each split keeps the class distribution of the full data.
# The seed and proportions come from `data_args`; `args` defines no split
# proportions.
fin_list = []
np.random.seed(data_args.seed)

for _,item_list in by_sentiment.items():
    # item_list = [{'Sentiment': n, 'Phrase': xxx}, ...]; each dict is one row.
    # Shuffle a copy so that re-running the cell starts from the same order
    # and yields the same split.
    shuffled = list(item_list)
    np.random.shuffle(shuffled)

    n_total = len(shuffled)
    n_train = int(data_args.train_proportion * n_total)
    n_test = int(data_args.test_proportion * n_total)
    # Validation receives every remaining row, so rounding never drops data.

    for position, item in enumerate(shuffled):
        if position < n_train:
            split_name = 'train'
        elif position < n_train + n_test:
            split_name = 'test'
        else:
            split_name = 'val'
        # Build a new dict rather than tagging the rows inside by_sentiment.
        fin_list.append({**item, 'split': split_name})

fin_data = pd.DataFrame(fin_list)

In [43]:
def pre_process(text):
    """Normalise a phrase: lowercase it, put spaces around . , ! ? and
    replace every run of other non-letter characters with a single space."""
    lowered = text.lower()
    # surround the kept punctuation marks with spaces
    spaced = re.compile(r'([.,!?])').sub(r' \1 ', lowered)
    # anything that is not a-z or one of . , ! ? collapses to one space
    return re.compile(r'[^a-z.,!?]+').sub(r' ', spaced)

# Normalise every phrase in place.
fin_data["Phrase"] = fin_data["Phrase"].apply(pre_process)

In [53]:
# Row count per sentiment class after splitting.
fin_data.Sentiment.value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [61]:
# Write the split, pre-processed data to disk without the index column.
fin_data.to_csv(data_args.output_csv,index =False)

In [52]:
# Read the processed CSV back in.
tem_data = pd.read_csv(data_args.output_csv)

In [57]:
# Rename the columns to review / label.
# NOTE(review): the splitting cell writes the split column in lowercase, so
# the 'Split' entry only matters if a capitalised column exists - confirm.
tem_data = tem_data.rename(columns= {'Phrase':'review','Sentiment':'label','Split':'split'})

In [56]:
# Show the current column names.
tem_data.columns

Index(['review', 'label', 'Split'], dtype='object')

In [58]:
# Write the renamed frame back to the same path, overwriting the earlier CSV.
tem_data.to_csv(data_args.output_csv,index = False)