In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so the notebook can
# reach the files stored there. force_remount=True asks Colab to mount again
# even when the drive is already mounted, which keeps this cell re-runnable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Enter the location of the project folder, relative to the root of MyDrive.
# A raw string is used because "\ " is not a valid escape sequence in a normal
# string literal (newer Python versions warn about it). The resulting value is
# unchanged: the backslash stays in the string, where it appears to be meant as
# a shell-style escape for the space when the name is interpolated into %cd.
FOLDERNAME = r'Colab\ Notebooks/Amazon Fine Food'
assert FOLDERNAME is not None, "[!] Enter the foldername."

In [None]:
# Now that Drive is mounted, make Python files stored in the project folder
# importable from this notebook.
#
# FOLDERNAME contains a shell-style "\ " escape for the space in the folder
# name. sys.path entries are literal directory names, so the backslash has to
# be removed here; otherwise the appended entry points at a directory that
# does not exist and imports from the project folder silently fail.
import sys

_project_dir = '/content/drive/MyDrive/{}'.format(FOLDERNAME.replace('\\ ', ' '))
if _project_dir not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(_project_dir)

In [None]:
# Get to the folder we are at
%cd drive/MyDrive/$FOLDERNAME/

/content/drive/MyDrive/Colab Notebooks/Amazon Fine Food


In [None]:
!pip install transformers
!pip install transformers[sentencepiece]

In [None]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('using device:', device)

using device: cuda


In [None]:
# Ask PyTorch to release cached CUDA memory, when this torch build offers
# the empty_cache helper.
if callable(getattr(torch.cuda, 'empty_cache', None)):
    torch.cuda.empty_cache()

In [None]:
import numpy as np
import pandas as pd
# Load the full review table; the path is relative to the current working
# directory, so the notebook must already be inside the project folder.
df = pd.read_csv('./Amazon_Fine_Food_Reviews.csv')

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [None]:
# Summarize column dtypes and non-null counts of the full review table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [None]:
# Load the reduced review table (the file later passed to the dataset class as
# the training corpus) and summarize its columns.
reviews_data = pd.read_csv('./reviews_data.csv')
reviews_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Id         568454 non-null  int64 
 1   ProductId  568454 non-null  object
 2   Score      568454 non-null  int64 
 3   Text       568454 non-null  object
dtypes: int64(2), object(2)
memory usage: 17.3+ MB


In [None]:
# Find the longest review in train.csv, measured in whitespace-separated
# tokens. This is a plain word count, not the number of sub-word tokens a
# tokenizer would produce.
#
# The frame is named train_df (not `train`) so it cannot be confused with, or
# overwrite, the train() function this notebook defines further down.
train_df = pd.read_csv('./train.csv')
seq_length = max((len(text.split()) for text in train_df.Text), default=0)

print(seq_length)

3432


In [None]:
# 01
import argparse

def parse_args(argv=None):
    """Build the configuration namespace for training the review classifier.

    Args:
        argv: Optional list of command-line style strings to parse, e.g.
            ``['data.csv', 'model.pt', '--epochs', '3']``. When ``None``
            (the default) the notebook defaults
            ``['reviews_data.csv', 'output.pt']`` are parsed, so existing
            zero-argument calls behave exactly as before.

    Returns:
        argparse.Namespace with the attributes ``corpus``, ``output_model``,
        ``seq_length``, ``hidden_dim``, ``batch_size``, ``lr``, ``dropout``,
        ``epochs`` and ``log_interval``.
    """
    parser = argparse.ArgumentParser(description='Using BERT for Amazon Fine Food analysis')
    parser.add_argument('corpus', type=str,
                        help='training corpus file')
    parser.add_argument('output_model', type=str,
                        help='output model file')
    parser.add_argument('--seq-length', type=int, default=512,
                        help='input sequence length (default: 512)')
    parser.add_argument('--hidden-dim', type=int, default=256,
                        help='hidden dimension (default: 256)')
    parser.add_argument('--batch-size', type=int, default=10,
                        help='training batch size (default: 10)')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate (default: 0.2)')
    parser.add_argument('--epochs', type=int, default=1,
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10,
                        help='number of batches to wait before logging status (default: 10)')

    if argv is None:
        # An explicit list is always passed to argparse: inside a notebook
        # sys.argv holds the kernel's own arguments, not ours.
        argv = ['reviews_data.csv', 'output.pt']
    return parser.parse_args(args=argv)

In [None]:
# 02
import csv
import torch
from torch.utils.data import Dataset
from transformers import AlbertTokenizer

class ReviewsDataset(Dataset):
    """In-memory dataset of tokenized reviews, one sample per distinct product.

    Every item is a 4-tuple ``(input_ids, attention_mask, token_type_ids,
    score)``. The first three entries are the tokenizer outputs with the
    leading batch dimension squeezed away; ``score`` is the integer read from
    the CSV, stored unchanged.
    """

    def __init__(self, path, seq_length=512):
        """Load the tokenizer and eagerly tokenize the whole corpus.

        Args:
            path: CSV file with a header row followed by rows of exactly four
                fields: id, product id, score, text.
            seq_length: Length every review is truncated or padded to.
        """
        # `super(ReviewsDataset).__init__()` created an unbound super object
        # and never reached the parent initializer; use the bound form.
        super().__init__()
        self.path = path
        self.seq_length = seq_length
        self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')

        self.parse_corpus()

    def __len__(self):
        """Number of samples kept by parse_corpus()."""
        return self.n

    def __getitem__(self, i):
        """Return ``(input_ids, attention_mask, token_type_ids, score)`` for sample ``i``."""
        return (self.input_ids[i], self.attention_mask[i], self.token_type_ids[i], self.y[i])

    def parse_corpus(self):
        '''
        Read the CSV at ``self.path`` and tokenize one review per product.

        Only the first row seen for each product id is kept; later rows for
        the same product are skipped. Results are stored on the instance as
        the parallel lists ``input_ids``, ``attention_mask``,
        ``token_type_ids`` and ``y``, with ``n`` holding their length.

        Raises:
            ValueError: if a non-empty row does not have exactly four fields
                or its score is not an integer literal.
        '''

        input_ids, attention_mask, token_type_ids, y = [], [], [], []

        seen_products = set()
        # newline='' is what the csv module requires so that line breaks
        # embedded in quoted review text are parsed correctly; the explicit
        # encoding avoids depending on the platform default.
        with open(self.path, 'r', newline='', encoding='utf-8') as f:
            rd = csv.reader(f, delimiter=',')
            next(rd)  # ignore header
            for row in rd:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to parse.
                    continue
                _review_id, product_id, score, text = row

                # Keep only the first review of every product.
                if product_id in seen_products:
                    continue
                seen_products.add(product_id)

                # Tokenize one review, truncating or padding to seq_length.
                # return_tensors='pt' requests PyTorch tensors.
                x = self.tokenizer(text, return_tensors='pt', max_length=self.seq_length,
                                   truncation=True, padding='max_length')

                input_ids.append(x.input_ids.squeeze(0))
                attention_mask.append(x.attention_mask.squeeze(0))
                token_type_ids.append(x.token_type_ids.squeeze(0))
                # NOTE(review): the score is stored as-is. A cross-entropy
                # loss needs targets in [0, n_class - 1], so the consumer
                # must size its output layer for the largest score - confirm.
                y.append(int(score))

        self.input_ids, self.attention_mask, self.token_type_ids, self.y = input_ids, attention_mask, token_type_ids, y
        self.n = len(y)

In [None]:
# 03
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AlbertModel

class BertFoodReviews(nn.Module):
    """Pretrained ALBERT encoder followed by one linear classification layer."""

    def __init__(self, n_class, hidden_dim=256, dropout=0.2):
        """Create the encoder and the output layer.

        Args:
            n_class: Number of logits produced per input.
            hidden_dim: Accepted for interface compatibility; not used by the
                current architecture.
            dropout: Accepted for interface compatibility; not used by the
                current architecture.
        """
        super().__init__()

        encoder = AlbertModel.from_pretrained('albert-base-v1')
        self.enc = encoder
        # The classifier reads the encoder's pooled output directly, so its
        # input width is the encoder's hidden size.
        self.out = nn.Linear(encoder.config.hidden_size, n_class)

    def freeze_bert(self, freeze):
        """Enable (``freeze=False``) or disable (``freeze=True``) gradients for every encoder parameter."""
        trainable = not freeze
        for weight in self.enc.parameters():
            weight.requires_grad = trainable

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the inputs and return the logits computed from the pooled output."""
        encoded = self.enc(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.out(encoded.pooler_output)

In [None]:
# 04
def train(model, optimizer, data, args):
    """Train ``model`` for ``args.epochs`` epochs and report progress.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)`` and returning one row of class logits per sample.
        optimizer: Optimizer already bound to the model's parameters.
        data: DataLoader yielding ``(input_ids, attention_mask,
            token_type_ids, target)`` batches; ``data.dataset`` must support
            ``len()``.
        args: Namespace providing ``epochs`` and ``log_interval``.

    Returns:
        ``(losses, accs)``: two lists with one entry per epoch - the mean
        batch loss and the fraction of correctly classified samples.
        (Earlier versions computed these but returned nothing.)
    """
    model.train()
    # To skip fine-tuning of the encoder, call model.freeze_bert(True) before
    # invoking this function.

    # Move batches to wherever the model lives instead of relying on a
    # notebook-global `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    n_batch = len(data)          # number of batches per epoch
    n_data = len(data.dataset)   # number of samples per epoch
    losses = []
    accs = []
    for epoch in range(args.epochs):
        total_loss = 0
        total_correct = 0
        for batch_i, (input_ids, attention_mask, token_type_ids, target) in enumerate(data):
            input_ids = input_ids.to(run_device)
            attention_mask = attention_mask.to(run_device)
            token_type_ids = token_type_ids.to(run_device)
            target = target.to(run_device)

            # One optimization step.
            optimizer.zero_grad()  # clear gradients left over from the previous step
            output = model(input_ids, attention_mask, token_type_ids)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()

            # Bookkeeping for the running statistics.
            total_loss += loss.item()
            pred = output.argmax(dim=1)
            correct = (pred == target).sum().item()
            total_correct += correct

            if batch_i % args.log_interval == 0:
                print('Train epoch: {} ({:2.0f}%)\tLoss: {:.6f}\tAccuracy: {}/{} ({:.0f}%)'.format(
                    epoch, 100. * batch_i / n_batch, loss.item(),
                    correct, len(target), 100. * correct / len(target)))
        losses.append(total_loss / n_batch)
        accs.append(total_correct / n_data)

    return losses, accs

In [None]:
# 05
def test(model, data, args):
    model.eval()

    n_batch = len(data)
    n_data = len(data.dataset)
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, target in data:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            target = target.to(device)

            output = model(input_ids, attention_mask, token_type_ids)
            loss = F.cross_entropy(output, target)
            total_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item() # how many predictions in this batch are correct

    avg_loss = total_loss / n_batch
    print('Test Loss: {:.6f}'.format(avg_loss))
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_loss, correct, n_data,
        100. * correct / n_data))

In [None]:
# 06
import torch, gc
import transformers
from torch.utils.data import Dataset
from transformers import AlbertTokenizer
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split

def main():
    """Run the whole pipeline: load data, split, train, save, evaluate.

    Reads the configuration from ``parse_args()``, builds a ``ReviewsDataset``
    from ``args.corpus``, splits it 80/20 into training and validation sets,
    trains a ``BertFoodReviews`` model, writes its ``state_dict`` to
    ``args.output_model`` and finally evaluates on the validation set.
    """
    args = parse_args()

    # Prepare data & split. The split uses its own seeded generator so the
    # same samples land in the validation set on every run.
    split_seed = 0
    dataset = ReviewsDataset(args.corpus, seq_length=args.seq_length)
    train_set_size = int(len(dataset) * 0.8)
    train_set, val_set = random_split(
        dataset,
        [train_set_size, len(dataset) - train_set_size],
        generator=torch.Generator().manual_seed(split_seed),
    )
    train_dataloader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True)
    val_dataloader = DataLoader(val_set, batch_size=args.batch_size)
    print('Training set size: {}'.format(len(train_set)))
    print('Val set size: {}'.format(len(val_set)))
    print('----------------------------')

    # Create model & optimizer.
    # The dataset stores the raw integer scores as labels, and cross-entropy
    # requires every label to be < n_class. With a fixed 5-way output layer a
    # label of 5 is out of range, so the layer is sized from the largest label
    # actually present (never smaller than the nominal 5 classes).
    OUTPUT = 5
    n_class = max(OUTPUT, max(dataset.y, default=-1) + 1)
    model = BertFoodReviews(n_class=n_class, hidden_dim=args.hidden_dim, dropout=args.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Train
    train(model, optimizer, train_dataloader, args)

    # Save model
    torch.save(model.state_dict(), args.output_model)

    # Test
    test(model, val_dataloader, args)


if __name__ == '__main__':
    main()