In [1]:
!pip install transformers





In [2]:
!pip install tabulate





In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [4]:
from tqdm import tqdm, tqdm_notebook

In [5]:
# file_path = './drive/MyDrive/movie_reviews.xlsx'
file_path = './movie_reviews.xlsx'
df = pd.read_excel(file_path)
print(df.dtypes)
df.head()

label       int64
url        object
rating      int64
date       object
content    object
dtype: object


Unnamed: 0,label,url,rating,date,content
0,0,/review/rw1142613/,6,4 August 2005,"It's an ""emotional manipulation for dummies"" t..."
1,0,/review/rw1587268/,6,30 January 2007,"Tim Robbins plays Andy Dufresne, a man convict..."
2,0,/review/rw0349436/,6,18 December 2003,It is the most basic of all principles of film...
3,0,/review/rw0349359/,6,30 July 2003,Little more than average drama for popular tas...
4,0,/review/rw0348197/,6,10 September 1998,An UNDER-RATED movie?? Gimme a break! This is ...


In [6]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to C:\Users\Youngchan
[nltk_data]     Kim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
'''1. 전체 문장'''
# text = df.content.values
# labels = df.label.values

'1. 전체 문장'

In [8]:
%%time
'''2. 두문장 씩 묶어서'''
text = []
labels = []
content = df.content.values
temp = df.label.values
for i, sentence in enumerate(content):
    if temp[i] >= 50: 
        break
    try:
        # print(content)
        sentences = sent_tokenize(str(sentence))
        for j in range(0, len(sentences)):
            pair = sentences[j]+sentences[j + 1] if j + 1 < len(sentences) else sentences[j]
            text.append(pair)
            labels.append(temp[i])
    except Exception as e:
        print(e)
        print(content)

Wall time: 3.96 s


In [9]:
len(text)

148367

In [10]:
len(labels)

148367

In [11]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-uncased',
    do_lower_case = True # True/모두 소문자로 변환. False: 대소문자 구분
    )

bert-base-uncased  
bert-base-cased  
bert-base-multilingual-uncased  
bert-base-multilingual-cased  


In [12]:
def print_rand_sentence():
    '''Displays the tokens and respective IDs of a random text sample'''
    index = random.randint(0, len(text)-1)
    print(text[index])
    table = np.array([tokenizer.tokenize(text[index]),
                      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
    print(tabulate(table, headers = ['Tokens', 'Token IDs'], tablefmt = 'fancy_grid'))

print_rand_sentence()

I watched the extended version of Ridley Scott's Gladiator, and despite my low interest on films related to Medieval era, I was simply mesmerized by its sheer magnitude, its energy, and the protagonist's (Russel Crowe) audacity to fight back in the face of failure, all of which could only be topped by the almost perfect casting of Joaquin Phoenix, his eyes, and his incredible performance as the brutal and henpecked emperor of the falling Roman empire.TN.
╒═════════════╤═════════════╕
│ Tokens      │   Token IDs │
╞═════════════╪═════════════╡
│ i           │         151 │
├─────────────┼─────────────┤
│ watched     │       84447 │
├─────────────┼─────────────┤
│ the         │       10103 │
├─────────────┼─────────────┤
│ extended    │       19164 │
├─────────────┼─────────────┤
│ version     │       10947 │
├─────────────┼─────────────┤
│ of          │       10108 │
├─────────────┼─────────────┤
│ ridley      │       56140 │
├─────────────┼─────────────┤
│ scott       │       13064 │
├

In [13]:
def print_sentence(index):
    print(text[index])
    table = np.array([tokenizer.tokenize(text[index]),
                      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
    print(tabulate(table, headers = ['Tokens', 'Token IDs'], tablefmt = 'fancy_grid'))

print_sentence(0)

It's an "emotional manipulation for dummies" type of film, with boring direction and cardboard characters.Nice performances by the actors though, some great moments, but nothing outside the "Hallmark meets Stephen King meets Prison" feel of the entire movie.I saw it one and a half times, and would see it again only if I got paid for the trouble.Similarly, I hated 'The Green Mile", only later did I find out it's the work of Darabont.My recommendation: Skip this, and go see OZ again.
╒══════════════╤═════════════╕
│ Tokens       │   Token IDs │
╞══════════════╪═════════════╡
│ it           │       10197 │
├──────────────┼─────────────┤
│ '            │         112 │
├──────────────┼─────────────┤
│ s            │         161 │
├──────────────┼─────────────┤
│ an           │       10144 │
├──────────────┼─────────────┤
│ "            │         107 │
├──────────────┼─────────────┤
│ emotional    │       48740 │
├──────────────┼─────────────┤
│ mani         │       34016 │
├──────────────┼─

In [14]:
%%time
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 128,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) 
    attention_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Wall time: 2min 22s


In [15]:
token_id[1]

tensor([  101, 24242, 22187, 10151, 10103, 26826, 14325,   117, 10970, 11838,
        36723,   117, 10502, 20587, 16751, 10103,   107, 11672, 22824, 31587,
        14235, 11479, 31587, 18357,   107, 23333, 10108, 10103, 19401, 13113,
          119,   151, 16289, 10197, 10399, 10110,   143, 13460, 11471,   117,
        10110, 11008, 11811, 10197, 12590, 10902, 11526,   151, 15517, 25033,
        10139, 10103, 29868,   119, 45535,   117,   151, 39487, 10163,   112,
        10103, 12535, 17623,   107,   117, 10902, 10844, 12266,   151, 16595,
        10871, 10197,   112,   161, 10103, 11497, 10108, 79729, 20923,   119,
        11153, 44909, 55667, 77393,   131, 38825, 10372,   117, 10110, 11335,
        11811, 17704, 12590,   119,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [16]:
%%time
def print_rand_sentence_encoding():
  '''Displays tokens, token IDs and attention mask of a random text sample'''
  index = random.randint(0, len(text) - 1)
  tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
  token_ids = [i.numpy() for i in token_id[index]]
  attention = [i.numpy() for i in attention_masks[index]]

  table = np.array([tokens, token_ids, attention]).T
  print(tabulate(table, headers = ['Tokens', 'Token IDs', 'Attention Mask'], tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒═══════════════╤═════════════╤══════════════════╕
│ Tokens        │   Token IDs │   Attention Mask │
╞═══════════════╪═════════════╪══════════════════╡
│ [CLS]         │         101 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ martial       │       37363 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ arts          │       12180 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ skills        │       30504 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ ,             │         117 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ cun           │       35962 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ ##ning        │       11459 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ ,             │         117 │                1 │
├───────────────┼─────────────┼──────────────────┤
│ great         │       11838 │

In [17]:
val_ratio = 0.25
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [18]:
def b_tp(preds, labels):
    '''Returns True Positives (TP): count of correct predictions of actual class 1'''
    return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
    '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
    return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
    '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
    return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
    '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
    return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
    '''
    Returns the following metrics:
        - accuracy    = (TP + TN) / N
        - precision   = TP / (TP + FP)
        - recall      = TP / (TP + FN)
        - specificity = TN / (TN + FP)
    '''
    preds = np.argmax(preds, axis = 1).flatten()
    labels = labels.flatten()
    tp = b_tp(preds, labels)
    tn = b_tn(preds, labels)
    fp = b_fp(preds, labels)
    fn = b_fn(preds, labels)
    b_accuracy = (tp + tn) / len(labels)
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
    return b_accuracy, b_precision, b_recall, b_specificity

In [19]:
%%time
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased',
    num_labels = 50,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Wall time: 1.9 s


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [21]:
%%time
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 3

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(tqdm_notebook(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/6955 [00:00<?, ?it/s]


Epoch:  33%|███████████████████████████████████████████████████████████████████▎                                                                                                                                      | 1/3 [56:38<1:53:16, 3398.05s/it]


	 - Train loss: 1.6473
	 - Validation Accuracy: 0.0146
	 - Validation Precision: 0.4818
	 - Validation Recall: 0.9161
	 - Validation Specificity: 0.1950



  0%|          | 0/6955 [00:00<?, ?it/s]


Epoch:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 2/3 [1:53:04<56:31, 3391.41s/it]


	 - Train loss: 1.1347
	 - Validation Accuracy: 0.0163
	 - Validation Precision: 0.5223
	 - Validation Recall: 0.7349
	 - Validation Specificity: 0.2836



  0%|          | 0/6955 [00:00<?, ?it/s]

Epoch: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [2:49:30<00:00, 3390.25s/it]


	 - Train loss: 0.8879
	 - Validation Accuracy: 0.0166
	 - Validation Precision: 0.6824
	 - Validation Recall: 0.8962
	 - Validation Specificity: 0.4529

Wall time: 2h 49min 30s





In [22]:
def predict(sentence):
    # We need Token IDs and Attention Mask for inference on the new sentence
    test_ids = []
    test_attention_mask = []

    # Apply the tokenizer
    encoding = preprocessing(new_sentence, tokenizer)

    # Extract IDs and Attention Mask
    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim = 0)
    test_attention_mask = torch.cat(test_attention_mask, dim = 0)

    # Forward pass, calculate logit predictions
    with torch.no_grad():
      output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

    prediction = np.argmax(output.logits.cpu().numpy()).flatten().item()

    print('Input Sentence: ', new_sentence)
    print('Predicted Class: ', prediction)


new_sentence = 'a fantasy movie'
predict(new_sentence)

Input Sentence:  a fantasy movie
Predicted Class:  8




In [23]:
PATH = './' 
torch.save(model, PATH + 'BERT_movie50_1.pt')  # 전체 모델 저장
torch.save(model.state_dict(), PATH + 'model_state_dict50_1.pt')  # 모델 객체의 state_dict 저장