In [1]:
import torch
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup

import pandas as pd
import numpy as np
import random
import time
import datetime
import os

from utils import (
    tokenizer_setting,
    preprocessing,
    GPU_setting,
    hyperparmeter_setting,
    flat_accuracy,
    format_time,
    initial_setting,
    run_train,
    run_test,
    convert_input_data,
    test_sentences
)

### 1. Initial Setting
- Load the data
- Split train, test data
- Load tokenizer

In [2]:
path = './../../data/rotten_tomato/sentiment_analysis_data/'
os.listdir(path)

['predictions.csv',
 'sampleSubmission.csv',
 'test.tsv',
 'test.tsv.zip',
 'train.tsv',
 'train.tsv.zip']

In [3]:
train_df = pd.read_csv(path + 'train.tsv', sep="\t")
test_df = pd.read_csv(path + 'test.tsv', sep="\t")

In [4]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [7]:
# Training features and labels are taken directly from the training TSV.
X_train = train_df['Phrase']
y_train = train_df['Sentiment']

# test.tsv has no 'Sentiment' column (see the test_df.head() output above),
# so the test-side labels are read from a separate predictions.csv file.
X_test  = test_df['Phrase']
# NOTE(review): unlike y_train (a single column), y_test is the entire
# DataFrame returned by read_csv — confirm that preprocessing() accepts this
# and that predictions.csv really holds ground-truth labels.
y_test = pd.read_csv(path + 'predictions.csv')

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(feature_df, 
#                                                     target_df, 
#                                                     test_size=0.3, 
#                                                     random_state=42)

In [9]:
tokenizer = tokenizer_setting()

---
### 2. Pre-processing

In [11]:
# train_dataloader, validation_dataloader = preprocessing(X_train.review_content, y_train, tokenizer, process_type='train')
train_dataloader, validation_dataloader = preprocessing(X_train, y_train, tokenizer, process_type='train')

In [12]:
# test_dataloader = preprocessing(X_test.review_content, y_test, tokenizer, process_type='test')
test_dataloader = preprocessing(X_test, y_test, tokenizer, process_type='test')

---
### 3. Load the pre-trained model

In [13]:
# Time how long it takes to load the pre-trained checkpoint and move it to the GPU.
start_time = time.time()
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels = 5) # change num_labels to match the number of label classes
# NOTE(review): .cuda() needs a CUDA device; `device` from GPU_setting() is
# only created in a later cell, so this cell does not use it.
model.cuda()
print("  Loading took: {:}".format(format_time(time.time() - start_time)))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

  Loading took: 0:51:40


In [14]:
device = GPU_setting()

There are 1 GPU(s) available.
Device 0 : NVIDIA GeForce RTX 3080


In [15]:
optimizer, epochs, total_steps, scheduler = hyperparmeter_setting(model, train_dataloader, lr=2e-5, eps=1e-8, epochs=3)

In [16]:
model = initial_setting(model, seed_val=42)

---
### 4. Fine-tuning

In [23]:
def save_checkpoint(state, path, file_name='checkpoint.pth.tar'):
    """Serialize a training checkpoint to disk with ``torch.save``.

    Args:
        state: Object to serialize (any object ``torch.save`` accepts, e.g. a
            dict of epoch index and state dicts).
        path: Directory in which the checkpoint file is written.
        file_name: Name of the checkpoint file inside ``path``.
    """
    # The previous body referenced an undefined name `filename`, which raised
    # NameError on the first call; the parameter is `file_name`.
    # os.path.join gives the same result as plain concatenation when `path`
    # already ends with a separator, and a valid path when it does not.
    file_path = os.path.join(path, file_name)
    print(f"file_path: {file_path}")
    torch.save(state, file_path)

In [24]:
def run_train(model, epochs, train_dataloader, validation_dataloader, optimizer, scheduler, device, path):
    """Fine-tune ``model`` for ``epochs`` epochs, validating after each one.

    Note: this definition shadows the ``run_train`` imported from ``utils``
    at the top of the notebook.

    Each epoch: (1) runs one pass over ``train_dataloader`` with gradient
    clipping, an optimizer step and a scheduler step per batch, (2) saves a
    checkpoint via ``save_checkpoint``, and (3) reports the mean per-batch
    accuracy over ``validation_dataloader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            token_type_ids=None, labels=...)``; its output must expose
            ``.loss`` (when labels are given) and ``.logits``.
        epochs: Number of passes over the training data.
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches used for training.
        validation_dataloader: Iterable of batches in the same layout, used
            for validation.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device every batch tensor is moved to.
        path: Location passed to ``save_checkpoint`` after every epoch.

    Returns:
        The same ``model`` object after training.
    """
    first_start_time = time.time()

    # Repeat for the requested number of epochs.
    for epoch in range(epochs):

        # ========================================
        #               1. Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')

        # Start the per-epoch timer.
        start_time = time.time()

        # Reset the accumulated loss.
        total_loss = 0

        # Switch to training mode.
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Progress report (printed every 500 steps, skipping step 0).
            if step % 500 == 0 and not step == 0:
                elapsed = format_time(time.time() - start_time)
                print('Batch {:>5,}  of  {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # Move the batch to the target device.
            batch = tuple(b.to(device) for b in batch)

            # Unpack the batch (stored in input, mask, label order).
            b_input_ids, b_input_mask, b_labels = batch

            # Forward pass; passing labels makes the model return a loss.
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask,
                            token_type_ids=None,
                            labels=b_labels)

            # Extract the loss.
            loss = outputs.loss # outputs[0]

            # Accumulate the total loss as a Python float.
            total_loss += loss.item()

            # Backward pass to compute gradients (back-propagation).
            loss.backward()

            # Gradient clipping (the example code this is based on used 1.0).
            torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)

            # Update the weights from the gradients.
            optimizer.step()

            # Advance the learning-rate schedule.
            scheduler.step()

            # Reset gradients, since they are recomputed on every iteration.
            model.zero_grad()

        # Average training loss for the epoch (total loss / number of batches).
        avg_train_loss = total_loss / len(train_dataloader)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

        # Save a checkpoint at the end of the epoch.
        print("  Model Checkpoint Save")
        save_checkpoint({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer' : optimizer.state_dict()
            }, path)

        # ========================================
        #               2. Validation
        # ========================================

        # Run validation once the epoch's training pass is done.
        print("")
        print("Running Validation...")

        # Restart the timer for the validation pass.
        start_time = time.time()

        # Switch to evaluation mode.
        model.eval()

        # Running sum of per-batch accuracies.
        total_valid_accuracy = 0

        # Iterate over the validation batches.
        for batch in validation_dataloader:

            # Move the batch to the target device.
            batch = tuple(b.to(device) for b in batch)

            # Unpack the batch.
            b_input_ids, b_input_mask, b_labels = batch

            # No gradient tracking during validation.
            with torch.no_grad():
                # Forward pass (no labels, so only logits are needed).
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

            # Logits are the classification scores before softmax.
            logits = outputs.logits

            # Move results to the CPU as NumPy arrays.
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Compare the logits with the labels to get the batch accuracy.
            valid_accuracy = flat_accuracy(logits, label_ids)
            total_valid_accuracy += valid_accuracy

        print("  Accuracy: {0:.2f}".format(total_valid_accuracy/len(validation_dataloader)))
        print("  Validation took: {:}".format(format_time(time.time() - start_time)))

    print("")
    print("Total took: {:}".format(format_time(time.time() - first_start_time)))
    print("Training complete!")

    return model

In [None]:
model = run_train(model, epochs, train_dataloader, validation_dataloader, optimizer, scheduler, device, path)


Training...


### 5. Save the model

In [None]:
torch.save(model.state_dict(), "./../../data/rotten_tomato/sentiment_analysis_data/model_new.pth")

In [None]:
model.load_state_dict(torch.load("./../../data/rotten_tomato/sentiment_analysis_data/model_new.pth"))
model.eval()

In [None]:
import os

# Saving best practice: if the default file names are used, the model can be
# reloaded later with from_pretrained().

# `path` already ends with '/', so `path + '/model_save/'` produced a doubled
# separator. os.path.join normalizes that; the trailing '' keeps the trailing
# separator so any later string concatenation on output_dir still works.
output_dir = os.path.join(path, 'model_save', '')

# Create the output directory if needed (exist_ok avoids the check-then-create race).
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save the trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`.
model_to_save = model.module if hasattr(model, 'module') else model  # Unwrap a distributed/parallel wrapper if present
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

- Load the model

In [None]:
# Load a trained model and vocabulary that you have fine-tuned
model_new = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
model_new.to(device)

---
### 6. Testing

In [None]:
run_test(model, test_dataloader, device)

In [None]:
sentence = "Holofcener always gives us more to chew on than originally meets the eye in her films and she writes female characters with an intelligence, sensitivity and realism that blows the Sex and the City foursome away any day of the week."
logits = test_sentences(model, device, tokenizer, [sentence])

print(logits)
print(np.argmax(logits))

---
### 7. Using only the pre-trained layers (no fine-tuning)

In [None]:
# model_no_finetuning = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
# model_no_finetuning.cuda()

In [None]:
# #시작 시간 설정
# start_time = time.time()

# # 평가모드로 변경
# model_rotten2.eval()

# # 변수 초기화
# eval_loss, eval_accuracy = 0, 0
# nb_eval_steps, nb_eval_examples = 0, 0

# # 데이터로더에서 배치만큼 반복하여 가져옴
# for step, batch in enumerate(test_dataloader):
#     # 경과 정보 표시
#     if step % 100 == 0 and not step == 0:
#         elapsed = format_time(time.time() - start_time)
#         print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

#     # 배치를 GPU에 넣음
#     batch = tuple(b.to(device) for b in batch)
    
#     # 배치에서 데이터 추출
#     b_input_ids, b_input_mask, b_labels = batch
    
#     # 그래디언트 계산 안함
#     with torch.no_grad():     
#         # Forward 수행
#         outputs = model_rotten2(b_input_ids, 
#                         token_type_ids=None, 
#                         attention_mask=b_input_mask)
    
#     # 로스 구함
#     logits = outputs[0]

#     # CPU로 데이터 이동
#     logits = logits.detach().cpu().numpy()
#     label_ids = b_labels.to('cpu').numpy()
    
#     # 출력 로짓과 라벨을 비교하여 정확도 계산
#     tmp_eval_accuracy = flat_accuracy(logits, label_ids)
#     eval_accuracy += tmp_eval_accuracy

# print("")
# print("Accuracy: {0:.2f}".format(eval_accuracy/len(test_dataloader)))
# print("Test took: {:}".format(format_time(time.time() - start_time)))

---
### 8. Data filtering (new version)

In [None]:
merged_df = pd.read_csv(path+'rotten_rating_review_table.csv')

In [None]:
rating_df = merged_df[['user_id', 'movie_id', 'review_score', 'review_content', 'review_date']]

In [None]:
rating_df.head()

---
### 8. Data filtering (previous version)

In [33]:
df = pd.read_csv(path+'rotten_review_scaled_label.csv')

In [34]:
df.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,label
0,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,1.0,2010-02-09,Whether audiences will get behind The Lightnin...,1
1,m/0814255,Nick Schager,False,Slant Magazine,Rotten,0.25,2010-02-10,Harry Potter knockoffs don't come more transpa...,0
2,m/0814255,Bill Goodykoontz,True,Arizona Republic,Fresh,1.0,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...",1
3,m/0814255,Jordan Hoffman,False,UGO,Fresh,0.7,2010-02-10,"Fun, brisk and imaginative",1
4,m/0814255,Mark Adams,False,Daily Mirror (UK),Fresh,0.8,2010-02-10,"This action-packed fantasy adventure, based on...",1


In [38]:
df_review = df[['rotten_tomatoes_link','review_content', 'label']]

In [42]:
df_review.head()

Unnamed: 0,rotten_tomatoes_link,review_content,label
0,m/0814255,Whether audiences will get behind The Lightnin...,1
1,m/0814255,Harry Potter knockoffs don't come more transpa...,0
2,m/0814255,"Percy Jackson isn't a great movie, but it's a ...",1
3,m/0814255,"Fun, brisk and imaginative",1
4,m/0814255,"This action-packed fantasy adventure, based on...",1


In [126]:
pivot_review = df_review.pivot_table(index='rotten_tomatoes_link', columns='label', aggfunc=len, fill_value=0)

In [137]:
pivot_review

Unnamed: 0_level_0,review_content,review_content
label,0,1
rotten_tomatoes_link,Unnamed: 1_level_2,Unnamed: 2_level_2
m/+_one_2019,0,33
m/+h,2,2
m/-_man,1,3
m/-cule_valley_of_the_lost_ants,0,5
m/0814255,12,34
...,...,...
m/zoom_2006,22,3
m/zootopia,2,166
m/zorba_the_greek,0,3
m/zulu,0,4


In [145]:
pivot_review.loc['m/0814255']

                label
review_content  0        12
                1        34
Name: m/0814255, dtype: int64

In [144]:
pivot_review.loc['m/10000_bc']

                label
review_content  0        48
                1        12
Name: m/10000_bc, dtype: int64

- Movie1: 'm/0814255'

In [102]:
def movie_sentiment_filter(df, movie_link, file_name, out_dir=None):
    """Write one movie's positive and negative reviews to two text files.

    Rows of ``df`` whose ``rotten_tomatoes_link`` equals ``movie_link`` are
    split by ``label`` (1 = positive, 0 = negative). The ``review_content``
    values of each group are joined with single spaces and written to
    ``{file_name}_pos.txt`` and ``{file_name}_neg.txt`` respectively.

    Args:
        df: DataFrame with ``rotten_tomatoes_link``, ``review_content`` and
            ``label`` columns.
        movie_link: Value of ``rotten_tomatoes_link`` selecting the movie.
        file_name: Prefix of the two output files.
        out_dir: Directory for the output files. When ``None`` (the default)
            the notebook-level ``path`` variable is used, as before.
    """
    target_dir = path if out_dir is None else out_dir
    movie_reviews = df[df['rotten_tomatoes_link'] == movie_link]

    for label_value, suffix in ((1, 'pos'), (0, 'neg')):
        reviews = movie_reviews.loc[movie_reviews['label'] == label_value, 'review_content']
        # Missing reviews are floats (NaN) and would make str.join raise
        # TypeError, so drop them and coerce the rest to str.
        joined = " ".join(reviews.dropna().astype(str))

        # The context manager closes the file even if the write fails, and
        # the explicit encoding keeps non-ASCII review text from depending on
        # the platform's default encoding.
        out_path = os.path.join(target_dir, f"{file_name}_{suffix}.txt")
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write(joined)

    print(f"{file_name}_pos/neg save finish!!")

In [96]:
movie_sentiment_filter(df=df_review, movie_link='m/0814255', file_name='movie1')

movie1_pos/neg save finish!!


In [104]:
movie_sentiment_filter(df=df_review, movie_link='m/0878835', file_name='movie2')

movie2_pos/neg save finish!!


In [105]:
movie_sentiment_filter(df=df_review, movie_link='m/10000_bc', file_name='movie3')

movie3_pos/neg save finish!!
