In [1]:
import re
import random
import time
import datetime

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt

from scipy import stats
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split

In [2]:
import os
from tqdm import tqdm
tqdm.pandas()

In [3]:
def separator(text):
    return re.sub(pattern="<b.*/>|\.|\?|\!", string=text, repl=" [SEP] ")

In [5]:
df = pd.read_parquet('files/book_user_14_20.parquet', engine='pyarrow')

In [6]:
def overall_count(a):
    return a.overall.nunique()
user_overall_nunique = df.groupby('reviewerID').apply(overall_count)
uon_vc = pd.DataFrame(user_overall_nunique)[0].value_counts()
overall_5_users = list(user_overall_nunique[user_overall_nunique==5].index)
df = df[df.reviewerID.isin(overall_5_users)]

In [7]:
df = df[['reviewText','overall']]

In [8]:
df = df.rename(columns={'reviewText':'review','overall':'sentiment'})

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 693237 entries, 4 to 3647853
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   review     693153 non-null  object
 1   sentiment  693237 non-null  int16 
dtypes: int16(1), object(1)
memory usage: 11.9+ MB


In [10]:
df = df[(df.sentiment==5)|(df.sentiment<3)]

In [12]:
df = df[~pd.isnull(df.review)]

In [13]:
df.sentiment.value_counts()

5    306697
2     51240
1     41123
Name: sentiment, dtype: int64

In [14]:
pos_random_ind = random.sample(list(df[df.sentiment==5].index), len(df[df.sentiment==5])//3)

In [15]:
pos_df = df.loc[pos_random_ind]

In [16]:
neg_ind = list(df[df.sentiment!=5].index)
neg_df = df.loc[neg_ind]

In [17]:
df = pd.concat([pos_df,neg_df])

In [59]:
df.sentiment.value_counts()

5    102232
2     51240
1     41123
Name: sentiment, dtype: int64

In [18]:
texts = df.review
classes = df.sentiment
classes_oh = df.sentiment.apply(lambda x: 1 if x ==5 else 0)
classes_oh = classes_oh.values

In [20]:
def bert_tokenize(texts):
    return [tokenizer.tokenize("[CLS] " + text + " [SEP]") for text in texts]

In [21]:
def bert_tokenize(t):
    return tokenizer.tokenize("[CLS] " + t + " [SEP]")

In [22]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [24]:
tokenized = []
for i in tqdm(texts):
    tokenized.append(bert_tokenize(i))

100%|██████████| 194595/194595 [06:42<00:00, 483.21it/s]


In [25]:
bert_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized]

In [26]:
number_of_tokens = np.array([len(bert_id) for bert_id in bert_ids])
stats.describe(number_of_tokens)

DescribeResult(nobs=194595, minmax=(3, 6735), mean=115.34015776356021, variance=34293.66305711972, skewness=5.284996801296655, kurtosis=63.82442108297337)

In [27]:
MAX_LEN = 216
padded_bert_ids = pad_sequences(bert_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')
padded_bert_ids[0]

array([  101,  7167,  1997, 14841, 18939, 12762,  2006,  6048,  1012,
        2428,  5632,  2009,  1012,   102,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [28]:
attention_masks = []
for seq in tqdm(padded_bert_ids):
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

100%|██████████| 194595/194595 [00:28<00:00, 6878.29it/s]


In [29]:
X_train, X_test, y_train, y_test = train_test_split(padded_bert_ids, classes_oh, random_state=42, test_size=0.3)
masks_train, masks_test, _, _ = train_test_split(attention_masks, padded_bert_ids, random_state=42, test_size=0.3)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, test_size=0.1)
masks_train, masks_val, _, _ = train_test_split(masks_train, masks_train, random_state=42, test_size=0.1)

display(
    f"X_train: {X_train.shape}",
    f"X_val: {X_val.shape}",
    f"X_test: {X_test.shape}",
    f"y_train: {y_train.shape}",
    f"y_val: {y_val.shape}",
    f"y_test: {y_test.shape}",
    f"masks_train: {len(masks_train)}",
    f"masks_val: {len(masks_val)}",
    f"masks_test: {len(masks_test)}",
)

'X_train: (122594, 216)'

'X_val: (13622, 216)'

'X_test: (58379, 216)'

'y_train: (122594,)'

'y_val: (13622,)'

'y_test: (58379,)'

'masks_train: 122594'

'masks_val: 13622'

'masks_test: 58379'

In [35]:
train_inputs = torch.tensor(X_train)
train_labels = torch.tensor(y_train)
train_masks = torch.tensor(masks_train)
validation_inputs = torch.tensor(X_val)
validation_labels = torch.tensor(y_val)
validation_masks = torch.tensor(masks_val)

test_inputs = torch.tensor(X_test)
test_labels = torch.tensor(y_test)
test_masks = torch.tensor(masks_test)

print(train_inputs.shape)
print(train_labels.shape)
print(train_masks.shape)
print(validation_inputs.shape)
print(validation_labels.shape)
print(validation_masks.shape)
print(test_inputs.shape)
print(test_labels.shape)
print(test_masks.shape)

torch.Size([122594, 216])
torch.Size([122594])
torch.Size([122594, 216])
torch.Size([13622, 216])
torch.Size([13622])
torch.Size([13622, 216])
torch.Size([58379, 216])
torch.Size([58379])
torch.Size([58379, 216])


In [36]:
BATCH_SIZE = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [37]:
device = torch.device("cuda")

In [38]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

optimizer = AdamW(model.parameters(),
                  lr = 3e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [39]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [40]:
# 시간 표시 함수
def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [41]:
# 재현을 위해 랜덤시드 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [42]:
# 그래디언트 초기화
model.zero_grad()

In [43]:
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        # b_input_ids = b_input_ids.to(device)
        # b_input_mask = b_input_mask.to(device)
        # b_labels = b_labels.to(device)
        # b_input_ids.to(device)
        # b_input_mask.to(device)
        # b_labels.to(device)
        model.to(device)

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   100  of  3,832.    Elapsed: 0:00:45.
  Batch   200  of  3,832.    Elapsed: 0:01:25.
  Batch   300  of  3,832.    Elapsed: 0:02:05.
  Batch   400  of  3,832.    Elapsed: 0:02:45.
  Batch   500  of  3,832.    Elapsed: 0:03:25.
  Batch   600  of  3,832.    Elapsed: 0:04:04.
  Batch   700  of  3,832.    Elapsed: 0:04:45.
  Batch   800  of  3,832.    Elapsed: 0:05:25.
  Batch   900  of  3,832.    Elapsed: 0:06:05.
  Batch 1,000  of  3,832.    Elapsed: 0:06:45.
  Batch 1,100  of  3,832.    Elapsed: 0:07:25.
  Batch 1,200  of  3,832.    Elapsed: 0:08:05.
  Batch 1,300  of  3,832.    Elapsed: 0:08:45.
  Batch 1,400  of  3,832.    Elapsed: 0:09:25.
  Batch 1,500  of  3,832.    Elapsed: 0:10:05.
  Batch 1,600  of  3,832.    Elapsed: 0:10:45.
  Batch 1,700  of  3,832.    Elapsed: 0:11:25.
  Batch 1,800  of  3,832.    Elapsed: 0:12:05.
  Batch 1,900  of  3,832.    Elapsed: 0:12:45.
  Batch 2,000  of  3,832.    Elapsed: 0:13:24.
  Batch 2,100  of  3,832.    Elapsed: 0:14:04.


In [44]:
#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("Test took: {:}".format(format_time(time.time() - t0)))

  Batch   100  of  1,825.    Elapsed: 0:00:12.
  Batch   200  of  1,825.    Elapsed: 0:00:25.
  Batch   300  of  1,825.    Elapsed: 0:00:37.
  Batch   400  of  1,825.    Elapsed: 0:00:50.
  Batch   500  of  1,825.    Elapsed: 0:01:02.
  Batch   600  of  1,825.    Elapsed: 0:01:14.
  Batch   700  of  1,825.    Elapsed: 0:01:27.
  Batch   800  of  1,825.    Elapsed: 0:01:39.
  Batch   900  of  1,825.    Elapsed: 0:01:52.
  Batch 1,000  of  1,825.    Elapsed: 0:02:04.
  Batch 1,100  of  1,825.    Elapsed: 0:02:16.
  Batch 1,200  of  1,825.    Elapsed: 0:02:29.
  Batch 1,300  of  1,825.    Elapsed: 0:02:41.
  Batch 1,400  of  1,825.    Elapsed: 0:02:54.
  Batch 1,500  of  1,825.    Elapsed: 0:03:06.
  Batch 1,600  of  1,825.    Elapsed: 0:03:18.
  Batch 1,700  of  1,825.    Elapsed: 0:03:31.
  Batch 1,800  of  1,825.    Elapsed: 0:03:43.

Accuracy: 0.97
Test took: 0:03:46


In [50]:
# 입력 데이터 변환
def convert_input_data(sentences):

    # BERT의 토크나이저로 문장을 토큰으로 분리
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    # 입력 토큰의 최대 시퀀스 길이
    MAX_LEN = 216

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    # 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # 데이터를 파이토치의 텐서로 변환
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [104]:
# 문장 테스트
def test_sentences(sentences):

    # 평가모드로 변경
    model.eval()

    # 문장을 입력 데이터로 변환
    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    logits = outputs[0]
    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

# 리뷰 긍부정 점수 컬럼 생성

In [72]:
df2 = pd.read_parquet('merge_book_2015_30_5.parquet', engine='pyarrow')

In [73]:
# reloaded_model = tf.saved_model.load('book_bert')

In [74]:
df_reviews = []
for i in df2.reviewText:
    if i != None:
        df_reviews.append(i)

In [75]:
len(df_reviews)

823236

In [133]:
# 한번에 하면 메모리 에러 나서 100개씩 진행
review_score_list = []
for i in tqdm(range(len(df_reviews)//100+1)):
    for j in test_sentences(df_reviews[i*100:i*100+100]):
        review_score_list.append(np.argmax(j))

100%|██████████| 8233/8233 [1:27:22<00:00,  1.57it/s]


In [134]:
total_review_score_list = review_score_list.copy()

In [135]:
# review가 none 값 들어있는 index에 감성점수도 none값 넣어줌
none_review_index = list(df2[pd.isnull(df2.reviewText)].index)

In [137]:
df2['review_sentiment_score'] = total_review_score_list

In [140]:
df2.groupby('overall')['review_sentiment_score'].mean()

overall
1    0.074405
2    0.087934
3    0.394422
4    0.890668
5    0.971910
Name: review_sentiment_score, dtype: float64

In [141]:
df2.to_parquet('torch_bert_merge_book_sentiment_2015_30_5.parquet', engine='pyarrow', index=False)

### model 저장하기

In [173]:
os.makedirs("book_pytorch", exist_ok=True)
PATH = './book_pytorch/'
torch.save(model, PATH + 'model.pt')  

In [174]:
model2 = torch.load(PATH + 'model.pt')