# Amazon BERT - Sentiment analysis

In [1]:
import torch
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup

import pandas as pd
import numpy as np
import random
import time
import datetime
import os

from utils import (
    tokenizer_setting,
    preprocessing,
    GPU_setting,
    hyperparmeter_setting,
    flat_accuracy,
    format_time,
    initial_setting,
    run_train,
    run_test,
    convert_input_data,
    test_sentence_unit,
    test_sentence_many
)

### 1. Initial Setting
- Load the data
- Split train, test data
- Load tokenizer

In [12]:
# Data directory for this notebook: the train/test TSVs are read from it, and later
# cells pass it to run_train (checkpoint location) and derive the model_save folder from it.
# NOTE(review): the recorded training output shows a checkpoint written under
# data/rotten_tomato/, so `path` presumably held a different value when that cell ran — confirm.
path = './../../data/amazon/sentiment_analysis_data/'
os.listdir(path)

['predictions.csv', 'test.tsv', 'test.tsv.zip', 'train.tsv', 'train.tsv.zip']

In [13]:
# Both files are tab-separated. Per the head() outputs below, the test file has the
# same columns as the training file except that it lacks `Sentiment`.
train_df = pd.read_csv(path + 'train.tsv', sep="\t")
test_df = pd.read_csv(path + 'test.tsv', sep="\t")

In [14]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [15]:
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [16]:
# Model input is the raw phrase text; the training target is the `Sentiment` column.
X_train = train_df['Phrase']
y_train = train_df['Sentiment']

X_test  = test_df['Phrase']
# NOTE(review): this binds the entire predictions.csv DataFrame (not a single label
# column) as y_test, and the file name suggests model predictions rather than ground
# truth. The run_test cell further down reports "Accuracy: 0.00" — confirm which column,
# if any, holds true labels before trusting any test metric.
y_test = pd.read_csv(path + 'predictions.csv')

In [17]:
# X_train, X_test, y_train, y_test = train_test_split(feature_df, 
#                                                     target_df, 
#                                                     test_size=0.3, 
#                                                     random_state=42)

In [19]:
tokenizer = tokenizer_setting()

---
### 2. Pre-processing

In [20]:
# Turn the training phrases and labels into the train/validation loaders consumed by run_train.
# (How the validation split is made is inside utils.preprocessing — not visible here.)
# train_dataloader, validation_dataloader = preprocessing(X_train.review_content, y_train, tokenizer, process_type='train')
train_dataloader, validation_dataloader = preprocessing(X_train, y_train, tokenizer, process_type='train')

In [21]:
# NOTE(review): y_test is the full predictions.csv DataFrame loaded earlier, not a label
# column — verify that preprocessing() handles that as intended.
# test_dataloader = preprocessing(X_test.review_content, y_test, tokenizer, process_type='test')
test_dataloader = preprocessing(X_test, y_test, tokenizer, process_type='test')

---
### 3. Load the pre-trained model

In [22]:
# Load the pre-trained multilingual BERT checkpoint with a 5-class classification head
# and report how long loading took.
start_time = time.time()
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels = 5) # change to match the number of labels
# NOTE(review): .cuda() needs a CUDA device; `device` is only defined in the next cell (GPU_setting()).
model.cuda()
print("  Loading took: {:}".format(format_time(time.time() - start_time)))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

  Loading took: 0:00:06


In [23]:
device = GPU_setting()

There are 1 GPU(s) available.
Device 0 : NVIDIA GeForce RTX 3080


In [24]:
optimizer, epochs, total_steps, scheduler = hyperparmeter_setting(model, train_dataloader, lr=2e-5, eps=1e-8, epochs=3)

In [25]:
model = initial_setting(model, seed_val=42)

---
### 4. Fine-tuning

In [34]:
def save_checkpoint(state, path, file_name='checkpoint.pth.tar'):
    """Serialize ``state`` with ``torch.save`` to ``file_name`` inside directory ``path``.

    Args:
        state: Object to serialize (run_train passes a dict holding the epoch index
            and the model/optimizer state dicts).
        path: Target directory. A trailing separator is optional and the directory
            is created when it does not exist yet.
        file_name: Name of the checkpoint file inside ``path``.
    """
    # Plain string concatenation only worked when ``path`` ended with a separator;
    # os.path.join gives the identical result in that case and also handles the other.
    if path:
        os.makedirs(path, exist_ok=True)
    file_path = os.path.join(path, file_name)
    print(f"file_path: {file_path}")
    torch.save(state, file_path)

In [None]:
def run_train(model, epochs, train_dataloader, validation_dataloader, optimizer, scheduler, device, path,
              max_grad_norm=2.0):
    """Fine-tune ``model`` for ``epochs`` epochs, validating after each epoch.

    Every epoch performs one pass over ``train_dataloader`` (forward, backward,
    gradient clipping, optimizer step, scheduler step), writes a checkpoint through
    ``save_checkpoint`` and then prints the mean per-batch accuracy on
    ``validation_dataloader``.

    Note: this definition shadows the ``run_train`` imported from ``utils``.

    Args:
        model: Called as ``model(input_ids, attention_mask=..., token_type_ids=None,
            labels=...)``; its output must expose ``.loss`` (when labels are passed)
            and ``.logits``.
        epochs: Number of passes over the training data.
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        validation_dataloader: Iterable with the same batch layout.
        optimizer: Optimizer, stepped once per training batch.
        scheduler: Learning-rate scheduler, stepped once per training batch.
        device: Device every batch tensor is moved to.
        path: Checkpoint location handed to ``save_checkpoint``.
        max_grad_norm: Gradient-norm clipping threshold (default 2.0; the comment in
            the original code says the example it was based on used 1.0).

    Returns:
        The same ``model`` object after training.
    """
    first_start_time = time.time()

    # Repeat for the requested number of epochs
    for epoch in range(epochs):

        # ========================================
        #               1. Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')

        # Per-epoch timer and loss accumulator
        start_time = time.time()
        total_loss = 0

        # Switch to training mode
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Progress report every 500 steps
            if step % 500 == 0 and not step == 0:
                elapsed = format_time(time.time() - start_time)
                print('Batch {:>5,}  of  {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # Move the batch to the target device
            batch = tuple(b.to(device) for b in batch)

            # Batches are stored in (input ids, attention mask, labels) order
            b_input_ids, b_input_mask, b_labels = batch

            # Clear gradients BEFORE the forward/backward pass so gradients left over
            # from an earlier, interrupted run cannot leak into the first update.
            model.zero_grad()

            # Forward pass; passing labels makes the model return the loss
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask,
                            token_type_ids=None,
                            labels=b_labels)

            loss = outputs.loss

            # Accumulate the epoch loss as a Python float
            total_loss += loss.item()

            # Back-propagation
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # Update the weights, then advance the learning-rate schedule
            optimizer.step()
            scheduler.step()

        # Mean training loss of the epoch (total loss / number of batches)
        avg_train_loss = total_loss / len(train_dataloader)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

        # Save a checkpoint; the scheduler state is included so a resumed run can
        # continue the learning-rate schedule where it stopped.
        print("  Model Checkpoint Save")
        save_checkpoint({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer' : optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
            }, path)

        # ========================================
        #               2. Validation
        # ========================================

        # Validate once the training pass of the epoch is finished
        print("")
        print("Running Validation...")

        start_time = time.time()

        # Switch to evaluation mode
        model.eval()

        total_valid_accuracy = 0

        for batch in validation_dataloader:

            # Move the batch to the target device
            batch = tuple(b.to(device) for b in batch)

            b_input_ids, b_input_mask, b_labels = batch

            # No gradients are needed for evaluation
            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

            # Without labels the model returns logits: the classification scores
            # before softmax, shape (batch_size, config.num_labels)
            logits = outputs.logits

            # Move data to the CPU as numpy arrays
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Compare logits with the labels to get the batch accuracy
            valid_accuracy = flat_accuracy(logits, label_ids)
            total_valid_accuracy += valid_accuracy

        print("  Accuracy: {0:.2f}".format(total_valid_accuracy/len(validation_dataloader)))
        print("  Validation took: {:}".format(format_time(time.time() - start_time)))

    print("")
    print("Total took: {:}".format(format_time(time.time() - first_start_time)))
    print("Training complete!")

    return model

In [28]:
# Train for 2 epochs in this run.
# NOTE(review): the scheduler was created above by hyperparmeter_setting(..., epochs=3); if its
# total_steps is derived from that value (presumably), the learning-rate schedule will not reach
# its end after 2 epochs — confirm, or rebuild the scheduler with the same epoch count.
epochs = 2

In [37]:
model = run_train(model, epochs, train_dataloader, validation_dataloader, optimizer, scheduler, device, path)


Training...
Batch   500  of  4,390. Elapsed: 0:07:58.
Batch 1,000  of  4,390. Elapsed: 0:15:56.
Batch 1,500  of  4,390. Elapsed: 0:23:55.
Batch 2,000  of  4,390. Elapsed: 0:31:53.
Batch 2,500  of  4,390. Elapsed: 0:39:52.
Batch 3,000  of  4,390. Elapsed: 0:47:51.
Batch 3,500  of  4,390. Elapsed: 0:55:51.
Batch 4,000  of  4,390. Elapsed: 1:03:50.

  Average training loss: 0.62
  Training epoch took: 1:10:04
  Model Checkpoint Save
file_path: ./../../data/rotten_tomato/sentiment_analysis_data/checkpoint.pth.tar

Running Validation...
  Accuracy: 0.70
  Validation took: 0:02:31

Training...
Batch   500  of  4,390. Elapsed: 0:07:59.
Batch 1,000  of  4,390. Elapsed: 0:15:57.
Batch 1,500  of  4,390. Elapsed: 0:23:59.
Batch 2,000  of  4,390. Elapsed: 0:31:59.
Batch 2,500  of  4,390. Elapsed: 0:39:57.
Batch 3,000  of  4,390. Elapsed: 0:47:55.
Batch 3,500  of  4,390. Elapsed: 0:55:54.
Batch 4,000  of  4,390. Elapsed: 1:03:52.

  Average training loss: 0.58
  Training epoch took: 1:10:05
  Mode

### 5. Save the model

In [38]:
# torch.save(model.state_dict(), "./../../data/rotten_tomato/sentiment_analysis_data/model_new.pth")

In [29]:
# Restore weights from a previously saved state dict and switch the model to evaluation mode.
# NOTE(review): this reads from data/rotten_tomato/ while `path` points at data/amazon/, and the
# matching torch.save call in the previous cell is commented out — the file must already exist.
model.load_state_dict(torch.load("./../../data/rotten_tomato/sentiment_analysis_data/model_new.pth"))
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [30]:
# Saving best practice: when the default file names are used, the model can be
# reloaded with from_pretrained(). (`os` is already imported in the first cell.)

# os.path.join avoids the doubled slash that `path + '/model_save/'` produced,
# because `path` already ends with a separator.
output_dir = os.path.join(path, 'model_save')

# Create the output directory if needed (no-op when it already exists)
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save the trained model, its configuration and the tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./../../data/amazon/sentiment_analysis_data//model_save/


('./../../data/amazon/sentiment_analysis_data//model_save/tokenizer_config.json',
 './../../data/amazon/sentiment_analysis_data//model_save/special_tokens_map.json',
 './../../data/amazon/sentiment_analysis_data//model_save/vocab.txt',
 './../../data/amazon/sentiment_analysis_data//model_save/added_tokens.json')

- Load the model

In [31]:
# Load a trained model and vocabulary that you have fine-tuned
# (this rebinds `tokenizer` to the copy stored in output_dir).
model_new = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
# NOTE(review): the later cells (run_test, test_sentence_unit, test_sentence_many) are all
# called with `model`, not `model_new`; this reloaded copy is not used in the visible notebook.
model_new.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

---
### 6. Testing

In [42]:
run_test(model, test_dataloader, device)

  Batch   100  of  2,072.    Elapsed: 0:00:31.
  Batch   200  of  2,072.    Elapsed: 0:01:02.
  Batch   300  of  2,072.    Elapsed: 0:01:33.
  Batch   400  of  2,072.    Elapsed: 0:02:04.
  Batch   500  of  2,072.    Elapsed: 0:02:35.
  Batch   600  of  2,072.    Elapsed: 0:03:07.
  Batch   700  of  2,072.    Elapsed: 0:03:38.
  Batch   800  of  2,072.    Elapsed: 0:04:09.
  Batch   900  of  2,072.    Elapsed: 0:04:40.
  Batch 1,000  of  2,072.    Elapsed: 0:05:11.
  Batch 1,100  of  2,072.    Elapsed: 0:05:42.
  Batch 1,200  of  2,072.    Elapsed: 0:06:13.
  Batch 1,300  of  2,072.    Elapsed: 0:06:44.
  Batch 1,400  of  2,072.    Elapsed: 0:07:15.
  Batch 1,500  of  2,072.    Elapsed: 0:07:47.
  Batch 1,600  of  2,072.    Elapsed: 0:08:18.
  Batch 1,700  of  2,072.    Elapsed: 0:08:49.
  Batch 1,800  of  2,072.    Elapsed: 0:09:20.
  Batch 1,900  of  2,072.    Elapsed: 0:09:51.
  Batch 2,000  of  2,072.    Elapsed: 0:10:22.

Accuracy: 0.00
Test took: 0:10:44


- Single sentence

In [38]:
sentence = "It's a  great"
logits = test_sentence_unit(model, device, tokenizer, [sentence])

print(logits)
print(np.argmax(logits))

[[-3.6424768  -3.59159    -0.14507934  3.7208323   3.9107153 ]]
4


In [39]:
sentence = "Whether audiences will get behind The Lightning Thief is hard to predict. Overall, it's an entertaining introduction to a promising new world -- but will the consuming shadow of Potter be too big to break free of?"
logits = test_sentence_unit(model, device, tokenizer, [sentence])

print(logits)
print(np.argmax(logits))

[[-2.2826602   0.88587826  1.9615097   0.792078   -2.2370389 ]]
2


In [40]:
sentence = "First section is good, but last is bad."
logits = test_sentence_unit(model, device, tokenizer, [sentence])

print(logits)
print(np.argmax(logits))

[[-1.6444795   1.9478985   2.4090805  -0.02791836 -3.761274  ]]
2


In [55]:
sentence = "It's so so."
logits = test_sentence_unit(model, device, tokenizer, [sentence])

print(logits)
print(np.argmax(logits))

[[-2.9226365  0.8520015  3.7444754  0.5416726 -3.7775207]]
2


In [56]:
sentence = "This is terrible."
logits = test_sentence_unit(model, device, tokenizer, [sentence])

print(logits)
print(np.argmax(logits))

[[ 4.080648   1.6646496 -0.7384983 -2.6886609 -1.8533113]]
0


In [43]:
def test_sentence_unit(model, device, tokenizer, sentence):
    """Run the classifier on ``sentence`` and return the raw logits as a numpy array.

    Note: this definition shadows the ``test_sentence_unit`` imported from ``utils``.

    Args:
        model: Classifier whose output holds the logits at index 0 when called
            without labels.
        device: Device the input tensors are moved to.
        tokenizer: Tokenizer forwarded to ``convert_input_data``.
        sentence: Text input forwarded to ``convert_input_data``; the cells in this
            notebook pass a one-element list of strings.

    Returns:
        The logits, detached and copied to the CPU as a numpy array.
    """
    # Inference only: put the model in evaluation mode
    model.eval()

    # Encode the text into input ids and attention masks, then move both to the device
    encoded_ids, encoded_masks = convert_input_data(tokenizer, sentence)
    ids_on_device = encoded_ids.to(device)
    masks_on_device = encoded_masks.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        model_outputs = model(ids_on_device,
                              token_type_ids=None,
                              attention_mask=masks_on_device)

    # Index 0 of the output holds the logits (not a loss, since no labels were passed)
    return model_outputs[0].detach().cpu().numpy()

- Multiple sentences

In [44]:
path

'./../../data/amazon/sentiment_analysis_data/'

In [45]:
path2 = './../../data/amazon/'

In [86]:
rating_df = pd.read_csv(path2+'amazon(review_50_user).csv')

In [87]:
rating_df.head()

Unnamed: 0,user_id,movie_id,review,rating,review_date,review_all,helpful
0,45585,987,Well worth a watch.,3.0,"01 1, 2000",I've seen October Sky three times and must say...,"[9, 12]"
1,95128,1235,A gem for the ages.,5.0,"01 1, 2000",This has always been one of my favorite movies...,"[1, 1]"
2,27036,5635,At last a Bond Girl with teeth,4.0,"01 1, 2001",This entry has something new - besides the meg...,"[1, 1]"
3,120361,11376,Hackman And Coppola At Peak Performance,5.0,"01 1, 2001",Hubby and I have seen this movie many times si...,"[2, 5]"
4,78909,9976,Where Is The Story?,2.0,"01 1, 2001",Perhaps this is the best way to begin. Movies ...,"[1, 2]"


In [76]:
len(rating_df)

525712

In [90]:
# Drop every row that has a missing value in any column (per the surrounding outputs
# the row count goes from 525712 to 525698).
# NOTE(review): only `review` is passed to the model below, so dropna(subset=['review']) may be
# what is intended; rebinding the same name also discards the raw frame for later cells.
rating_df = rating_df.dropna()

In [92]:
len(rating_df)

525698

In [93]:
start_time = time.time()
labels = test_sentence_many(model, device, tokenizer, rating_df.review)
print("  Labeling took: {:}".format(format_time(time.time() - start_time)))

  Batch   100  of  16,429.    Elapsed: 0:01:35.
  Batch   200  of  16,429.    Elapsed: 0:01:39.
  Batch   300  of  16,429.    Elapsed: 0:01:43.
  Batch   400  of  16,429.    Elapsed: 0:01:48.
  Batch   500  of  16,429.    Elapsed: 0:01:52.
  Batch   600  of  16,429.    Elapsed: 0:01:56.
  Batch   700  of  16,429.    Elapsed: 0:02:00.
  Batch   800  of  16,429.    Elapsed: 0:02:04.
  Batch   900  of  16,429.    Elapsed: 0:02:09.
  Batch 1,000  of  16,429.    Elapsed: 0:02:13.
  Batch 1,100  of  16,429.    Elapsed: 0:02:17.
  Batch 1,200  of  16,429.    Elapsed: 0:02:21.
  Batch 1,300  of  16,429.    Elapsed: 0:02:25.
  Batch 1,400  of  16,429.    Elapsed: 0:02:30.
  Batch 1,500  of  16,429.    Elapsed: 0:02:34.
  Batch 1,600  of  16,429.    Elapsed: 0:02:38.
  Batch 1,700  of  16,429.    Elapsed: 0:02:42.
  Batch 1,800  of  16,429.    Elapsed: 0:02:46.
  Batch 1,900  of  16,429.    Elapsed: 0:02:51.
  Batch 2,000  of  16,429.    Elapsed: 0:02:55.
  Batch 2,100  of  16,429.    Elapsed: 0

In [94]:
len(labels)

525698

In [95]:
# Attach one label per review as a new column (presumably the predicted class index;
# both lengths are 525698 per the outputs above).
# NOTE(review): rating_df keeps its original, gapped index after dropna(); this assignment is
# positionally safe only if `labels` is a list/array rather than an index-aligned Series —
# confirm test_sentence_many's return type.
rating_df['sentiment'] = labels

In [96]:
rating_df.head()

Unnamed: 0,user_id,movie_id,review,rating,review_date,review_all,helpful,sentiment
0,45585,987,Well worth a watch.,3.0,"01 1, 2000",I've seen October Sky three times and must say...,"[9, 12]",4
1,95128,1235,A gem for the ages.,5.0,"01 1, 2000",This has always been one of my favorite movies...,"[1, 1]",4
2,27036,5635,At last a Bond Girl with teeth,4.0,"01 1, 2001",This entry has something new - besides the meg...,"[1, 1]",2
3,120361,11376,Hackman And Coppola At Peak Performance,5.0,"01 1, 2001",Hubby and I have seen this movie many times si...,"[2, 5]",2
4,78909,9976,Where Is The Story?,2.0,"01 1, 2001",Perhaps this is the best way to begin. Movies ...,"[1, 2]",2


In [97]:
path

'./../../data/amazon/sentiment_analysis_data/'

In [98]:
rating_df.to_csv('./../../data/amazon/amazon_sentiment.csv', index=False)

---
### 7. Using only the pre-trained layers (no fine-tuning)

In [None]:
# model_no_finetuning = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
# model_no_finetuning.cuda()

In [None]:
# #시작 시간 설정
# start_time = time.time()

# # 평가모드로 변경
# model_rotten2.eval()

# # 변수 초기화
# eval_loss, eval_accuracy = 0, 0
# nb_eval_steps, nb_eval_examples = 0, 0

# # 데이터로더에서 배치만큼 반복하여 가져옴
# for step, batch in enumerate(test_dataloader):
#     # 경과 정보 표시
#     if step % 100 == 0 and not step == 0:
#         elapsed = format_time(time.time() - start_time)
#         print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

#     # 배치를 GPU에 넣음
#     batch = tuple(b.to(device) for b in batch)
    
#     # 배치에서 데이터 추출
#     b_input_ids, b_input_mask, b_labels = batch
    
#     # 그래디언트 계산 안함
#     with torch.no_grad():     
#         # Forward 수행
#         outputs = model_rotten2(b_input_ids, 
#                         token_type_ids=None, 
#                         attention_mask=b_input_mask)
    
#     # 로스 구함
#     logits = outputs[0]

#     # CPU로 데이터 이동
#     logits = logits.detach().cpu().numpy()
#     label_ids = b_labels.to('cpu').numpy()
    
#     # 출력 로짓과 라벨을 비교하여 정확도 계산
#     tmp_eval_accuracy = flat_accuracy(logits, label_ids)
#     eval_accuracy += tmp_eval_accuracy

# print("")
# print("Accuracy: {0:.2f}".format(eval_accuracy/len(test_dataloader)))
# print("Test took: {:}".format(format_time(time.time() - start_time)))

---
### 8. Collect the positive/negative reviews of a specific movie and save them to files

In [33]:
# NOTE(review): `path` is set to the data/amazon/ directory earlier in this notebook and the
# directory listing shown there does not contain this file; this cell presumably ran while
# `path` pointed at the Rotten Tomatoes data — confirm the location.
df = pd.read_csv(path+'rotten_review_scaled_label.csv')

In [34]:
df.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,label
0,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,1.0,2010-02-09,Whether audiences will get behind The Lightnin...,1
1,m/0814255,Nick Schager,False,Slant Magazine,Rotten,0.25,2010-02-10,Harry Potter knockoffs don't come more transpa...,0
2,m/0814255,Bill Goodykoontz,True,Arizona Republic,Fresh,1.0,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...",1
3,m/0814255,Jordan Hoffman,False,UGO,Fresh,0.7,2010-02-10,"Fun, brisk and imaginative",1
4,m/0814255,Mark Adams,False,Daily Mirror (UK),Fresh,0.8,2010-02-10,"This action-packed fantasy adventure, based on...",1


In [38]:
df_review = df[['rotten_tomatoes_link','review_content', 'label']]

In [42]:
df_review.head()

Unnamed: 0,rotten_tomatoes_link,review_content,label
0,m/0814255,Whether audiences will get behind The Lightnin...,1
1,m/0814255,Harry Potter knockoffs don't come more transpa...,0
2,m/0814255,"Percy Jackson isn't a great movie, but it's a ...",1
3,m/0814255,"Fun, brisk and imaginative",1
4,m/0814255,"This action-packed fantasy adventure, based on...",1


In [126]:
pivot_review = df_review.pivot_table(index='rotten_tomatoes_link', columns='label', aggfunc=len, fill_value=0)

In [137]:
pivot_review

Unnamed: 0_level_0,review_content,review_content
label,0,1
rotten_tomatoes_link,Unnamed: 1_level_2,Unnamed: 2_level_2
m/+_one_2019,0,33
m/+h,2,2
m/-_man,1,3
m/-cule_valley_of_the_lost_ants,0,5
m/0814255,12,34
...,...,...
m/zoom_2006,22,3
m/zootopia,2,166
m/zorba_the_greek,0,3
m/zulu,0,4


In [145]:
pivot_review.loc['m/0814255']

                label
review_content  0        12
                1        34
Name: m/0814255, dtype: int64

In [144]:
pivot_review.loc['m/10000_bc']

                label
review_content  0        48
                1        12
Name: m/10000_bc, dtype: int64

- Movie1: 'm/0814255'

In [102]:
# 긍정, 부정 리뷰들을 취합 후 파일저장
def movie_sentiment_filter(df, movie_link, file_name, output_dir=None):
    """Write one movie's positive and negative reviews to two text files.

    Rows of ``df`` whose ``rotten_tomatoes_link`` equals ``movie_link`` are split by
    ``label`` (1 goes to the "pos" file, 0 to the "neg" file). The ``review_content``
    values of each group are joined with single spaces and written to
    ``<file_name>_pos.txt`` and ``<file_name>_neg.txt`` inside ``output_dir``.

    Args:
        df: DataFrame with the columns ``rotten_tomatoes_link``, ``review_content``
            and ``label``.
        movie_link: Value of ``rotten_tomatoes_link`` selecting the movie.
        file_name: Prefix of the two output file names.
        output_dir: Directory to write into. Defaults to the notebook-level ``path``,
            which is what the original implementation always used.
    """
    if output_dir is None:
        output_dir = path  # notebook-level data directory

    movie_reviews = df[df['rotten_tomatoes_link'] == movie_link]

    # Positive file first, then negative, matching the original write order
    for label_value, suffix in ((1, 'pos'), (0, 'neg')):
        reviews = movie_reviews.loc[movie_reviews['label'] == label_value, 'review_content']

        # A missing review (NaN) would make str.join raise TypeError, so skip those
        joined_reviews = " ".join(reviews.dropna().astype(str))

        # Explicit UTF-8 keeps non-ASCII review text writable regardless of the platform's
        # default encoding; `with` closes the file even if the write fails.
        target_file = os.path.join(output_dir, f"{file_name}_{suffix}.txt")
        with open(target_file, 'w', encoding='utf-8') as f:
            f.write(joined_reviews)

    print(f"{file_name}_pos/neg save finish!!")

In [96]:
movie_sentiment_filter(df=df_review, movie_link='m/0814255', file_name='movie1')

movie1_pos/neg save finish!!


In [104]:
movie_sentiment_filter(df=df_review, movie_link='m/0878835', file_name='movie2')

movie2_pos/neg save finish!!


In [105]:
movie_sentiment_filter(df=df_review, movie_link='m/10000_bc', file_name='movie3')

movie3_pos/neg save finish!!
