In [34]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [35]:
args = Namespace(
    raw_train_dataset_csv='data/raw_train.csv',
    raw_test_dataset_csv='data/raw_test.csv',
    train_proportion=0.7,
    val_proportion=0.3,
    output_munged_csv='data/reviews_with_splits_full.csv',
    seed=1337
)

In [36]:
# 원본 데이터를 읽어옴
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'], encoding='utf-8')
train_reviews = train_reviews[~pd.isnull(train_reviews.review)] # 리뷰가 없는 행 제거
test_reviews = pd.read_csv(args.raw_test_dataset_csv, header=None, names=['rating', 'review'], encoding='utf-8')
test_reviews = test_reviews[~pd.isnull(test_reviews.review)]

In [37]:
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [38]:
test_reviews.head()

Unnamed: 0,rating,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [39]:
# 고유 클래스
set(train_reviews.rating) # 긍정, 부정 클래스만 존재

{1, 2}

In [40]:
# 훈련, 검증, 테스트를 만들기 위해 별점을 기준으로 나눔
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows(): # iterrows 사용시 행 번호도 같이 나옴
    by_rating[row.rating].append(row.to_dict())
'''
별점마다 그룹화할 경우
데이터 분할, 데이터 불균형, 감정 분류 문제, 다양한 모델 적용, 시각화 및 분석에 유리
'''

'\n별점마다 그룹화할 경우\n데이터 분할, 데이터 불균형, 감정 분류 문제, 다양한 모델 적용, 시각화 및 분석에 유리\n'

In [41]:
# 분할 데이터를 만듦
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):
    '''
    by_rating = {
        1:[
            {'rating': 1, 'review': ...}
            ]
        0: [...}
    형태로 이루어짐
    
    item_list = [
        {'rating': 1, review: ...}
        ]
    '''
    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    
    # 데이터 포인터에 분할 속성 추가
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:]:
        item['split'] = 'val'
    
    # 최종 리스트에 추가
    final_list.extend(item_list)

In [42]:
for _, row in test_reviews.iterrows():
    row_dict = row.to_dict()
    row_dict['split'] = 'test'
    final_list.append(row_dict)

In [43]:
# 분할 데이터를 데이터 프레임으로 만듭니다
final_reviews = pd.DataFrame(final_list)

In [44]:
final_reviews.split.value_counts()

train    392000
val      168000
test      38000
Name: split, dtype: int64

In [45]:
final_reviews.review.head()

0    The entrance was the #1 impressive thing about...
1    I'm a Mclover, and I had no problem\nwith the ...
2    Less than good here, not terrible, but I see n...
3    I don't know if I can ever bring myself to go ...
4    Food was OK/Good but the service was terrible....
Name: review, dtype: object

In [46]:
final_reviews[pd.isnull(final_reviews.review)] # 리뷰가 비어있는 항목 검색

Unnamed: 0,rating,review,split


In [47]:
# 리뷰 전처리
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text) # 해당 문자 앞뒤 공백 추가
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text) # 알파벳, 해당 문자외 공백으로 변경
    return text
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [48]:
final_reviews.rating = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get) # get은 키값을 가져오는거

In [51]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,the entrance was the impressive thing about th...,train
1,negative,"i m a mclover , and i had no problem nwith the...",train
2,negative,"less than good here , not terrible , but i see...",train
3,negative,i don t know if i can ever bring myself to go ...,train
4,negative,food was ok good but the service was terrible ...,train


In [50]:
final_reviews.to_csv(args.output_munged_csv, index=False)