## 3.6. 예제: 레스토랑 리뷰 감성 분류하기
### 3.6.1. 옐프 리뷰 데이터셋

In [54]:
import collections 
import numpy as np
import pandas as pd
import re

from argparse import Namespace

# Settings for building the reduced Yelp review dataset used in this example.
args = Namespace(
    # Raw input CSV files
    raw_train_dataset_csv="raw_train.csv",
    raw_test_dataset_csv="raw_test.csv",
    # Fraction of each rating's training reviews kept in the subset
    proportion_subset_of_train=0.1,
    # Fractions used when labelling rows as train / val / test
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Destination CSV for the processed reviews
    output_munged_csv="reviews_with_splits_lite2.csv",
    # Seed handed to np.random.seed before shuffling
    seed=1337,
)

In [7]:
# Load the raw training reviews. header=None tells pandas the file has no
# header row, so the two column names are supplied explicitly.
train_reviews = pd.read_csv(
    args.raw_train_dataset_csv,
    names=['rating', 'review'],
    header=None,
)

In [8]:
# Preview the first rows of the raw training reviews.
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [9]:
# Subsample the training reviews so that every rating class keeps the same
# proportion of its rows.
# DataFrame.iterrows() yields (index label, row Series) pairs, and
# Series.to_dict() turns each row into a plain dict.
by_rating = collections.defaultdict(list)
for _, review_row in train_reviews.iterrows():
    record = review_row.to_dict()
    by_rating[review_row.rating].append(record)

# Resulting shape of by_rating:
#   defaultdict(list,
#     {1: [{'rating': 1, 'review': "___"}, {'rating': 1, 'review': "___"}, ...],
#      2: [{'rating': 2, 'review': "___"}, {'rating': 2, 'review': "___"}, ...]})

# Keep the leading fraction of each rating's reviews, in their original order.
# (Open question left by the author: why is no random sampling done here?)
review_subset = []
for rating in sorted(by_rating):
    rating_reviews = by_rating[rating]
    n_total = len(rating_reviews)  # 280,000 reviews for each of ratings 1 and 2
    n_subset = int(args.proportion_subset_of_train * n_total)  # 280,000 * 0.1
    review_subset += rating_reviews[:n_subset]  # 28,000 taken from each rating

review_subset = pd.DataFrame(review_subset)  # 56,000 rows in total

In [10]:
# Preview the first rows of the subsampled reviews.
review_subset.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...


In [27]:
# Count how many reviews each rating has in the full training set.
train_reviews.rating.value_counts()

1    280000
2    280000
Name: rating, dtype: int64

In [28]:
# Unique rating classes present in the subset
set(review_subset.rating)

{1, 2}

In [49]:
# Group the subsampled reviews by star rating so that each class is divided
# into train / validation / test on its own.
by_rating = collections.defaultdict(list)
for _, review_row in review_subset.iterrows():
    by_rating[review_row.rating].append(review_row.to_dict())

# Build the labelled rows.
final_list = []
np.random.seed(args.seed)  # fixed seed so the shuffle below is reproducible

for rating in sorted(by_rating):
    item_list = by_rating[rating]
    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)  # 0.7 * 28,000
    n_val = int(args.val_proportion * n_total)  # 0.15 * 28,000
    # n_test is computed but not used for slicing: the test split simply
    # receives every record left over after train and val.
    n_test = int(args.test_proportion * n_total)  # 0.15 * 28,000
    val_end = n_train + n_val

    # Attach a 'split' attribute to every record: the first n_train shuffled
    # records are 'train', the next n_val are 'val', the remainder are 'test'.
    for position, item in enumerate(item_list):
        if position < n_train:
            item['split'] = 'train'
        elif position < val_end:
            item['split'] = 'val'
        else:
            item['split'] = 'test'

    # Append this rating's labelled records to the final list.
    final_list += item_list

final_reviews = pd.DataFrame(final_list)

In [50]:
# Count how many rows were assigned to each split label.
final_reviews.split.value_counts()

train    39200
val       8400
test      8400
Name: split, dtype: int64

In [52]:
# Review text preprocessing
def preprocess_text(text):
    """Normalise a review string for tokenisation.

    The text is lowercased, each of the punctuation marks . , ! ? is padded
    with a space on both sides, and every run of characters other than
    letters and those four marks is replaced by a single space.

    Args:
        text: the raw review string.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # Surround sentence punctuation with spaces so it splits into its own token.
    padded = re.sub(r"([.,!?])", r" \1 ", lowered)
    # Collapse digits, other symbols and whitespace runs into one space.
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)
    
# Clean every review string in place with preprocess_text.
final_reviews.review = final_reviews.review.apply(preprocess_text)

# Replace the numeric rating with a text label: 1 -> 'negative', 2 -> 'positive'.
final_reviews['rating'] = final_reviews.rating.apply({1:'negative', 2:'positive'}.get)
# Author's note: leaving out .get (passing the dict itself) raised an error
# about the object not being a 'Series'.
# type(final_reviews.rating): pandas.core.series.Series

# The string below is the author's note on apply, kept as the cell's final
# expression so the notebook echoes it. In English: "The DataFrame apply
# method - applies the given function to every row - as if a for loop were
# running."
""" 데이터프레임의 apply 메소드
- 각 행마다 해당 함수를 적용해줌
- 마치 for문 돌아가는 것처럼 """

' 데이터프레임의 apply 메소드\n- 각 행마다 해당 함수를 적용해줌\n- 마치 for문 돌아가는 것처럼 '

In [53]:
# Preview the processed reviews: text labels in rating, cleaned review text, and the split column.
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train


In [55]:
# Write the processed reviews to the output CSV; index=False leaves the row index out of the file.
final_reviews.to_csv(args.output_munged_csv, index=False)