In [29]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [30]:
args = Namespace(
    raw_train_dataset_csv='data/raw_train.csv',
    raw_test_dataset_csv='data/raw_test.csv',
    proportion_subset_of_train=0.1,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv='data/reviews_with_splits_lite.csv',
    seed=1337
)

In [31]:
# 원본 데이터를 읽어옴
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])

In [32]:
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [33]:
# 리뷰 클래스 비율이 동일하도록 만듦
by_rating = collections.defaultdict(list)

for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())

review_subset = []

for _, item_list in sorted(by_rating.items()):
    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)
    review_subset.extend(item_list[:n_subset])

review_subset = pd.DataFrame(review_subset)

In [34]:
review_subset.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...


In [35]:
train_reviews.rating.value_counts()

1    280000
2    280000
Name: rating, dtype: int64

In [36]:
set(review_subset.rating)

{1, 2}

In [37]:
# 훈련, 검증, 테스트 분할
by_rating = collections.defaultdict(list)
for _, row in sorted(review_subset.iterrows()):
    by_rating[row.rating].append(row.to_dict())

# 분할 데이터 만듦
np.random.seed(args.seed)
final_list = []

for _, item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(n_total * args.train_proportion)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    
    # 데이터 포인터에 분할 속성 추가
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train + n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'
        
    # 최종 리스트에 추가
    final_list.extend(item_list)

In [38]:
# 분할 데이터를 데이터 프레임으로 만듦
final_list = pd.DataFrame(final_list)

In [39]:
final_list.head()

Unnamed: 0,rating,review,split
0,1,Terrible place to work for I just heard a stor...,train
1,1,"3 hours, 15 minutes-- total time for an extrem...",train
2,1,My less than stellar review is for service. ...,train
3,1,I'm granting one star because there's no way t...,train
4,1,The food here is mediocre at best. I went afte...,train


In [40]:
final_list.split.value_counts()

train    39200
val       8400
test      8400
Name: split, dtype: int64

In [41]:
# 데이터 전처리
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'([.,?!])', r'\1', text)
    text = re.sub(r'[^a-zA-Z.,!?]', r' ', text)
    return text

final_list.review = final_list.review.apply(preprocess_text)

In [42]:
final_list['rating'] = final_list.rating.apply({1: 'negative', 2: 'positive'}.get)

In [43]:
final_list.head()

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours, minutes total time for an extrem...",train
2,negative,my less than stellar review is for service. ...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best. i went afte...,train


In [44]:
final_list.to_csv(args.output_munged_csv, index=False)