In [67]:
import collections
import re
import numpy as np
import pandas as pd

from argparse import Namespace

In [68]:
args = Namespace(
    raw_train_csv='data/raw_train.csv',
    raw_test_csv='data/raw_test.csv',
    train_proportion=0.7,
    val_proportion=0.3,
    output_munged_csv='data/review_with_splits_full_2.csv',
    seed=1337
)

In [69]:
# read raw data
train_reviews = pd.read_csv(args.raw_train_csv, header=None, names=['rating', 'review'])
train_reviews = train_reviews[~pd.isnull(train_reviews)]
test_reviews = pd.read_csv(args.raw_test_csv, header=None, names=['rating', 'review'])
test_reviews = test_reviews[~pd.isnull(test_reviews)]

In [70]:
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [71]:
test_reviews.head()

Unnamed: 0,rating,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [72]:
train_reviews.isnull().sum()

rating    0
review    0
dtype: int64

In [73]:
# 고유 클래스
set(train_reviews.rating)

{1, 2}

In [74]:
# train, valid, test 데이터를 위해 별점 기준으로 나눔
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())

In [75]:
by_rating[row.rating][:5]

[{'rating': 2,
  'review': "Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doesn't judge and asks all the right questions. Very thorough and wants to be kept in the loop on every aspect of your medical health and your life."},
 {'rating': 2,
  'review': 'All the food is great here. But the best thing they have is their wings. Their wings are simply fantastic!!  The \\"Wet Cajun\\" are by the best & most popular.  I also like the seasoned salt wings.  Wing Night is Monday & Wednesday night, $0.75 whole wings!\\n\\nThe dining area is nice. Very family friendly! The bar is very nice is well.  This place is truly a Yinzer\'s dream!!  \\"Pittsburgh Dad\\" would love this place n\'at!!'},
 {'ratin

In [76]:
# 분할 데이터 만듦
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)

    # add 'split property'
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train : n_train + n_val]:
        item['split'] = 'val'

    # 최종 리스트에 추가
    final_list.extend(item_list)

In [77]:
for _, row in test_reviews.iterrows():
    row_dict = row.to_dict()
    row_dict['split'] = 'test'
    final_list.append(row_dict)

In [78]:
# 분할 데이터를 데이터 프레임으로 만듦
final_reviews = pd.DataFrame(final_list)

In [79]:
final_reviews.split.value_counts()

train    392000
val      168000
test      38000
Name: split, dtype: int64

In [80]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,1,The entrance was the #1 impressive thing about...,train
1,1,"I'm a Mclover, and I had no problem\nwith the ...",train
2,1,"Less than good here, not terrible, but I see n...",train
3,1,I don't know if I can ever bring myself to go ...,train
4,1,Food was OK/Good but the service was terrible....,train


In [81]:
final_reviews[pd.isnull(final_reviews)].sum()

rating    0.0
review      0
split       0
dtype: object

In [82]:
# 리뷰 전처리
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

final_reviews.review = final_reviews.review.apply(preprocess_text)

In [83]:
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)

In [84]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,the entrance was the impressive thing about th...,train
1,negative,"i m a mclover , and i had no problem nwith the...",train
2,negative,"less than good here , not terrible , but i see...",train
3,negative,i don t know if i can ever bring myself to go ...,train
4,negative,food was ok good but the service was terrible ...,train


In [85]:
final_reviews.to_csv(args.output_munged_csv, index=False)