# 리뷰 정리
- 공백 리뷰 버리기
- 중복 리뷰 버리기
- 무의미 리뷰 버리기(Jimin's Rule)
- (선택) 리뷰 길이 계산
- 처리 완료된 리뷰 저장 (**String 그대로**)



### TODO
1. 리뷰 버리는 기준 보완하기: 진짜 필요 없는 애만 버리도록!


In [1]:
import pandas as pd
from konlpy.tag import Komoran, Twitter
from tqdm import tqdm_notebook

# Widen pandas' display limits so long review frames are shown in full:
# up to 1000 rows, and 2000 characters of display width.
pd.options.display.max_rows = 1000
pd.options.display.width = 2000

### Review csv 파일 불러오기

In [2]:
def read_from_csv(file):
    """Load a review CSV and discard the rows that have no review text.

    Args:
        file: Path to the CSV file; its first column is used as the index.

    Returns:
        pd.DataFrame containing only the rows whose 'Review' is not null.
    """
    loaded = pd.read_csv(file, index_col=0)
    has_review = loaded['Review'].notna()
    return loaded[has_review]

In [9]:
# Load the merged review CSV (read_from_csv drops rows without a review)
# and preview the first rows.
file = '../crawling/Reviews_csv/reviews_merged.csv'
reviews_all_df = read_from_csv(file)
reviews_all_df.head()

Unnamed: 0,Id,Rating,Review
0,2301428333,추천,기여운 동전지갑♡ 복실복실 기여운 털동전지갑입니다. 가방 키링으로도 괜찮구요~ 열쇠...
1,2301428333,추천,기염 깜찍^^ 털이 보슬보슬.하구요~~^^ 귀여워요. 동전이랑 립스틱정도 넣고 다니...
2,2301428333,적극추천,귀여우용 제가 쓰려고 샀는데 카드가 안들어가길래 당황햇어요ㅠㅠ 그래서 어머니 키링으...
3,2301428333,적극추천,ㅇ선물 칭구 선물이라 일칙 주려구했는데 배송이 좀 걸렸지만 판매자분이 친절해서 기다...
4,2301428333,적극추천,배송 ㅠ 배송 일칙 받을수 잇다구해서 시켯는데 늦어졌음 ㅜㅜ 그래도 판매자분이 친절...


### 공백 리뷰 버리기

In [11]:
def drop_empty_reviews(df):
    """Strip surrounding whitespace from reviews and drop the blank ones.

    The input frame is not modified; a new frame is returned. The number of
    removed rows is printed.

    Args:
        df: pd.DataFrame with a 'Review' column.

    Returns:
        New pd.DataFrame whose 'Review' values are stripped strings, without
        the rows whose review is missing or empty after stripping.
    """
    before = len(df)
    # Work on a copy: the frame typically arrives as a filtered slice, and
    # assigning into it would overwrite the caller's data and trigger
    # pandas' SettingWithCopyWarning.
    cleaned = df.dropna(subset=['Review']).copy()
    # astype(str) keeps non-string cells (e.g. a purely numeric review) from
    # breaking the strip step.
    cleaned['Review'] = cleaned['Review'].astype(str).str.strip()
    cleaned = cleaned[cleaned['Review'] != '']
    after = len(cleaned)
    print("dropped: %d\n" % (before - after))
    return cleaned

In [12]:
# Remove blank reviews (drop_empty_reviews prints how many rows were
# dropped), then print the frame summary.
reviews_all_df = drop_empty_reviews(reviews_all_df)
reviews_all_df.info()

dropped: 2921

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264005 entries, 0 to 108934
Data columns (total 3 columns):
Id        264005 non-null int64
Rating    264005 non-null object
Review    264005 non-null object
dtypes: int64(1), object(2)
memory usage: 8.1+ MB


### 중복 리뷰 버리기

In [13]:
def drop_duplicate_reviews(df):
    """Remove fully duplicated rows and print how many were removed.

    Args:
        df: pd.DataFrame of reviews.

    Returns:
        pd.DataFrame without duplicated rows (DataFrame.drop_duplicates with
        its default settings).
    """
    deduplicated = df.drop_duplicates()
    removed = len(df) - len(deduplicated)
    print("dropped: %d\n" % removed)
    return deduplicated

In [14]:
# Remove duplicated rows (drop_duplicate_reviews prints how many rows were
# dropped), then print the frame summary.
reviews_all_df = drop_duplicate_reviews(reviews_all_df)
reviews_all_df.info()

dropped: 29034

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234971 entries, 0 to 108934
Data columns (total 3 columns):
Id        234971 non-null int64
Rating    234971 non-null object
Review    234971 non-null object
dtypes: int64(1), object(2)
memory usage: 7.2+ MB


### 리뷰 형태소 분석 및 버리기(기준: Jimin's rule)

In [15]:
def get_alphabet_ratio(sentence):
    """Return the fraction of tokens tagged 'Alpha' in a POS-tagged sentence.

    Args:
        sentence: List of (token, tag) pairs, as produced by Twitter.pos.

    Returns:
        Share of 'Alpha' tokens, between 0 and 1; 0.0 for an empty sentence.
    """
    # An empty token list has no alphabet tokens; returning early avoids a
    # ZeroDivisionError.
    if not sentence:
        return 0.0
    num_alphabet = sum(item[1] == 'Alpha' for item in sentence)
    return num_alphabet / len(sentence)

def get_kparticle_num(sentence):
    """Count the tokens tagged 'KoreanParticle' in a POS-tagged sentence.

    Args:
        sentence: List of (token, tag) pairs (Twitter.pos output).

    Returns:
        Number of entries whose tag equals 'KoreanParticle'.
    """
    particle_count = 0
    for token in sentence:
        if token[1] == 'KoreanParticle':
            particle_count += 1
    return particle_count

def pos_tag_reviews(df):
    """Replace every review with its Twitter POS-tagging result.

    The 'Review' column of df is overwritten in place with the output of
    twitter.pos(review, norm=True), and the same frame is returned.

    NOTE(review): newer KoNLPy releases appear to rename Twitter to Okt --
    confirm against the installed version.

    Args:
        df: pd.DataFrame with a 'Review' column of review text.

    Returns:
        df, with 'Review' holding one tagged result per row.
    """
    raw_texts = df['Review'].tolist()
    tagger = Twitter()

    print("pos_tagging_reviews...")
    # tqdm_notebook only wraps the iteration to display a progress bar.
    df['Review'] = [tagger.pos(text, norm=True)
                    for text in tqdm_notebook(raw_texts)]
    return df

def drop_useless_reviews(df):
    """Drop reviews judged useless from their Twitter.pos tagging result.

    A review is dropped when it has no tokens at all, contains at least one
    'KoreanParticle' token, or consists only of 'Alpha' tokens. A summary of
    how many rows remain is printed.

    Args:
        df: pd.DataFrame whose 'Review' column holds POS-tagged token lists.

    Returns:
        pd.DataFrame containing only the kept rows.
    """
    review_list = df['Review'].tolist()
    to_keep = []

    print("dropping useless_reviews...")
    for review in review_list:
        if not review:
            # Nothing was tagged: there is no content to keep, and the
            # alphabet ratio would be undefined (division by zero).
            to_keep.append(False)
            continue
        has_particle = get_kparticle_num(review) != 0
        only_alphabet = get_alphabet_ratio(review) == 1
        to_keep.append(not (has_particle or only_alphabet))

    num_kept = sum(to_keep)
    # Guard the percentage so an empty frame does not divide by zero.
    kept_percent = (num_kept / len(df)) * 100 if len(df) else 0.0
    print('Reduced from %d => %d (%.2f%%)' % (len(df), num_kept, kept_percent))
    # Use an index-aligned boolean Series: a bare empty list would be read
    # as an empty column selection instead of a row mask.
    keep_mask = pd.Series(to_keep, index=df.index, dtype=bool)
    return df[keep_mask]

In [134]:
# POS-tag every review, filter the frame based on the tagging result, and
# preview the first remaining rows.
reviews_all_df = pos_tag_reviews(reviews_all_df)
reviews_all_df = drop_useless_reviews(reviews_all_df)
reviews_all_df.head()

pos_tagging_reviews...


HBox(children=(IntProgress(value=0, max=154540), HTML(value='')))


Reduced from 154540 => 110322 (71.39%)


Unnamed: 0,Id,Rating,Review
0,2301428333,추천,"[(귀여운, Adjective), (동전지갑, Noun), (♡, Foreign),..."
1,2301428333,추천,"[(기염, Noun), (깜, Verb), (찍, Noun), (^^, Punctu..."
5,2301428333,적극추천,"[(너무, Noun), (이뻐, Adjective), (요, Eomi), (^^, ..."
6,2301428333,적극추천,"[(강추템, Noun), (털, Noun), (빠짐, Verb), (이, Eomi)..."
7,2301428333,적극추천,"[(너무, Noun), (귀엽, Adjective), (습니다, Eomi), (!,..."


### 리뷰 평균 길이


In [138]:
def get_review_len(df):
    """Add a 'Review_len_morph' column and print the average review length.

    The column is added to df itself; each value is len() of the row's
    'Review' entry (the morpheme count when 'Review' holds tagged lists).

    Args:
        df: pd.DataFrame with a 'Review' column of sized values.

    Returns:
        df, with the extra 'Review_len_morph' column.
    """
    review_len_list = [len(review) for review in df['Review'].tolist()]
    df['Review_len_morph'] = review_len_list

    # Floor average; an empty frame reports 0 instead of raising
    # ZeroDivisionError.
    if review_len_list:
        avg_review_len = sum(review_len_list) // len(review_len_list)
    else:
        avg_review_len = 0
    print("리뷰 평균 길이(형태소 개수): ", avg_review_len)
    return df

In [144]:
# Add the per-review length column (get_review_len prints the average) and
# preview the first rows.
reviews_all_df = get_review_len(reviews_all_df)
reviews_all_df.head()

리뷰 평균 길이(형태소 개수):  19


Unnamed: 0,Id,Rating,Review,Review_len_morph
0,2301428333,추천,"[(귀여운, Adjective), (동전지갑, Noun), (♡, Foreign),...",32
1,2301428333,추천,"[(기염, Noun), (깜, Verb), (찍, Noun), (^^, Punctu...",33
5,2301428333,적극추천,"[(너무, Noun), (이뻐, Adjective), (요, Eomi), (^^, ...",30
6,2301428333,적극추천,"[(강추템, Noun), (털, Noun), (빠짐, Verb), (이, Eomi)...",30
7,2301428333,적극추천,"[(너무, Noun), (귀엽, Adjective), (습니다, Eomi), (!,...",63


### 파일 저장

In [51]:
# Write the processed frame to CSV, UTF-8 encoded.
# NOTE(review): if pos_tag_reviews has already run, 'Review' holds tagged
# token lists rather than the raw strings the notebook header asks to
# save -- confirm which form is intended here.
reviews_all_df.to_csv("../crawling/Reviews_csv/Reviews_merged_processed.csv",
                     encoding='utf-8')