# 리뷰 정리
- 공백 리뷰 버리기
- 리뷰 버리기(Jimin's Rule)
- NA ratio 계산하기
- 리뷰 평균 길이
- 리뷰 딕셔너리 및 최빈값
- good/soso review 병합
- 처리 완료된 리뷰 저장



### TODO
1. ~~bad review -> reviews_all에 통합하기!~~
2. ~~중복 리뷰 있으면 버리기(drop_duplicates)~~
3. ~~product id개수 확인하기(11686)~~

4. Char lv로 정리하기
5. word lv & char lv 단어 최상위 _몇 개_로 자르기!

In [114]:
import pandas as pd
from konlpy.tag import Komoran, Twitter
from tqdm import tqdm_notebook

pd.set_option('display.max_rows', 1000)

### Review csv 파일 불러오기

In [115]:
def read_from_csv(file):
    """
    Load a review CSV and discard rows that have no review text.

    input : path to a CSV file; its first column is used as the index and it
            must contain a 'Review' column
    return: pd.DataFrame containing only the rows whose 'Review' is not null
    """
    frame = pd.read_csv(file, index_col=0)
    has_review = frame['Review'].notna()
    return frame.loc[has_review]

In [128]:
# Choose the crawled review file to clean. Swap in the commented-out path to
# process the soso & bad reviews instead of the good ones.
# soso & bad reviews:
# file = "/Users/hwii/Documents/BOAZ/project/code/크롤링/reviews_soso.csv"
# good reviews:
file = "/Users/hwii/Documents/BOAZ/project/code/크롤링/Reviews_csv/Reviews_good.csv"
reviews_all_df = read_from_csv(file)
reviews_all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158897 entries, 0 to 163573
Data columns (total 3 columns):
Id        158897 non-null int64
Rating    158897 non-null object
Review    158897 non-null object
dtypes: int64(1), object(2)
memory usage: 4.8+ MB


### 공백 리뷰 버리기

In [129]:
def drop_empty_reviews(df):
    """
    Strip surrounding whitespace from every review and drop the blank ones.

    input : pd.DataFrame with a 'Review' column of strings
    return: new pd.DataFrame whose 'Review' values are stripped and non-empty;
            the frame passed in is left untouched

    Prints how many rows were removed.
    """
    before = len(df)
    # Build a new frame instead of assigning into `df`: the caller's frame is
    # often a filtered slice, and writing into it mutates shared state and
    # triggers pandas' SettingWithCopyWarning.
    df = df.assign(Review=df['Review'].str.strip())
    df = df[df['Review'] != '']
    after = len(df)
    print("dropped: %d\n" % (before - after))
    return df

In [130]:
# Remove whitespace-only reviews, then print the frame summary to check the
# remaining row count.
reviews_all_df = drop_empty_reviews(reviews_all_df)
reviews_all_df.info()

dropped: 2885

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156012 entries, 0 to 163573
Data columns (total 3 columns):
Id        156012 non-null int64
Rating    156012 non-null object
Review    156012 non-null object
dtypes: int64(1), object(2)
memory usage: 4.8+ MB


### 중복 리뷰 버리기

In [131]:
def drop_duplicate_reviews(df):
    """
    Remove rows that are exact duplicates across all columns.

    input : pd.DataFrame
    return: pd.DataFrame keeping the first occurrence of each distinct row

    Prints how many rows were removed.
    """
    deduplicated = df.drop_duplicates()
    removed_count = len(df) - len(deduplicated)
    print("dropped: %d\n" % removed_count)
    return deduplicated

In [132]:
# Remove fully duplicated rows, then print the frame summary to check the
# remaining row count.
reviews_all_df = drop_duplicate_reviews(reviews_all_df)
reviews_all_df.info()

dropped: 1472

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154540 entries, 0 to 163573
Data columns (total 3 columns):
Id        154540 non-null int64
Rating    154540 non-null object
Review    154540 non-null object
dtypes: int64(1), object(2)
memory usage: 4.7+ MB


### 리뷰 형태소 분석 및 버리기(기준: Jimin's rule)

In [135]:
def get_alphabet_ratio(sentence):
    """
    Return the fraction of tokens in a tagged sentence whose tag is 'Alpha'.

    input : pos tagged list (twitter.pos), i.e. a sequence of (token, tag) pairs
    return: float in [0, 1]; 0.0 for an empty sentence
    """
    # An empty sentence has no 'Alpha' tokens; return 0.0 rather than dividing
    # by zero.
    if not sentence:
        return 0.0
    num_alphabet = sum(item[1] == 'Alpha' for item in sentence)
    return num_alphabet / len(sentence)

def get_kparticle_num(sentence):
    """
    Count the tokens in a tagged sentence whose tag is 'KoreanParticle'.

    input : pos tagged list (Twitter.pos), i.e. a sequence of (token, tag) pairs
    return: int, 0 when there are none
    """
    tags = [token[1] for token in sentence]
    return tags.count('KoreanParticle')

def pos_tag_reviews(df):
    """
    Replace each review string with its Twitter.pos(norm=True) tagging result.

    input : pd.DataFrame with a 'Review' column of strings
    return: the same DataFrame object; its 'Review' column is overwritten in
            place with the tagger output for each review

    Shows a notebook progress bar while tagging.
    """
    raw_reviews = df['Review'].tolist()
    tagger = Twitter()

    print("pos_tagging_reviews...")
    df['Review'] = [
        tagger.pos(text, norm=True) for text in tqdm_notebook(raw_reviews)
    ]
    return df

def drop_useless_reviews(df):
    """
    Drop useless reviews based on the Twitter.pos result.

    A review is dropped when any of the following holds:
      - it has no tokens at all,
      - it contains at least one 'KoreanParticle' token,
      - every one of its tokens is tagged 'Alpha'.

    input : pd.DataFrame whose 'Review' column holds pos tagged lists
    return: pd.DataFrame with only the rows that are kept

    Prints the row count before and after, and the share that was kept.
    """
    review_list = df['Review'].tolist()

    print("dropping useless_reviews...")
    to_keep = [
        # bool(review) comes first so an empty token list is dropped without
        # ever reaching the ratio helper, which divides by the token count.
        bool(review)
        and get_kparticle_num(review) == 0
        and get_alphabet_ratio(review) != 1
        for review in review_list
    ]

    total = len(df)
    kept = sum(to_keep)
    # Guard the percentage so an empty frame does not raise ZeroDivisionError.
    kept_pct = (kept / total) * 100 if total else 0.0
    print('Reduced from %d => %d (%.2f%%)' % (total, kept, kept_pct))

    # Use an explicit boolean Series: a plain empty list passed to df[...]
    # would be read as a column selection, not a row mask.
    mask = pd.Series(to_keep, index=df.index, dtype=bool)
    return df[mask]

In [134]:
# Tag every review, filter out the ones judged useless, and preview the result.
reviews_all_df = pos_tag_reviews(reviews_all_df)
reviews_all_df = drop_useless_reviews(reviews_all_df)
reviews_all_df.head()

pos_tagging_reviews...


HBox(children=(IntProgress(value=0, max=154540), HTML(value='')))


Reduced from 154540 => 110322 (71.39%)


Unnamed: 0,Id,Rating,Review
0,2301428333,추천,"[(귀여운, Adjective), (동전지갑, Noun), (♡, Foreign),..."
1,2301428333,추천,"[(기염, Noun), (깜, Verb), (찍, Noun), (^^, Punctu..."
5,2301428333,적극추천,"[(너무, Noun), (이뻐, Adjective), (요, Eomi), (^^, ..."
6,2301428333,적극추천,"[(강추템, Noun), (털, Noun), (빠짐, Verb), (이, Eomi)..."
7,2301428333,적극추천,"[(너무, Noun), (귀엽, Adjective), (습니다, Eomi), (!,..."


### 리뷰 평균 길이


In [138]:
def get_review_len(df):
    """
    Add a 'Review_len_morph' column holding the length of each review.

    input : pd.DataFrame whose 'Review' column holds sized values (pos tagged
            lists in this notebook, so the length is the token count)
    return: the same DataFrame object, with the new column added in place

    Prints the floor of the average length; prints 0 when the frame has no rows.
    """
    review_len_list = [len(review) for review in df['Review']]
    df['Review_len_morph'] = review_len_list

    # Guard the average so an empty frame does not raise ZeroDivisionError.
    if review_len_list:
        avg_review_len = sum(review_len_list) // len(review_len_list)
    else:
        avg_review_len = 0
    print("리뷰 평균 길이(형태소 개수): ", avg_review_len)
    return df

In [137]:
# Add the per-review length column and preview the result.
reviews_all_df = get_review_len(reviews_all_df)
reviews_all_df.head()

리뷰 평균 길이(형태소 개수):  19


Unnamed: 0,Id,Rating,Review,Review_len_morph
0,2301428333,추천,"[(귀여운, Adjective), (동전지갑, Noun), (♡, Foreign),...",32
1,2301428333,추천,"[(기염, Noun), (깜, Verb), (찍, Noun), (^^, Punctu...",33
5,2301428333,적극추천,"[(너무, Noun), (이뻐, Adjective), (요, Eomi), (^^, ...",30
6,2301428333,적극추천,"[(강추템, Noun), (털, Noun), (빠짐, Verb), (이, Eomi)...",30
7,2301428333,적극추천,"[(너무, Noun), (귀엽, Adjective), (습니다, Eomi), (!,...",63


### 파일 저장

In [51]:
# Save the processed reviews. The commented-out call targets the "soso" output
# file; the active call targets the "good" output file.
# reviews_all_df.to_csv("/Users/hwii/Documents/BOAZ/project/code/크롤링/Reviews_csv/Reviews_soso_processed.csv",
#                      encoding='utf-8')
reviews_all_df.to_csv("/Users/hwii/Documents/BOAZ/project/code/크롤링/Reviews_csv/Reviews_good_processed.csv",
                     encoding='utf-8')

### 파일 병합

In [53]:
# Load the two processed files and stack them row-wise into one frame with a
# fresh 0..n-1 index.
# NOTE(review): 'Review' held Python lists when it was written, so it is
# presumably read back here as plain strings — confirm downstream code parses it.
reviews_good = pd.read_csv("/Users/hwii/Documents/BOAZ/project/code/크롤링/Reviews_csv/Reviews_good_processed.csv",
                           index_col=0)
reviews_soso = pd.read_csv("/Users/hwii/Documents/BOAZ/project/code/크롤링/Reviews_csv/Reviews_soso_processed.csv",
                           index_col=0)
reviews_merged = pd.concat([reviews_good, reviews_soso], axis=0, ignore_index=True)
reviews_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184963 entries, 0 to 184962
Data columns (total 4 columns):
Id                  184963 non-null int64
Rating              184963 non-null object
Review              184963 non-null object
Review_len_morph    184963 non-null int64
dtypes: int64(2), object(2)
memory usage: 5.6+ MB


### 파일 저장

In [54]:
# Save the merged good + soso reviews as a single UTF-8 CSV.
reviews_merged.to_csv("/Users/hwii/Documents/BOAZ/project/code/크롤링/Reviews_csv/Reviews_merged_processed.csv",
                     encoding='utf-8')