### 네이버 영화 평점 리뷰 데이터 노트북

#### 읽어오기

In [1]:
import pandas as pd

train = pd.read_csv('data/ratings_train.txt', sep='\t')
test = pd.read_csv('data/ratings_test.txt', sep='\t')

#### Null 값 제거하기

In [2]:
# null 개수 확인
print(f'trainset null 개수:\n{train.isnull().sum()}\n')
print(f'testset null 개수:\n{test.isnull().sum()}')

trainset null 개수:
id          0
document    5
label       0
dtype: int64

testset null 개수:
id          0
document    3
label       0
dtype: int64


In [3]:
train.dropna(inplace=True)
test.dropna(inplace=True)

#### 중복 제거하기

In [4]:
print(f'중복 제거 전 train length: {len(train)}')
train.drop_duplicates(subset=['document'], inplace=True, ignore_index=True)
print(f'중복 제거 후 train length: {len(train)}\n')
print(f'중복 제거 전 test length: {len(test)}')
test.drop_duplicates(subset=['document'], inplace=True, ignore_index=True)
print(f'중복 제거 후 test length: {len(test)}\n')

중복 제거 전 train length: 149995
중복 제거 후 train length: 146182

중복 제거 전 test length: 49997
중복 제거 후 test length: 49157



#### 정규표현식으로 한국어만 남기기

In [5]:
import re
from tqdm import tqdm

def cleaning(df):
    for idx, row in tqdm(df.iterrows(), desc='cleaning', total=len(df)):
        new_doc = re.sub('[^가-힣]', '', row['document']).strip()
        df.loc[idx, 'document'] = new_doc
    return df

In [6]:
train = cleaning(train)
test = cleaning(test)

cleaning: 100%|██████████| 146182/146182 [00:23<00:00, 6183.17it/s]
cleaning: 100%|██████████| 49157/49157 [00:07<00:00, 6252.30it/s]


#### 조사 제거하기

In [7]:
from konlpy.tag import Mecab

m = Mecab()
mecab_tags = ['JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC', 'JK']

def remove_josa_mecab(df, tags):
    for idx, row in tqdm(df.iterrows(), desc='removing josa', total=len(df)):
        josa_removed = [x[0] for x in m.pos(row['document']) if x[1] not in tags]
        df.loc[idx, 'document'] = ' '.join(josa_removed)
    return df

train_mecab = remove_josa_mecab(train, mecab_tags)
test_mecab = remove_josa_mecab(test, mecab_tags)

removing josa: 100%|██████████| 146182/146182 [00:39<00:00, 3698.32it/s]
removing josa: 100%|██████████| 49157/49157 [00:13<00:00, 3673.89it/s]


In [8]:
from konlpy.tag import Komoran

k = Komoran()
komoran_tags = ['JK', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC']

def remove_josa_komoran(df, tags):
    for idx, row in tqdm(df.iterrows(), desc='removing josa', total=len(df)):
        josa_removed = [x[0] for x in k.pos(row['document']) if x[1] not in tags]
        df.loc[idx, 'document'] = ' '.join(josa_removed)
    return df

train_komoran = remove_josa_komoran(train, komoran_tags)
test_komoran = remove_josa_komoran(test, komoran_tags)

removing josa: 100%|██████████| 146182/146182 [02:03<00:00, 1187.99it/s]
removing josa: 100%|██████████| 49157/49157 [00:40<00:00, 1217.17it/s]


#### label 분포 확인하기