# Colab 설정

In [0]:
# Colab-only helpers, imported together before they are used.
from google.colab import auth, drive

# Run the Colab user authentication flow.
auth.authenticate_user()

# Mount Google Drive under /content/gdrive; an existing mount is left as-is.
drive.mount('/content/gdrive', force_remount=False)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import os
from pathlib import Path

folder = "colab/movie_pred_git" ## set this to the project folder inside your own Drive

base_path = Path("/content/gdrive/My Drive/")
project_path = base_path / folder
# All later relative paths (e.g. 'data/...') are resolved against the project folder.
os.chdir(project_path)

# Shorten every sub-directory name to the part before its first space.
# The glob result is materialised with list() because the loop renames entries
# of the very directory that is being listed.
for x in list(project_path.glob("*")):
    if not x.is_dir():
        continue
    dir_name = str(x.relative_to(project_path))
    new_name = dir_name.split(" ", 1)[0]
    if new_name == dir_name:
        # No space in the name: nothing to rename.
        continue
    if not new_name or os.path.exists(new_name):
        # An empty target (name starts with a space) makes os.rename fail, and an
        # already existing target would either raise or be overwritten, so the
        # directory is left untouched and reported instead.
        print(f"skip rename: {dir_name!r} -> {new_name!r}")
        continue
    os.rename(dir_name, new_name)
print(f"{os.getcwd()}")

/content/gdrive/My Drive/colab/movie_pred_git


# 패키지 로딩

In [0]:
# Install KoNLPy into the runtime; the import cell below pulls Okt from konlpy.tag.
!pip install konlpy



In [0]:
import pandas as pd
import numpy as np
import re
from konlpy.tag import Okt
import pickle
from tqdm import tqdm_notebook as tqdm

# 전처리

In [0]:

# Read the tab-separated train / test rating files (paths are relative to the
# current working directory).
train_data = pd.read_csv('data/ratings_train.txt', sep='\t')
test_data = pd.read_csv('data/ratings_test.txt', sep='\t')

In [0]:
# Replace every missing value with an empty string, in place, in both frames.
for frame in (train_data, test_data):
    frame.fillna('', inplace=True)

In [0]:
## Load the stopword collection
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open('data/stopwords.txt', 'rb') as stopword_file:
    stopwords = pickle.load(stopword_file)

In [0]:
## Remove special characters: everything except Hangul jamo, Hangul syllables
## and the space character is deleted from the documents.
## regex=True is passed explicitly because the pattern is a regular expression;
## pandas versions whose str.replace defaults to literal matching would
## otherwise search for the pattern text itself and remove nothing.
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

In [0]:
# Okt tagger instance from konlpy; its morphs() method is used by the tokenizing cells.
okt = Okt()

In [0]:
## Tokenize the training documents with Okt
train = []

for sentence in tqdm(train_data['document']):
  morphemes = okt.morphs(sentence, stem=True)
  # keep only the morphemes that are not listed as stopwords
  train.append([morpheme for morpheme in morphemes if morpheme not in stopwords])

HBox(children=(IntProgress(value=0, max=150000), HTML(value='')))

In [0]:
# Tokenize the test documents with Okt, dropping stopwords.
test = []

for sentence in tqdm(test_data['document']):
  morphemes = okt.morphs(sentence, stem=True)
  # keep only the morphemes that are not listed as stopwords
  test.append([morpheme for morpheme in morphemes if morpheme not in stopwords])

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

In [0]:
## Turn each token list back into a sentence string.
## Every token is followed by one space, so a non-empty sentence ends with a space
## and an empty token list becomes the empty string.
tmp = ["".join(token + " " for token in tokens) for tokens in train]

In [0]:
# Overwrite the 'document' column with the strings collected in tmp.
train_data["document"] = tmp

In [0]:
# Turn each test token list back into a sentence string; every token is followed
# by one space, so a non-empty sentence ends with a space.
tmp = ["".join(token + " " for token in tokens) for tokens in test]

In [0]:
# Overwrite the 'document' column with the strings collected in tmp.
test_data["document"] = tmp

In [0]:
## Remove the entries whose sentence length is 0 (no whitespace-separated tokens)
lst = [
    row
    for row, sentence in enumerate(train_data["document"])
    if not sentence.split()
]

train_data.drop(index=lst, inplace=True)

In [0]:
# Collect the rows whose document has no whitespace-separated tokens, then drop them.
lst = [
    row
    for row, sentence in enumerate(test_data["document"])
    if not sentence.split()
]

test_data.drop(index=lst, inplace=True)

In [0]:
## Reset the index of both frames in place, discarding the old index values
for frame in (train_data, test_data):
    frame.reset_index(drop=True, inplace=True)

In [0]:
## Save both frames; the files are pickles even though the names end in .txt
for out_path, frame in (('data/train.txt', train_data), ('data/test.txt', test_data)):
    with open(out_path, 'wb') as out_file:
        pickle.dump(frame, out_file)