In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install sklearn
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
!pip install konlpy 
from konlpy.tag import Okt

!pip install git+https://github.com/ssut/py-hanspell.git
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git
from hanspell import spell_checker
from pykospacing import spacing

!pip install konlpy 
from konlpy.tag import Okt
okt = Okt()

import regex as re

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 49.1MB/s 
[?25hCollecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 30.0MB/s 
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 8.8MB/s 
Installing collected packages: colorama



In [3]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# NLTK Data 다운
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

stop_words = open('/content/drive/MyDrive/신입프로젝트/korean_stopwords.txt').read()


# 위의 불용어는 명사가 아닌 단어 중에서 저자가 임의로 선정한 것으로 실제 의미있는 선정 기준이 아님
stop_words=stop_words.split('\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
txt = '커피가 너무 맛있어요!! 최고의 디저트,&케이크 진짜 달고6 맛있어요. 알바생이 친절합니다!'

In [5]:
def clean_text(texts): 
  corpus = [] 
  for i in range(0, len(texts)): 
    review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]', '',str(texts[i])) #remove punctuation 
    review = re.sub(r'\d+','', str(texts[i]))# remove number 
    review = review.lower() #lower case 
    review = re.sub(r'\s+', ' ', review) #remove extra space 
    review = re.sub(r'<[^>]+>','',review) #remove Html tags 
    review = re.sub(r'\s+', ' ', review) #remove spaces 
    review = re.sub(r"^\s+", '', review) #remove space from start 
    review = re.sub(r'\s+$', '', review) #remove space from the end 
    review = re.sub(r'&', '', review)

    corpus.append(review) 
  return corpus


In [6]:
clean_text(txt)

['커',
 '피',
 '가',
 '',
 '너',
 '무',
 '',
 '맛',
 '있',
 '어',
 '요',
 '!',
 '!',
 '',
 '최',
 '고',
 '의',
 '',
 '디',
 '저',
 '트',
 ',',
 '',
 '케',
 '이',
 '크',
 '',
 '진',
 '짜',
 '',
 '달',
 '고',
 '',
 '',
 '맛',
 '있',
 '어',
 '요',
 '.',
 '',
 '알',
 '바',
 '생',
 '이',
 '',
 '친',
 '절',
 '합',
 '니',
 '다',
 '!']

In [7]:
def grammar_check(text):
  spelled_sent = spell_checker.check(text)
  hanspell_sent = spelled_sent.checked
  return hanspell_sent

In [8]:
def tokenize_tagged(text):
  temp_X = okt.pos(text, norm=True, stem=True) # 토큰화
  temp_X = [word for word in temp_X if not word in stop_words] # 불용어 제거
  return ['/'.join(t) for t in temp_X]

In [9]:
def preprocess(crude_text):
  token_list = []
  txt = ''.join(clean_text(crude_text))
  txt1 = spacing(txt)
  txt2 = grammar_check(txt1)
  txt3 = tokenize_tagged(txt2)
  regex1 = re.compile('Josa$')
  regex2 = re.compile('Punctuation$')
  regex3 = re.compile('Suffix$')
  regex4 = re.compile('KoreanParticle$')
  regex5 = re.compile('Alpha$')
  regex6 = re.compile('Foreign$')
  text_nj = []
  for item in txt3:
    mo1 = regex1.search(item)
    mo2 = regex2.search(item)
    mo3 = regex3.search(item)
    mo4 = regex4.search(item)
    mo5 = regex5.search(item)
    mo6 = regex6.search(item)
    if mo1 == None and mo2 == None and mo3 == None and mo4 == None and  mo5 == None and mo6 == None:
      text_nj.append(item)

  return text_nj

In [10]:
preprocess(txt)

['커피/Noun',
 '너무/Adverb',
 '맛있다/Adjective',
 '최고/Noun',
 '디저트/Noun',
 '케이크/Noun',
 '진짜/Noun',
 '달/Noun',
 '맛있다/Adjective',
 '아르바이트생/Noun',
 '친절하다/Adjective']