# Feature Extraction - Korean

#### 추준호(20224224)

In [1]:
# Toy corpus: three short Korean sentences (they read like news headlines).
# The raw strings still contain punctuation such as "," and "·"; a later cell
# passes them through clean_doc() and rebinds `corpus` to the cleaned text.
corpus = [
    "반얀트리 서울, 스포츠 시설 리뉴얼 기념 농구 대회 개최",
    "금융위에 P2P업체 투게더펀딩·펀다 온투업 등록 신청",
    "문 닫힌 롯데백화점 본점",
]

In [2]:
import re

In [3]:
def clean_doc(doc):
    """Return ``doc`` with every non-word character replaced by one space.

    A character is kept when it matches ``\\w`` (letters, digits, underscore;
    Unicode-aware for ``str`` patterns, so Hangul is already included) or one
    of the explicitly listed Hangul ranges. Everything else, including
    punctuation and whitespace itself, becomes a single ``' '``.

    Replacement is one-for-one: the string length is preserved and adjacent
    separators are not collapsed, so ``", "`` turns into two spaces.
    """
    return re.sub(r'[^\wㄱ-ㅎ가-힣]', ' ', doc)

In [4]:
# Clean every sentence and rebind `corpus` to the result. The raw strings are
# discarded here; re-running the cell is harmless because clean_doc() leaves
# already-cleaned text unchanged.
corpus = list(map(clean_doc, corpus))
corpus

['반얀트리 서울  스포츠 시설 리뉴얼 기념 농구 대회 개최',
 '금융위에 P2P업체 투게더펀딩 펀다 온투업 등록 신청',
 '문 닫힌 롯데백화점 본점']

## Compare Tokenizers

### (1) Okt - Open Korean Text

In [5]:
from konlpy.tag import Okt

In [7]:
# Create one Okt tagger instance; the next cell reuses it for every sentence.
okt = Okt()

In [8]:
# Print Okt's morpheme segmentation of each cleaned sentence, one list per line.
for sentence in corpus:
    print(okt.morphs(sentence))

['반', '얀', '트리', '서울', '스포츠', '시설', '리뉴얼', '기념', '농구', '대회', '개최']
['금융위', '에', 'P', '2', 'P', '업체', '투게더', '펀딩', '펀다', '온', '투업', '등록', '신청']
['문', '닫힌', '롯데', '백화점', '본점']


### (2) Hannanum - KAIST

In [9]:
from konlpy.tag import Hannanum

In [10]:
# Create one Hannanum tagger instance; it is reused for the comparison below
# and again later to build `tokenized_corpus`.
han = Hannanum()

In [11]:
# Print Hannanum's morpheme segmentation of each cleaned sentence.
for sentence in corpus:
    print(han.morphs(sentence))

['반얀트리', '서울', '스포츠', '시설', '리뉴얼', '기념', '농구', '대회', '개최']
['금융위', '에', 'P2P업체', '투게더펀딩', '펀다', '온투업', '등록', '신청']
['문', '닫히', 'ㄴ', '롯데백화점', '본점']


### (3) Kkma - SNU

In [12]:
from konlpy.tag import Kkma

In [13]:
# Create one Kkma tagger instance; the next cell reuses it for every sentence.
kkma = Kkma()

In [14]:
# Print Kkma's morpheme segmentation of each cleaned sentence.
for sentence in corpus:
    print(kkma.morphs(sentence))

['반', '얀', '트리', '서울', '스포츠', '시설', '리', '뉴', '얼', '기념', '농구', '대회', '개최']
['금융', '위', '에', 'P', '2', 'P', '업체', '투', '것', '이', '더', '펀', '딩', '펀', '다', '오', 'ㄴ', '투', '업', '등록', '신청']
['문', '닫히', 'ㄴ', '롯데', '백화점', '본점']


### (4) Mecab - originally built for Japanese, written in C++

## Feature Extraction

In [15]:
import gensim

In [19]:
# Tokenize the whole corpus with Hannanum: in the comparison above it was the
# only tagger that kept words such as '반얀트리' and '투게더펀딩' in one piece.
tokenized_corpus = list(map(han.morphs, corpus))

In [20]:
# Bare expression so the notebook displays the nested token lists.
tokenized_corpus

[['반얀트리', '서울', '스포츠', '시설', '리뉴얼', '기념', '농구', '대회', '개최'],
 ['금융위', '에', 'P2P업체', '투게더펀딩', '펀다', '온투업', '등록', '신청'],
 ['문', '닫히', 'ㄴ', '롯데백화점', '본점']]

In [21]:
# Build the vocabulary: gensim's Dictionary assigns an integer id to every
# distinct token found in `tokenized_corpus`.
lexicon = gensim.corpora.Dictionary(tokenized_corpus)

In [22]:
# List every (id, token) pair held by the dictionary, one pair per line.
for token_id, token in lexicon.items():
    print((token_id, token))

(0, '개최')
(1, '기념')
(2, '농구')
(3, '대회')
(4, '리뉴얼')
(5, '반얀트리')
(6, '서울')
(7, '스포츠')
(8, '시설')
(9, 'P2P업체')
(10, '금융위')
(11, '등록')
(12, '신청')
(13, '에')
(14, '온투업')
(15, '투게더펀딩')
(16, '펀다')
(17, 'ㄴ')
(18, '닫히')
(19, '롯데백화점')
(20, '문')
(21, '본점')


### (1) Bag of Words

In [23]:
# Bag of words: doc2bow() turns a token list into (token_id, count) pairs.
for tokens in tokenized_corpus:
    print(lexicon.doc2bow(tokens))

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
[(9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)]
[(17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]


### (2) One-Hot Encoding

In [24]:
# Binary variant of the bag of words: keep each token id that occurs in the
# document but force its value to 1, whatever the actual count was. No token
# repeats within a sentence in this corpus, so the output equals the plain BoW.
for tokens in tokenized_corpus:
    print([(token_id, 1) for token_id, _count in lexicon.doc2bow(tokens)])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
[(9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)]
[(17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]


### (3) Tf-idf

In [26]:
# Build the TF-IDF model from the document statistics held by `lexicon`
# (the `dictionary=` argument) instead of from a bag-of-words corpus.
# normalize=True rescales each transformed document vector to unit L2 length.
tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)

In [27]:
# TF-IDF: convert each document to bag of words, then index the model with it
# to obtain (token_id, weight) pairs.
for tokens in tokenized_corpus:
    print(tfidf[lexicon.doc2bow(tokens)])

[(0, 0.3333333333333333), (1, 0.3333333333333333), (2, 0.3333333333333333), (3, 0.3333333333333333), (4, 0.3333333333333333), (5, 0.3333333333333333), (6, 0.3333333333333333), (7, 0.3333333333333333), (8, 0.3333333333333333)]
[(9, 0.35355339059327373), (10, 0.35355339059327373), (11, 0.35355339059327373), (12, 0.35355339059327373), (13, 0.35355339059327373), (14, 0.35355339059327373), (15, 0.35355339059327373), (16, 0.35355339059327373)]
[(17, 0.4472135954999579), (18, 0.4472135954999579), (19, 0.4472135954999579), (20, 0.4472135954999579), (21, 0.4472135954999579)]
