In [20]:
import pandas as pd
import numpy as np
from gensim.models import FastText
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import platform
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Configure matplotlib so Korean (Hangul) text renders in figures.
# Font family to use for each value returned by platform.system().
_KOREAN_FONT_BY_OS = {
    'Darwin': 'AppleGothic',     # macOS
    'Windows': 'Malgun Gothic',  # Windows
    'Linux': 'Malgun Gothic',    # Linux (Google Colab)
}
# Commented-out setup steps kept from the original Linux (Colab) branch:
#   !wget "https://www.wfonts.com/download/data/2016/06/13/malgun-gothic/malgun.ttf"
#   !mv malgun.ttf /usr/share/fonts/truetype/
#   import matplotlib.font_manager as fm
#   fm._rebuild()
_os_name = platform.system()
if _os_name in _KOREAN_FONT_BY_OS:
    plt.rc('font', family=_KOREAN_FONT_BY_OS[_os_name])
# Keep the minus sign from rendering as a broken glyph when a Korean font is active.
plt.rcParams['axes.unicode_minus'] = False
# End of matplotlib Korean-font handling.

In [3]:
# Load the train/test token CSVs from the news data directory.
data_path = '../../../data/뉴스'

train_token_path, test_token_path = (
    os.path.join(data_path, file_name)
    for file_name in ('token.csv', 'test_token.csv')
)

train_token, test_token = map(pd.read_csv, (train_token_path, test_token_path))

In [4]:
# Convert every title into a list of tokens by splitting on single spaces.
tokenize_data = [title.split(' ') for title in train_token['title'].tolist()]

In [32]:
def preprocessing(x, reg=r'[^\d가-힣a-zA-Z ]'):
    """Delete every substring of ``x`` that matches the pattern ``reg``.

    With the default pattern, any character that is not a digit, a Hangul
    syllable (가-힣), an ASCII letter, or a space is removed. Matches are
    replaced by the empty string, so spaces on either side of a removed
    character are both kept (e.g. 'a → b' becomes 'a  b').

    Args:
        x: Input string.
        reg: Regular expression describing what to remove.

    Returns:
        The cleaned string.
    """
    pattern = re.compile(reg)
    return pattern.sub('', x)

In [33]:
# Spot-check the cleaner on the first title.
preprocessing(train_token['title'].tolist()[0])

'인천  핀란드 항공기 결항 휴가 철 여행객 분통'

In [30]:
# Show the first title unmodified, for comparison with its cleaned form.
train_token['title'].tolist()[0]

'인천 → 핀란드 항공기 결항 휴가 철 여행객 분통'

In [31]:
# Run the cleaner over every title; the result is only displayed, not stored.
train_token['title'].apply(preprocessing)

0                   인천  핀란드 항공기 결항 휴가 철 여행객 분통
1              실리콘밸리 넘어서겠다 구글 15조원 들여  전역 거점 화
2              이란 외무 긴장 완화 해결 책 미국 경제 전쟁 멈추는 것
3         NYT 클린턴 측근  기업 특수 관계 조명 공과 사 맞 물려 종합
4                    시진핑 트럼프 중미 무역 협상 조속 타결 희망
                         ...                  
45649             KB 금융 미국 IB 스티펠 제휴 선진국 시장 공략
45650        1 보 서울시 교육청 신종 코로나 확산 개학 연기 휴업 검토
45651          게시판 키움 증권 2020 키움 영웅 전 실전 투자 대회
45652                   답변 하는 배 기동 국립 중앙 박물관 장
45653    2020 한국 인터넷 기자 상 시상식 내달 1일 개최 특별상 김성후
Name: title, Length: 45654, dtype: object

In [5]:
# Fit FastText embeddings on the tokenized titles (sg=1 selects skip-gram).
model = FastText(
    sentences=tokenize_data,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [6]:
def make_token_and_tokenizer(text, max_len=100, oov=None, num_words=None, pad_option='post'):
    """Fit a Keras ``Tokenizer`` on ``text`` and return padded index sequences.

    Args:
        text: Iterable of strings to fit on and to convert.
        max_len: Length every sequence is padded/truncated to.
        oov: Passed to ``Tokenizer`` as ``oov_token``.
        num_words: Passed to ``Tokenizer`` as ``num_words``.
        pad_option: Used for BOTH ``padding`` and ``truncating`` in
            ``pad_sequences`` (so padding and truncation happen on the same side).

    Returns:
        Tuple ``(token_index, vocab_size, tokenizer)`` where ``vocab_size`` is
        ``len(tokenizer.word_index) + 1``. It is computed from the full
        ``word_index`` and does not depend on ``num_words``.

    NOTE(review): ``Tokenizer`` is created with its default text normalisation;
    presumably that differs from a plain ``split(' ')`` — confirm the resulting
    vocabulary matches the tokens used elsewhere.
    """
    fitted = Tokenizer(num_words=num_words, oov_token=oov)
    fitted.fit_on_texts(text)

    sequences = fitted.texts_to_sequences(text)
    padded = pad_sequences(
        sequences,
        maxlen=max_len,
        padding=pad_option,
        truncating=pad_option,
    )

    # One more than the number of known words.
    n_vocab = len(fitted.word_index) + 1

    return padded, n_vocab, fitted


def make_embedding_matrix(word_dict, model, vocab_size, embedding_size=100):
    """Build a ``(vocab_size, embedding_size)`` matrix of word vectors.

    Row ``i`` is filled with ``model.wv[word]`` for the word that ``word_dict``
    maps to index ``i``. A row is left all-zero when:

    * no word maps to it (e.g. index 0 when indices start at 1),
    * ``model.wv[word]`` raises ``KeyError`` (the model has no vector for it).

    Words whose index falls outside ``[0, vocab_size)`` are skipped instead of
    raising ``IndexError``, so a ``vocab_size`` smaller than the full
    vocabulary can be passed.

    Args:
        word_dict: Mapping of word -> integer row index.
        model: Object exposing ``model.wv[word]`` that returns a vector of
            length ``embedding_size``.
        vocab_size: Number of rows in the matrix.
        embedding_size: Number of columns; must equal the length of the
            vectors returned by ``model.wv``.

    Returns:
        ``numpy.ndarray`` of shape ``(vocab_size, embedding_size)``.
    """
    embedding_matrix = np.zeros((vocab_size, embedding_size))

    for word, index in word_dict.items():
        # Ignore words that do not fit into the requested matrix.
        if not 0 <= index < vocab_size:
            continue
        try:
            embedding_matrix[index] = model.wv[word]
        except KeyError:
            # No vector available for this word: keep the zero row.
            continue

    # np.zeros already returned an ndarray; no extra copy is needed.
    return embedding_matrix

In [7]:
# Tokenize the training titles and fit the tokenizer.
# NOTE(review): num_words=5 looks like a leftover debugging value — confirm it is
# intended before token_index is used for anything downstream.
token_index, vocab_size, tokenizer = make_token_and_tokenizer(
    train_token['title'].tolist(),
    num_words=5,
)

In [11]:
# Build the embedding matrix from the tokenizer's word -> index mapping and the FastText model.
embedding_matrix = make_embedding_matrix(tokenizer.word_index, model, vocab_size)
# Inspect the row stored at index 1.
embedding_matrix[1]

array([-2.34000653e-01, -1.48438901e-01, -8.46140385e-02,  4.08726722e-01,
       -2.35203832e-01,  2.06076339e-01, -1.69520497e-01,  4.17405605e-01,
       -1.22278109e-01, -1.82890862e-01, -8.99545178e-02, -3.32807004e-01,
        1.58818513e-01, -1.71533033e-01, -1.73078328e-01, -3.60920876e-02,
        1.46059722e-01, -2.64962375e-01,  2.76981056e-01, -2.12057084e-01,
        1.23996824e-01, -2.02850312e-01,  1.10532055e-02, -2.19364762e-01,
       -2.56515414e-01,  8.59668851e-01, -3.13659430e-01,  7.36804903e-02,
       -2.25314453e-01, -2.31257945e-01, -8.58191401e-02,  6.03953702e-03,
        4.70048070e-01, -1.91677377e-01, -2.42605045e-01, -6.42967671e-02,
        3.41128469e-01, -4.24498171e-01, -1.80519283e-01, -1.19330727e-01,
       -2.24605724e-01, -5.08240834e-02, -1.41700879e-01,  1.39267474e-01,
       -3.44296172e-02,  2.10596412e-01,  1.89147994e-01, -2.41610587e-01,
        4.20441806e-01, -8.15321654e-02, -1.22011870e-01, -1.79017380e-01,
       -5.33852696e-01,  

In [15]:
# Inspect the row stored at index 2.
embedding_matrix[2]

array([-0.46987399,  0.11404312, -0.12749296, -0.07105628, -0.38162771,
       -0.37322485, -0.066191  ,  0.33457598, -0.24870729, -0.03607721,
        0.0576367 , -0.17725083,  0.12323913,  0.14855817,  0.0374545 ,
       -0.25264937,  0.21855438, -0.24384964,  0.02618166,  0.0173444 ,
        0.1427622 , -0.13877352, -0.00180056, -0.13860919, -0.26955086,
        0.20427953, -0.35139984, -0.08637713,  0.05299949, -0.10124034,
        0.16792449, -0.26216951,  0.33024561, -0.05227954, -0.22807267,
        0.37810633,  0.27609929, -0.06432895,  0.29060668,  0.25452116,
       -0.00603123, -0.45129418, -0.07700988,  0.17995019, -0.1181601 ,
        0.24631879,  0.17428289,  0.29237244, -0.09337172,  0.20880966,
        0.17561908, -0.13826071, -0.03798177, -0.1293499 , -0.28957424,
        0.02757886,  0.27273509,  0.00143139, -0.27437136,  0.26441479,
        0.05669692, -0.22543986,  0.00256754,  0.02733808, -0.03141342,
        0.02034145,  0.28932109,  0.01002217,  0.11553812,  0.35

In [14]:
# Look up the FastText vector for '종합' directly; the displayed values match the
# embedding_matrix[1] output.
model.wv['종합']

array([-2.34000653e-01, -1.48438901e-01, -8.46140385e-02,  4.08726722e-01,
       -2.35203832e-01,  2.06076339e-01, -1.69520497e-01,  4.17405605e-01,
       -1.22278109e-01, -1.82890862e-01, -8.99545178e-02, -3.32807004e-01,
        1.58818513e-01, -1.71533033e-01, -1.73078328e-01, -3.60920876e-02,
        1.46059722e-01, -2.64962375e-01,  2.76981056e-01, -2.12057084e-01,
        1.23996824e-01, -2.02850312e-01,  1.10532055e-02, -2.19364762e-01,
       -2.56515414e-01,  8.59668851e-01, -3.13659430e-01,  7.36804903e-02,
       -2.25314453e-01, -2.31257945e-01, -8.58191401e-02,  6.03953702e-03,
        4.70048070e-01, -1.91677377e-01, -2.42605045e-01, -6.42967671e-02,
        3.41128469e-01, -4.24498171e-01, -1.80519283e-01, -1.19330727e-01,
       -2.24605724e-01, -5.08240834e-02, -1.41700879e-01,  1.39267474e-01,
       -3.44296172e-02,  2.10596412e-01,  1.89147994e-01, -2.41610587e-01,
        4.20441806e-01, -8.15321654e-02, -1.22011870e-01, -1.79017380e-01,
       -5.33852696e-01,  

In [7]:
# Sanity check: nearest neighbours of '경제' (economy) in the trained FastText space.
model.wv.most_similar("경제")

[('경제위기', 0.8050490021705627),
 ('대외', 0.8042980432510376),
 ('경제정책', 0.7995499968528748),
 ('경제학', 0.7953577637672424),
 ('관계', 0.783466100692749),
 ('성과', 0.7813449501991272),
 ('우호', 0.7750430703163147),
 ('자유무역', 0.7726960182189941),
 ('강조', 0.7656141519546509),
 ('시장경제', 0.7585611343383789)]

In [8]:
# Sanity check: nearest neighbours of '손흥민' (Son Heung-min) in the trained FastText space.
model.wv.most_similar("손흥민")

[('기성용', 0.9182633757591248),
 ('토트넘', 0.9130188822746277),
 ('차범근', 0.9050355553627014),
 ('황희', 0.9038408994674683),
 ('이승우', 0.9028437733650208),
 ('EPL', 0.8941956758499146),
 ('태극전사', 0.8891777396202087),
 ('첼시', 0.8862379193305969),
 ('맨시티', 0.8860947489738464),
 ('유니폼', 0.8833632469177246)]

In [9]:
# Sanity check: nearest neighbours of '인천' (Incheon) in the trained FastText space.
model.wv.most_similar("인천")

[('울산', 0.8596231341362),
 ('남양주', 0.8474705815315247),
 ('대구', 0.8407805562019348),
 ('공원', 0.8297332525253296),
 ('전남', 0.8254337310791016),
 ('안산', 0.8251743316650391),
 ('청주', 0.8097084760665894),
 ('충남', 0.8093366026878357),
 ('경남', 0.8083070516586304),
 ('호우경보', 0.803193211555481)]

In [10]:
# Sanity check: nearest neighbours of '금융' (finance) in the trained FastText space.
model.wv.most_similar("금융")

[('금융위', 0.8594071865081787),
 ('금융업', 0.833926796913147),
 ('금감원', 0.7757264971733093),
 ('금융투자', 0.7727395296096802),
 ('은행', 0.7718164324760437),
 ('DLF', 0.7582404017448425),
 ('硏', 0.7555417418479919),
 ('금융기관', 0.7493996024131775),
 ('운용', 0.7489657998085022),
 ('BNK', 0.7481409311294556)]

In [11]:
# Sanity check: nearest neighbours of '미국' (United States) in the trained FastText space.
model.wv.most_similar("미국")

[('이어', 0.7200479507446289),
 ('화웨이', 0.718909740447998),
 ('중동', 0.7131580710411072),
 ('중미', 0.7122350931167603),
 ('뿔', 0.7111219763755798),
 ('주일', 0.7075144648551941),
 ('對', 0.7072343230247498),
 ('전쟁', 0.7063120603561401),
 ('美', 0.7060495615005493),
 ('푸틴', 0.7006589770317078)]