In [4]:
'''
[ gensim version ]

'''
# Libraries

import os 
TEMP_FOLDER = 'tmp/gensim'
RES_FOLDER = 'res'
from smart_open import open
import json

# gensim
import logging 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 
import tempfile 
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER)) 
import warnings 
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim') 
from gensim import corpora 
import gensim 
import numpy as np 
import pandas as pd 
import csv 
import collections 
import random 
from pprint import pprint  # pretty-printer 

# tensorflow
import tqdm
import tensorflow as tf

# directories

# Number of content shards expected under RES_FOLDER/contents (data.0 .. data.6).
CONTENTS_SIZE = 7

# Input locations.
contents_files = [ os.path.join(RES_FOLDER, 'contents/data.'+str(i)) for i in range(CONTENTS_SIZE) ]
metadata_file = os.path.join(RES_FOLDER, 'metadata.json')

# TEMP_FOLDER is a nested path, so the intermediate directories must be created
# too; os.mkdir would fail when the parent folder is missing. exist_ok=True keeps
# the cell idempotent on re-runs.
os.makedirs(TEMP_FOLDER, exist_ok=True)

# Intermediate artefacts written by the cells below.
documents_file = os.path.join(TEMP_FOLDER, 'documents_dict.json')
words_file = os.path.join(TEMP_FOLDER, 'words_dict.json')
preprocess_words_file = os.path.join(TEMP_FOLDER, 'preprocess_words_dict.json')

all_words_list = os.path.join(TEMP_FOLDER, 'all_words')


import string
def normalize_text(text) :
    """Return ``text`` with leading and trailing punctuation removed.

    Only characters listed in ``string.punctuation`` are stripped, and only
    from the two ends of the string; punctuation inside the text is kept.
    """
    edge_chars = string.punctuation
    return text.strip(edge_chars)

Folder "tmp/gensim" will be used to save temporary dictionary and corpus.


In [5]:
'''

[ Make Valid Documents Dictionary ]
- From metadata

[ Make Word Dictionary ]

'''

# documents : document id -> metadata record (title / sub_title replaced by word ids)
# word_dict : word -> integer id, assigned in order of first appearance (0-based)
documents = {}
word_dict = {}


def _encode_words(text, vocabulary) :
    """Split ``text`` on whitespace and return the word id of every token.

    Each token is passed through ``normalize_text``; tokens that are empty
    after normalization are skipped. Unseen tokens are added to ``vocabulary``
    with the next free id (``len(vocabulary)``).

    Args:
        text: the string to encode; ``None`` or an empty string yields ``[]``.
        vocabulary: dict mapping word -> id, updated in place.

    Returns:
        List of integer word ids, in token order.
    """
    word_ids = []
    for token in (text or '').split() :
        token = normalize_text(token)
        if not token :
            # punctuation-only token: nothing left to index
            continue
        if token not in vocabulary :
            vocabulary[token] = len(vocabulary)
        word_ids.append(vocabulary[token])
    return word_ids


# The metadata file holds one JSON object per line. Each record carries the keys
# magazine_id, user_id, title, keyword_list, display_url, sub_title, reg_ts,
# article_id and id (for example id '@bookdb_782').
with open(metadata_file, 'r', encoding='utf-8') as f :
    for raw in f :
        if not raw.strip() :
            # tolerate blank lines (e.g. a trailing newline)
            continue
        line = json.loads(raw)

        # document id: the first record seen for an id is the one that is kept
        record = documents.setdefault(line['id'], line)

        # keyword
        for keyword in line['keyword_list'] :
            if keyword not in word_dict :
                word_dict[keyword] = len(word_dict)

        # title / sub_title: encode word by word. Iterating the string itself
        # would walk over single characters instead of words.
        encoded_title = _encode_words(line['title'], word_dict)
        encoded_sub_title = _encode_words(line['sub_title'], word_dict)

        record['title'] = encoded_title
        record['sub_title'] = encoded_sub_title

# save document file
with open(documents_file, 'w') as f :
    f.write(json.dumps(documents))

# save word dictionary file
with open(words_file, 'w') as f :
    f.write(json.dumps(word_dict))
        
        

In [7]:
'''

[ Make word dictionary ]
- From contents/data.0 ~ 6
- Using metadata

- for the gensim project, the word dict cannot be updated later.
  -> store all words beforehand
  -> preprocess_word_dict = {}
'''

# `tqdm_notebook` is not imported anywhere in this notebook (only the `tqdm`
# module is), so import a progress bar explicitly. The alias avoids shadowing
# the `tqdm` module name.
from tqdm.auto import tqdm as progress_bar

# load document file (written by the metadata cell)
with open(documents_file, 'r') as f :
    documents = json.loads(f.read())

# load word dictionary file (word -> id)
with open(words_file, 'r') as f :
    word_dict = json.loads(f.read())


# 1. word dictionary & max length of sentences

# for training gensim initialize:
# word id -> id of the first document in which that word was seen
preprocess_word_dict = {}

# largest len(line['morphs']) over all documents that exist in the metadata
MAX_SENTENCE_LEN = 0
for contents_path in progress_bar(contents_files, desc='contents') :
    # read each file
    with open(contents_path, 'r', encoding='utf-8') as f :
        # read each line (one JSON object per line)
        for raw in f :
            if not raw.strip() :
                # tolerate blank lines (e.g. a trailing newline)
                continue
            line = json.loads(raw)

            doc_id = line['id']
            # if not exists in metadata -> ignore
            if doc_id not in documents :
                continue

            # NOTE(review): assumes 'morphs' is a flat list of hashable tokens - confirm
            morphs = line['morphs']
            MAX_SENTENCE_LEN = max(MAX_SENTENCE_LEN, len(morphs))

            for morph in morphs :
                # unseen morphs get the next free id; known ones keep theirs
                word_id = word_dict.setdefault(morph, len(word_dict))
                # remember only the first document a word id appears in
                preprocess_word_dict.setdefault(word_id, doc_id)

# save word dictionary file (now extended with the content morphs)
with open(words_file, 'w') as f :
    f.write(json.dumps(word_dict))

# save preprocess word dictionary file
# (JSON object keys are strings, so the integer word ids are stored as strings)
with open(preprocess_words_file, 'w') as f :
    f.write(json.dumps(preprocess_word_dict))
            


643104
643104
https://brunch.co.kr/@bookdb/782


In [2]:
def get_value_from_dict(dictionary, key) :
    """Look up ``key`` in ``dictionary``, registering it first if it is missing.

    A missing key is stored with the value ``len(dictionary) + 1``, so the
    values handed out start at 1 rather than 0. The dictionary is modified
    in place.

    Returns:
        The value stored for ``key``.
    """
    if key in dictionary :
        return dictionary[key]

    # unseen key: numbering starts from 1 (not zero)
    next_value = len(dictionary) + 1
    dictionary[key] = next_value
    return next_value

# Read the metadata file as line-delimited JSON (one record per line) into a DataFrame.
metadata = pd.read_json(metadata_file, lines=True)

# Show (number of rows, number of columns) as the cell output.
metadata.shape

(643104, 9)

In [3]:
# Preview the first rows of the metadata frame to check the parsed columns.
metadata.head()

Unnamed: 0,article_id,display_url,id,keyword_list,magazine_id,reg_ts,sub_title,title,user_id
0,782,https://brunch.co.kr/@bookdb/782,@bookdb_782,"[여행, 호주, 국립공원]",8982,1474944427000,세상 어디에도 없는 호주 Top 10,"사진으로 옮기기에도 아까운, 리치필드 국립공원",@bookdb
1,81,https://brunch.co.kr/@kohwang56/81,@kohwang56_81,"[목련꽃, 아지랑이, 동행]",12081,1463092749000,,[시] 서러운 봄,@kohwang56
2,4,https://brunch.co.kr/@hannahajink/4,@hannahajink_4,[],0,1447997287000,무엇 때문에,무엇을 위해,@hannahajink
3,88,https://brunch.co.kr/@bryceandjuli/88,@bryceandjuli_88,"[감정, 마음, 위로]",16315,1491055161000,,싫다,@bryceandjuli
4,34,https://brunch.co.kr/@mijeongpark/34,@mijeongpark_34,"[유럽여행, 더블린, 아일랜드]",29363,1523292942000,#7. 내 친구의 집은 어디인가,Dubliner#7,@mijeongpark


In [4]:
# Inspect the per-article keyword lists (the displayed Series is truncated by pandas).
metadata['keyword_list']

0                [여행, 호주, 국립공원]
1               [목련꽃, 아지랑이, 동행]
2                            []
3                  [감정, 마음, 위로]
4             [유럽여행, 더블린, 아일랜드]
5            [석유에너지, 베네수엘라, 경제]
6           [입찰, 유치권, 부동산경매변호사]
7                [사랑, 연애, rain]
8                     [메일, 출판사]
9               [도시애벌레, 공부, 동시]
10                [패션, 에세이, 경제]
11              [불꽃축제, 불꽃, 여의도]
12               [주머니, 동시, 빵가게]
13                 [생각, 과거, 무시]
14                 [단상, 여행, 생각]
15                 [시험, 아침조회시간]
16             [감성에세이, 사랑, 첫사랑]
17            [애견놀이터, 애견, 애견카페]
18                           []
19                 [엄마, 취향, 효도]
20                [칼국수, 대화, 저항]
21               [세계여행, 여행, 행복]
22                [버스, 지하철, 친구]
23              [그림자, 하이에나, 도시]
24                   [자작소설, 창작]
25               [팀워크, 성과, 개인기]
26              [브랜드, 브랜딩, 마케팅]
27             [일자리, 청년실업, 저출산]
28               [취업, 사회생활, 결혼]
29                           []
                  ...          
643074  