# Step 1: IMPORT THE LIBRARIES

In [1]:
#pip install pymongo

In [2]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from bson.objectid import ObjectId

# Step 2: CONNECTION TO THE DATABASE

In [3]:
client = MongoClient('mongodb://203.255.92.141:27017', authSource='admin')

In [4]:
db = client['SCIENCEON']

In [5]:
db.list_collection_names()

['QueryKeyword',
 'ExpertFactor',
 'Author',
 'Rawdata',
 'ExpertFactorTable',
 'AuthorPapers',
 'AuthorRelation']

In [6]:
scienceOn_author = db['Author']
scienceOn_authorPapers = db['AuthorPapers']
scienceOn_rawData = db['Rawdata']

# Step 3: FIND THE AUTHOR RESEARCH ID

In [7]:
author_cursor = scienceOn_author.find({'name':'유재수', 'inst': '충북대학교'})

In [8]:
for author in author_cursor:
    researcher_ID = author['_id']

In [9]:
print(researcher_ID)

s101957


# Step 4: FIND ALL THE PAPERS OF THE AUTHOR HAVING THE RESEARCH ID WE FOUND & PUT IT IN A DATAFRAME

CREATE A DATAFRAME WHICH SHOULD COUNTAIN ALL THE PAPERS

In [10]:
dfPapers = pd.DataFrame(columns=['papers'])

COLLECT ALL THE PAPERS

In [11]:
authorPapers_cursor = scienceOn_authorPapers.find({'A_ID':researcher_ID})
for authorPapers in authorPapers_cursor:
    papers = authorPapers['papers']
    for i in range(len(papers)):
        papersID = papers[i]
        objInstance = ObjectId(papersID)
        rawData_cursor = scienceOn_rawData.find({ "_id" : objInstance })
        for document in rawData_cursor:
            if type(document['paper_keyword']) != list:
                new_document = document['title'] + ' ' + document['english_title'] + ' ' + document['abstract'] + ' ' + document['paper_keyword'] + ' ' + document['english_abstract']
            else:
                paper_keyword = ''
                for j in range(len(document['paper_keyword'])):
                    paper_keyword += document['paper_keyword'][j] + ' '
                new_document = document['title'] + ' ' + document['english_title'] + ' ' + document['abstract'] + paper_keyword + document['english_abstract']
            df_new_document = pd.DataFrame(data=np.array([[new_document]]), columns=['papers'])
            dfPapers = pd.concat([dfPapers,df_new_document], ignore_index=True)

In [12]:
documents = dfPapers

# Step 5: PRINT THE DATAFRAME

In [13]:
documents

Unnamed: 0,papers
0,HDFS에서 적응형 캐시 관리 기법 Adaptive Cache Management ...
1,이미지 데이터 마이닝을 이용한 모바일 기반 금형 검색 시스템 A Mold Searc...
2,빅데이터 활성화 정책 및 응용 사례 [] 다양한 정보 채널의 등장과 함께 빅데이터에...
3,빅데이터 병렬 처리 기술 동향 [] []
4,4차 산업혁명에서 빅데이터 [] []
5,화장품 추천을 위한 개인의 피부 유형 및 유전자를 이용한 빅데이터 분석 기반 모바일...
6,NoSQL 데이터베이스 엔진을 이용한 스토리지 벤치마킹 시스템 Storage Ben...
7,노화 관련 유전자의 후성유전학적 접근 Epigenomic Approaches for...
8,이미지 데이터 마이닝을 이용한 모바일 기반 금형 검색 시스템 A Mold Searc...
9,YCSB 기반의 데이터베이스 엔진 벤치마킹 GUI 설계 Design of GUI f...


# LDA PART

# Step 1: Data Cleaning & Prepare text for LDA analysis

In [14]:
import nltk
import gensim
import numpy as np

from nltk.stem.porter import *
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

stemmer = SnowballStemmer("english")

In [15]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos ='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return result

In [16]:
preprocessed_docs = documents['papers'].map(preprocess)
preprocessed_docs[:10]

0    [hdfs에서, 적응형, adapt, cach, manag, scheme, hdfs...
1    [이미지, 데이터, 마이닝을, 이용한, 모바일, 시스템, mold, search, ...
2    [빅데이터, 활성화, 다양한, 채널의, 등장과, 빅데이터에, 중요성이, 부각되고, ...
3                                               [빅데이터]
4                                       [산업혁명에서, 빅데이터]
5    [화장품, 추천을, 개인의, 유전자를, 이용한, 빅데이터, 모바일, 서비스, big...
6    [nosql, 데이터베이스, 엔진을, 이용한, 스토리지, 벤치마킹, 시스템, sto...
7    [유전자의, 후성유전학적, epigenom, approach, regul, age,...
8    [이미지, 데이터, 마이닝을, 이용한, 모바일, 시스템, mold, search, ...
9    [ycsb, 기반의, 데이터베이스, 벤치마킹, gui, design, gui, be...
Name: papers, dtype: object

# Step 2: Dictionary & corpus 

In [17]:
dictionary = gensim.corpora.Dictionary(preprocessed_docs)
count = 0 
for k, v in dictionary.iteritems():
    print(k, v, dictionary.dfs[k])
    count += 1
    if count > 286 :
        break

0 adapt 1
1 cach 1
2 hdfs 1
3 hdfs에서 1
4 hdfs을 1
5 ict 1
6 manag 1
7 scheme 5
8 공정의 1
9 관리하기 1
10 관리한다 1
11 기법은 4
12 기법을 8
13 기법의 1
14 기업들은 1
15 기존보다 1
16 나타나는 1
17 노드에서 1
18 노드의 1
19 높이고 1
20 다양한 8
21 데이터 6
22 데이터가 1
23 데이터를 5
24 디스크 1
25 디스크의 1
26 방대한 1
27 보인다 1
28 분석하여 2
29 속도에 1
30 속도의 1
31 스마트팩토리는 1
32 이용한 12
33 이용한다 1
34 읽기와 1
35 적용할 1
36 적응형 1
37 적합한 2
38 정보통신기술 1
39 제안하는 8
40 제안한다 14
41 제어하고 1
42 처리하기 1
43 크기를 1
44 크기의 1
45 평가를 1
46 하둡을 1
47 향상과 1
48 확장시 1
49 환경에서 1
50 효과를 1
51 효율성을 1
52 효율적으로 1
53 base 6
54 big 5
55 cloud 2
56 data 6
57 icbm 2
58 idustri 2
59 imag 2
60 iot 3
61 mine 2
62 mobil 5
63 mold 2
64 search 2
65 각광을 2
66 검색하여 2
67 구축하고 3
68 그림파일을 2
69 금형에 2
70 금형의 2
71 기반의 11
72 기술이 2
73 논문에서는 20
74 데이터베이스로 2
75 도래에 2
76 마이닝을 2
77 매칭시켜 2
78 모바일 5
79 사용자가 2
80 산업분야에서도 2
81 산업혁명 2
82 설계도면 2
83 수집하여 5
84 스마트팩토리 2
85 시대의 6
86 시스템 10
87 시스템을 7
88 이미지 2
89 이미지만으로 2
90 있으며 6
91 정보를 8
92 정보와 2
93 필요로 2
94 핵심요소로 2
95 힘입어 2
96 blsrc 1
97 개발하고 1
98 개발한 1
99 국가별 1
100 기술을 1
101 기술

In [18]:
print(dictionary)

Dictionary(475 unique tokens: ['adapt', 'cach', 'hdfs', 'hdfs에서', 'hdfs을']...)


In [19]:
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

In [20]:
print(bow_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 4), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)], [(21, 1), (32, 1), (40, 1), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 2), (70, 1), (71, 1), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1)], [(20, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 2), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1)

In [21]:
for i in range(len(bow_corpus)):
    for j in range(len(bow_corpus[i])):
        print("Word {} (\"{}\") appears {} time.".format(bow_corpus[i][j][0], dictionary[bow_corpus[i][j][0]], bow_corpus[i][j][1]))

Word 0 ("adapt") appears 1 time.
Word 1 ("cach") appears 1 time.
Word 2 ("hdfs") appears 1 time.
Word 3 ("hdfs에서") appears 1 time.
Word 4 ("hdfs을") appears 1 time.
Word 5 ("ict") appears 1 time.
Word 6 ("manag") appears 1 time.
Word 7 ("scheme") appears 1 time.
Word 8 ("공정의") appears 1 time.
Word 9 ("관리하기") appears 1 time.
Word 10 ("관리한다") appears 1 time.
Word 11 ("기법은") appears 1 time.
Word 12 ("기법을") appears 1 time.
Word 13 ("기법의") appears 1 time.
Word 14 ("기업들은") appears 1 time.
Word 15 ("기존보다") appears 1 time.
Word 16 ("나타나는") appears 1 time.
Word 17 ("노드에서") appears 1 time.
Word 18 ("노드의") appears 1 time.
Word 19 ("높이고") appears 1 time.
Word 20 ("다양한") appears 1 time.
Word 21 ("데이터") appears 4 time.
Word 22 ("데이터가") appears 1 time.
Word 23 ("데이터를") appears 2 time.
Word 24 ("디스크") appears 1 time.
Word 25 ("디스크의") appears 1 time.
Word 26 ("방대한") appears 1 time.
Word 27 ("보인다") appears 1 time.
Word 28 ("분석하여") appears 2 time.
Word 29 ("속도에") appears 1 time.
Word 30 ("속도의") appears 1 

Word 144 ("가능해짐으로써") appears 1 time.
Word 145 ("개인마다") appears 1 time.
Word 146 ("개인의") appears 3 time.
Word 147 ("검사가") appears 1 time.
Word 148 ("고려하고") appears 1 time.
Word 149 ("나만의") appears 1 time.
Word 150 ("나타나고") appears 1 time.
Word 151 ("네트워크에서의") appears 1 time.
Word 152 ("다르다") appears 1 time.
Word 153 ("맞춤형") appears 1 time.
Word 154 ("민간기관에서도") appears 1 time.
Word 155 ("분석도") appears 1 time.
Word 156 ("분석을") appears 1 time.
Word 157 ("사람의") appears 1 time.
Word 158 ("사용에") appears 1 time.
Word 159 ("상기와") appears 1 time.
Word 160 ("상태에") appears 1 time.
Word 161 ("상태의") appears 1 time.
Word 162 ("선호도가") appears 1 time.
Word 163 ("선호도는") appears 1 time.
Word 164 ("소비자들의") appears 1 time.
Word 165 ("순으로") appears 1 time.
Word 166 ("실정으로") appears 1 time.
Word 167 ("유전자") appears 3 time.
Word 168 ("유전자를") appears 1 time.
Word 169 ("유형과") appears 1 time.
Word 170 ("자세한") appears 1 time.
Word 171 ("차이가") appears 1 time.
Word 172 ("추천을") appears 1 time.
Word 173 ("카운슬링") appe

In [22]:
print(bow_corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 4), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)]


In [23]:
corpus_dictionary = {}
for i in range(len(bow_corpus)):
    for j in range(len(bow_corpus[i])):
        if bow_corpus[i][j][0] not in corpus_dictionary.keys():
            corpus_dictionary[bow_corpus[i][j][0]] = bow_corpus[i][j][1]
        else:
            corpus_dictionary[bow_corpus[i][j][0]] = corpus_dictionary[bow_corpus[i][j][0]] + bow_corpus[i][j][1]
sortedDict = sorted(corpus_dictionary.items(), key=lambda x: x[1], reverse=True)

In [24]:
for i in range (len(sortedDict[:5])):
    print("Word {} (\"{}\") appears {} time.".format(sortedDict[i][0], dictionary[sortedDict[i][0]], sortedDict[i][1]))

Word 188 ("벤치마킹") appears 35 time.
Word 182 ("nosql") appears 23 time.
Word 73 ("논문에서는") appears 20 time.
Word 303 ("그래프") appears 20 time.
Word 20 ("다양한") appears 16 time.


# Step 3: LDA model training

In [25]:
lda_model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                       id2word=dictionary,
                                       num_topics=5)

In [26]:
keywords = lda_model.print_topics(-1,5)

In [27]:
keywords

[(0,
  '0.046*"벤치마킹" + 0.027*"nosql" + 0.018*"스토리지" + 0.018*"데이터베이스" + 0.013*"이용한"'),
 (1, '0.025*"그래프" + 0.019*"데이터" + 0.015*"구조적" + 0.014*"제안한다" + 0.013*"고려한"'),
 (2,
  '0.024*"그래프" + 0.019*"유전자" + 0.011*"제안한다" + 0.008*"논문에서는" + 0.008*"data"'),
 (3,
  '0.038*"벤치마킹" + 0.022*"nosql" + 0.015*"분석을" + 0.014*"논문에서는" + 0.013*"데이터베이스"'),
 (4,
  '0.023*"nosql" + 0.021*"벤치마킹" + 0.020*"빅데이터" + 0.020*"스토리지" + 0.016*"이용한"')]

In [28]:
for idx, topic in lda_model.print_topics(-1,5):
    print('Topic: {} \nwords: {}'.format(idx, topic))

Topic: 0 
words: 0.046*"벤치마킹" + 0.027*"nosql" + 0.018*"스토리지" + 0.018*"데이터베이스" + 0.013*"이용한"
Topic: 1 
words: 0.025*"그래프" + 0.019*"데이터" + 0.015*"구조적" + 0.014*"제안한다" + 0.013*"고려한"
Topic: 2 
words: 0.024*"그래프" + 0.019*"유전자" + 0.011*"제안한다" + 0.008*"논문에서는" + 0.008*"data"
Topic: 3 
words: 0.038*"벤치마킹" + 0.022*"nosql" + 0.015*"분석을" + 0.014*"논문에서는" + 0.013*"데이터베이스"
Topic: 4 
words: 0.023*"nosql" + 0.021*"벤치마킹" + 0.020*"빅데이터" + 0.020*"스토리지" + 0.016*"이용한"


# Step 4: Visualization

In [29]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
LDAvis_prepared