In [5]:
import requests
from bs4 import BeautifulSoup

In [6]:
papers = {}

target_url = 'http://www.kdd.org/kdd2018/accepted-papers'
r = requests.get(target_url)
soup = BeautifulSoup(r.text, 'lxml')
for i, item in enumerate( soup.select('a[href^="http://www.kdd.org/kdd2018/accepted-papers/view/"]') ):
    if i % 20 == 0: print( i )
    title = item.get_text()
    url = item.get('href')
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    s = soup.find('div', {'class': 'g-mb-60'})

    author = s.find('h6').get_text()
    author_list = [ n.strip() for n in author.split(';')]
    abst = s.find('p').get_text()
    try:
        yid = s.find('iframe').attrs['src'].split('/')[-1]
    except:
        yid = 'None'
    try:
        pdf = s.find('a').attrs['href']
    except:
        pdf = 'None'
    
    paper = {}
    paper['title'] = title
    paper['author'] = author_list
    paper['abst'] = abst
    paper['yid'] = yid
    paper['pdf'] = pdf
    papers[i] = paper

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280


In [8]:
def get_texts_from_papers( papers ):
    Texts = []
    for paper in papers.values():
        text = paper['title'] + ' ' + paper['abst']
        text = text.lower()
        Texts.append( text )
    return Texts

def get_BoWs_from_english_rawtext( texts_tmp, pos_filter ):
    BoWs_tmp = []
    for text in texts_tmp:
        tokens = get_tokens( text, pos_filter )
        BoWs_tmp.append( tokens )
    return BoWs_tmp

from nltk import word_tokenize, pos_tag
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def get_tokens( text, pos_filter ):
    tokens = []
    words = word_tokenize( text )
    words = list(map(stemmer.stem, words))
    for token in pos_tag( words ):
        if token[1] in pos_filter:
            tokens.append( token[0] )
    return tokens

import gensim
from gensim import corpora, models, similarities

def get_tfidf( BoWs_tmp ):
    dictionary = corpora.Dictionary( BoWs_tmp )
    dictionary.filter_extremes(no_below=2, no_above=0.5 )
    corpus = [dictionary.doc2bow(text) for text in BoWs_tmp]
    tfidf = models.TfidfModel(corpus)
    return corpus, dictionary, tfidf

def get_tfidf_each_document( Corpus, Dictionary, Tfidf ):
    TfidfScores = {}
    docNum = len( Corpus )
    for i in range( docNum ):
        tfidfScore = {}
        tfidfCorpus = [ Tfidf[c] for c in Corpus ]
        for j, wc in enumerate( sorted( tfidfCorpus[i], key=lambda x:x[1], reverse=True ) ):
            tfidfScore[ Dictionary[wc[0]] ] = wc[1]
        TfidfScores[i] = tfidfScore
    return TfidfScores

def get_tfidf_each_word( TfidfScores ):
    wordTfidf = {}
    for tfidf_score in TfidfScores.values():
        for w, score in tfidf_score.items():
            if w in wordTfidf:
                wordTfidf[w] += score
            else:
                wordTfidf[w] = score
    return wordTfidf

def get_keywords( TfidfEachWord, topN ):
    n = 0
    for w, score in sorted( TfidfEachWord.items(), key=lambda x:x[1], reverse=True):
        if n >= topN: continue
        n += 1
        print( '   %.3f %s' % ( score, w) )

def get_keywords_each_document( TfidfScores, topN ):
    for i, tfidfScore in TfidfScores.items():
        print( i, papers[i]['title'] )
        n = 0
        for w, score in sorted( tfidfScore.items(), key=lambda x:x[1], reverse=True):
            if n >= topN: continue
            n += 1
            print( '   %.3f %s' % ( score, w) )        
        

In [5]:
import json
fw = open( 'kdd2018_papers.json', 'w' )
json.dump( papers, fw, indent=4 )

### TFIDF

In [16]:
filteringPos = ['NN', 'NNP', 'JJ']
Texts = get_texts_from_papers( papers )
BoWsNoun = get_BoWs_from_english_rawtext( Texts, filteringPos )

In [17]:
Corpus, Dictionary, Tfidf = get_tfidf( BoWsNoun )
TfidfScores = get_tfidf_each_document( Corpus, Dictionary, Tfidf )
TfidfEachWord = get_tfidf_each_word( TfidfScores )

In [18]:
get_keywords( TfidfEachWord, 10 )

   7.976 network
   7.682 graph
   6.735 user
   6.429 recommend
   6.426 system
   5.818 method
   5.469 featur
   5.391 algorithm
   5.286 predict
   5.060 deep


In [19]:
get_keywords_each_document( TfidfScores, 5 )

0 Smoothed Dilated Convolutions for Improved Dense Prediction
   0.603 convolut
   0.359 dcnn
   0.240 smooth
   0.220 artifact
   0.220 decomposit
1 Discovering Non-Redundant K-means Clusterings in Optimal Subspaces
   0.472 cluster
   0.316 k-mean
   0.268 subspac
   0.248 particularli
   0.240 high-dimension
2 Trajectory-driven Influential Billboard Placement
   0.401 influenti
   0.300 budget
   0.280 trajectori
   0.272 influenc
   0.272 ratio
3 Multi-Type Itemset Embedding for Learning Behavior Success
   0.566 item
   0.481 behavior
   0.300 success
   0.247 context
   0.150 maker
4 STAMP: Short-Term Attention/Memory Priority Model for Session-based Recommendation
   0.340 short-term
   0.291 memori
   0.287 session
   0.275 user
   0.244 prioriti
5 Calibrated Multi-Task Learning
   0.399 non-convex
   0.388 regular
   0.326 calibr
   0.285 loss
   0.228 multi-task
6 Unlearn What You Have Learned: Adaptive Crowd Teaching with Exponentially Decayed Memory Learners
   0.677 crowds

In [None]:
### Tfidf keyword edge

In [20]:
th = 0.3
keywordsEachDocument = {}
for i, scores in TfidfScores.items():
    keywordsEachDocument[i] = []
    for w, score in scores.items():
        if score > th: keywordsEachDocument[i].append( w )

In [21]:
tf = {}
for i, keywords in keywordsEachDocument.items():
    for k in keywords:
        if not k in tf: tf[k] = 0
        tf[k] += 1

In [22]:
papersWithSameKeyword = {}
for w, freq in sorted( tf.items(), key= lambda x:x[1], reverse=True ):
    if freq < 3: continue
    if len(w) < 2: continue
    print( 'Keyword: %s (Number of papers: %d)' % ( w, freq) )
    papersWithSameKeyword[w] = []
    for i, keywords in keywordsEachDocument.items():
        if w in keywords:
            print(  '   Title:', papers[i]['title'] )
            papersWithSameKeyword[w].append( i )

Keyword: graph (Number of papers: 10)
   Title: Graph Classification using Structural Attention
   Title: NetLSD: Hearing the Shape of a Graph
   Title: SpotLight: Detecting Anomalies in Streaming Graphs
   Title: Adversarial Attacks on Neural Networks for Graph Data
   Title: EvoGraph: An Effective and Efficient Graph Upscaling Method for Preserving Graph Properties
   Title: Large-Scale Learnable Graph Convolutional Networks
   Title: An Iterative Global Structure-Assisted Labeled Network Aligner
   Title: D2K: Scalable Community Detection in Massive Networks via Small-Diameter k-Plexes
   Title: Approximating the Spectrum of a Graph
   Title: Graph Convolutional Neural Networks for Web-Scale Recommender Systems
Keyword: cluster (Number of papers: 7)
   Title: Discovering Non-Redundant K-means Clusterings in Optimal Subspaces
   Title: Scalable k-Means Clustering via Lightweight Coresets
   Title: Spectral Clustering of Large-scale Data by Directly Solving Normalized Cut
   Title: Mu

In [17]:
# Gephi (GDF)
'''
nodedef>name VARCHAR,title VARCHAR,label VARCHAR,
1,title,article
2,title,article
3,title,tag
edgedef>node1 VARCHAR,node2 VARCHAR
1,2
2,3
3,2
3,1
'''

f = open( './graph/kdd2018.gdf', 'w', encoding='shift-jis', errors='ignore' )
# node
f.write( 'nodedef>name VARCHAR,tag VARCHAR,label VARCHAR,\n' )
### paper
for i, p in papers.items():
    f.write('%d,%s,%s,\n' % ( i, 'paper', p['title'].replace(' ', '_') ) )

### tag
widx = {}
idx = len( papers )
for w in papersWithSameKeyword.keys():
    f.write('%d,%s,%s,\n' % ( idx, 'keyword', w ) )
    widx[w] = idx
    idx += 1
    
# edge
f.write( 'edgedef>node1 VARCHAR,node2 VARCHAR\n' )
for w, pidxList in papersWithSameKeyword.items():
    for pidx in pidxList:
        f.write('%d,%d' % (widx[w], pidx) )
        f.write('\n')
f.close()

In [46]:
### Title Edge

In [73]:
from nltk.corpus import stopwords

stop_words = frozenset(stopwords.words('english'))

tf = {}
df = {}
for i, p in papers.items():
    title = p['title']
    title = title.lower()
    title = get_tokens( title, filteringPos )
    for w in title:
        if not w in tf: tf[w] = 0
        tf[w] += 1        
        

In [74]:
for w, freq in sorted( tf.items(), key= lambda x:x[1], reverse=True ):
    print( w, freq )

learning 44
deep 30
network 24
prediction 17
detection 16
search 14
recommendation 13
graph 12
discovery 12
dynamic 11
online 11
neural 10
approach 10
adversarial 9
efficient 9
optimization 9
system 9
framework 9
scalable 8
reinforcement 8
information 8
analysis 8
large-scale 8
time 8
machine 8
model 7
active 7
metric 7
multi-task 6
text 6
representation 6
extraction 6
risk 6
feature 6
visual 6
fast 6
optimal 5
adaptive 5
heterogeneous 5
hashing 5
recurrent 5
knowledge 5
new 5
recommender 5
spatio-temporal 5
user 5
local 5
product 5
social 5
behavior 4
memory 4
interactive 4
semantic 4
robust 4
classification 4
inference 4
structural 4
mechanism 4
modeling 4
regression 4
estimation 4
empirical 4
attention 4
mining 4
application 4
transfer 4
effective 4
embedding 4
clustering 4
interpretable 4
taxonomy 4
mobile 4
latent 4
big 4
planning 4
algorithm 4
e-commerce 4
real-time 4
quality 4
event 4
proximity 3
graphs 3
error 3
supervised 3
semi-supervised 3
gradient 3
treatment 3
unsupervised

In [None]:
### Author Edge

In [47]:
# get scrapbox format
import json
from pprint import pprint

In [56]:
f = open( 'kdd2018.json', 'r' )
j = json.load( f )

In [58]:
for i, paper in papers.items():
    page = {}
    page['title'] = paper['title']

    page['lines'] = []
    page['lines'].append( paper['title'] )

    page['lines'].append( '' )   
    aut = ''
    for a in paper['author']:
        a = a.replace(' ', '_')
        aut += '#%s; ' % a
    page['lines'].append( aut )
    
    page['lines'].append( '' )
    page['lines'].append( paper['abst'] )

    if paper['yid'] == 'None':
        page['lines'].append( '' )
        page['lines'].append( 'The video does not exist' )    
    else:
        page['lines'].append( '' )
        page['lines'].append( '[https://www.youtube.com/watch?v=%s]' % paper['yid'] )    
    
    if paper['pdf'] == 'None':
        page['lines'].append( '' )
        page['lines'].append( 'The PDF does not exist' )  
    else:
        page['lines'].append( '' )
        page['lines'].append( 'PDF url: %s' % paper['pdf'] )
    
    j['pages'].append( page )

In [59]:
fw = open( 'kdd2018_.json', 'w' )
json.dump( j, fw, indent=4 )