In [1]:
import pandas as pd
import numpy as np
import json
import feather
from operator import itemgetter

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
ml_papers = pd.read_pickle("dblp.pkl")

In [24]:
top_10_venues = ml_papers.venue.value_counts().index.tolist()[1:11]
top_10_venues

['Lecture Notes in Computer Science',
 'international conference on acoustics, speech, and signal processing',
 'international conference on robotics and automation',
 'international conference on image processing',
 'international conference on communications',
 'international symposium on circuits and systems',
 'global communications conference',
 'international geoscience and remote sensing symposium',
 'intelligent robots and systems',
 'conference of the international speech communication association']

In [25]:
ml_papers = ml_papers[ml_papers.venue.isin(top_10_venues)]

In [43]:
ml_papers.head()

Unnamed: 0,authors,id,n_citation,references,title,venue,year
17,"[Milos Zelezný, Petr Císar, Zdenek Krnoul, Jan...",00e02aeb-b424-4ca8-b3ca-6e18e322f79e,7,,Design of an audio-visual speech corpus for th...,conference of the international speech communi...,2002
54,"[Claudius Gläser, Martin Heckmann, Frank Joubl...",02e8e38f-ed5c-43a4-b3a4-c5064a725a2d,2,"[18b17dbd-4f51-411e-a099-efadf521f0d8, 24c7948...",Auditory-based formant estimation in noise usi...,conference of the international speech communi...,2008
83,"[Thomas Portele, Silke Goronzy, Martin Emele, ...",0564b85b-eac4-4a59-951e-3d8badc0c3e7,50,"[6bb66751-ba4a-4740-9bc6-a41500f33022, a89dda2...",Smartkom-home - an advanced multi-modal interf...,conference of the international speech communi...,2003
90,"[Guo-Hong Ding, Yifei Zhu, Chengrong Li, Bo Xu]",0602535c-1d37-4e96-9882-9be45ecd334a,14,,Implementing vocal tract length normalization ...,conference of the international speech communi...,2002
145,"[Mireia Farrús, Michael Wagner, Jan Anguita, J...",0a11cd87-659d-45b8-b140-d8a8ff0974ee,50,"[23af12d9-481d-459a-a1e2-23390f7b3c9a, 4047558...",Robustness of prosodic features to voice imita...,conference of the international speech communi...,2008


## LDA

In [28]:
documents = ml_papers[['id', 'title']]

In [29]:
documents.head()

Unnamed: 0,id,title
17,00e02aeb-b424-4ca8-b3ca-6e18e322f79e,Design of an audio-visual speech corpus for th...
54,02e8e38f-ed5c-43a4-b3a4-c5064a725a2d,Auditory-based formant estimation in noise usi...
83,0564b85b-eac4-4a59-951e-3d8badc0c3e7,Smartkom-home - an advanced multi-modal interf...
90,0602535c-1d37-4e96-9882-9be45ecd334a,Implementing vocal tract length normalization ...
145,0a11cd87-659d-45b8-b140-d8a8ff0974ee,Robustness of prosodic features to voice imita...


In [44]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(3)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tniyomkarn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
documents['index'] = documents.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [33]:
processed_docs = documents['title'].map(preprocess)
processed_docs[:10]

17     [design, audio, visual, speech, corpus, czech,...
54     [auditori, base, formant, estim, nois, probabi...
83     [smartkom, home, advanc, multi, modal, interfa...
90     [implement, vocal, tract, length, normal, mllr...
145                 [robust, prosod, featur, voic, imit]
148             [text, speech, convers, applic, swedish]
152    [phonem, recognit, combin, bayesian, linear, d...
160    [languag, develop, extrem, childhood, depriv, ...
175    [harmon, filter, joint, estim, pitch, voic, so...
181      [high, perform, digit, recognit, real, environ]
Name: title, dtype: object

In [34]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 audio
1 corpus
2 czech
3 design
4 speech
5 synthesi
6 visual
7 auditori
8 base
9 estim
10 formant


In [35]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [36]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(8, 1), (67, 1), (89, 1), (105, 1), (288, 1), (291, 1), (627, 1)]

In [37]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 8 ("base") appears 1 time.
Word 67 ("japanes") appears 1 time.
Word 89 ("dynam") appears 1 time.
Word 105 ("generat") appears 1 time.
Word 288 ("error") appears 1 time.
Word 291 ("predict") appears 1 time.
Word 627 ("question") appears 1 time.


In [38]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.535105759434703),
 (1, 0.34467077660903883),
 (2, 0.4422136096566205),
 (3, 0.17580047941595725),
 (4, 0.3311877422331826),
 (5, 0.24812578847444544),
 (6, 0.4440196722348407)]


In [45]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [46]:
for idx, topic in lda_model.print_topics(-1, 20):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.025*"channel" + 0.021*"cod" + 0.019*"time" + 0.016*"system" + 0.016*"filter" + 0.016*"estim" + 0.014*"base" + 0.013*"signal" + 0.013*"mimo" + 0.012*"algorithm" + 0.011*"frequenc" + 0.010*"adapt" + 0.009*"rate" + 0.009*"optim" + 0.009*"linear" + 0.008*"analysi" + 0.008*"error" + 0.008*"video" + 0.008*"complex" + 0.008*"design"
Topic: 1 
Words: 0.044*"robot" + 0.020*"base" + 0.018*"model" + 0.016*"control" + 0.012*"mobil" + 0.011*"design" + 0.010*"manipul" + 0.010*"human" + 0.010*"system" + 0.009*"environ" + 0.009*"dynam" + 0.009*"applic" + 0.008*"simul" + 0.008*"interact" + 0.008*"agent" + 0.007*"develop" + 0.007*"program" + 0.007*"approach" + 0.007*"learn" + 0.007*"remot"
Topic: 2 
Words: 0.048*"imag" + 0.031*"base" + 0.018*"data" + 0.017*"detect" + 0.016*"motion" + 0.012*"object" + 0.012*"estim" + 0.010*"model" + 0.009*"time" + 0.009*"method" + 0.008*"algorithm" + 0.008*"track" + 0.008*"segment" + 0.008*"scale" + 0.007*"sens" + 0.007*"compress" + 0.007*"multi" + 0.0

In [47]:
topic_dict = {
    0: "Communications",
    1: "Robotics and Automation",
    2: "Image Processing",
    3: "Circuits and Systems",
    4: "Signal and Speech Processing",
}

In [48]:
topic_prob = lda_model.get_document_topics(bow_corpus, per_word_topics=True)

In [56]:
# for doc_topics, word_topics, phi_values in topics:
#     print('New Document \n')
#     print ('Document topics:', doc_topics)
# #     print 'Word topics:', word_topics
# #     print 'Phi values:', phi_values
#     print(" ")
#     print('-------------- \n')

In [50]:
topics = [max(x[0],key=itemgetter(1))[0]  for x in topic_prob]

In [53]:
topics = np.array(topics)

## Convert authors and references

In [55]:
ml_papers['topic'] = topics

In [117]:
ml_papers.authors = ml_papers.authors.apply(', '.join)
ml_papers.loc[ml_papers['references'].isnull(),['references']] = ml_papers.loc[ml_papers['references'].isnull(),'references'].apply(lambda x: [])
ml_papers.references = ml_papers.references.apply(', '.join)

In [121]:
ml_papers = ml_papers.replace({"topic": topic_dict})

In [123]:
ml_papers.head()

Unnamed: 0,authors,id,n_citation,references,title,venue,year,topic
17,"Milos Zelezný, Petr Císar, Zdenek Krnoul, Jan ...",00e02aeb-b424-4ca8-b3ca-6e18e322f79e,7,,Design of an audio-visual speech corpus for th...,conference of the international speech communi...,2002,Signal and Speech Processing
54,"Claudius Gläser, Martin Heckmann, Frank Joubli...",02e8e38f-ed5c-43a4-b3a4-c5064a725a2d,2,"18b17dbd-4f51-411e-a099-efadf521f0d8, 24c79482...",Auditory-based formant estimation in noise usi...,conference of the international speech communi...,2008,Signal and Speech Processing
83,"Thomas Portele, Silke Goronzy, Martin Emele, A...",0564b85b-eac4-4a59-951e-3d8badc0c3e7,50,"6bb66751-ba4a-4740-9bc6-a41500f33022, a89dda25...",Smartkom-home - an advanced multi-modal interf...,conference of the international speech communi...,2003,Robotics and Automation
90,"Guo-Hong Ding, Yifei Zhu, Chengrong Li, Bo Xu",0602535c-1d37-4e96-9882-9be45ecd334a,14,,Implementing vocal tract length normalization ...,conference of the international speech communi...,2002,Signal and Speech Processing
145,"Mireia Farrús, Michael Wagner, Jan Anguita, Ja...",0a11cd87-659d-45b8-b140-d8a8ff0974ee,50,"23af12d9-481d-459a-a1e2-23390f7b3c9a, 40475584...",Robustness of prosodic features to voice imita...,conference of the international speech communi...,2008,Signal and Speech Processing


In [124]:
ml_papers.reset_index(drop=True).to_feather('ml_papers.feather')