In [1]:
import tarfile

# Unpack the NIPS archive into /tmp.
# The context manager guarantees the archive handle is closed, even if
# extraction fails part-way through.
# NOTE(review): tarfile does not sanitize member paths on older Python
# versions -- only run this on an archive obtained from a trusted source.
filename = 'nips12raw_str602.tgz'
with tarfile.open(filename, 'r:gz') as tar:
    tar.extractall(path='/tmp')

In [2]:
import os, re

# Folder containing all NIPS papers.
data_dir = '/tmp/nipstxt/'  # Set this path to the data on your machine.

# Folders containing individual NIPS papers, one per volume ('nips00' ... 'nips12').
yrs = ['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
dirs = ['nips' + yr for yr in yrs]

# Get all document texts and their corresponding IDs.
# `docs` and `doc_ids` are filled in lockstep, so position k in one
# corresponds to position k in the other.
docs = []
doc_ids = []
for yr_dir in dirs:
    # os.listdir() returns entries in arbitrary order; sort them so document
    # positions are reproducible across machines and runs.
    files = sorted(os.listdir(data_dir + yr_dir))
    for filen in files:
        # Document ID is '<volume>_<number>': the volume is the part of the
        # folder name after 'nips', the number is the first run of digits in
        # the file name with leading zeros dropped by int().
        (idx1, idx2) = re.search('[0-9]+', filen).span()  # Start and end index of the digits.
        doc_ids.append(yr_dir[4:] + '_' + str(int(filen[idx1:idx2])))

        # Read document text.
        # Note: ignoring characters that cause encoding errors.
        with open(data_dir + yr_dir + '/' + filen, errors='ignore', encoding='utf-8') as fid:
            txt = fid.read()

        # Replace every whitespace character (newline, tab, etc.) by a space.
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        txt = re.sub(r'\s', ' ', txt)

        docs.append(txt)

In [3]:
# Paths of the per-volume author index files.
filenames = ['{}idx/a{}.txt'.format(data_dir, yr) for yr in yrs]
print(filenames)

['/tmp/nipstxt/idx/a00.txt', '/tmp/nipstxt/idx/a01.txt', '/tmp/nipstxt/idx/a02.txt', '/tmp/nipstxt/idx/a03.txt', '/tmp/nipstxt/idx/a04.txt', '/tmp/nipstxt/idx/a05.txt', '/tmp/nipstxt/idx/a06.txt', '/tmp/nipstxt/idx/a07.txt', '/tmp/nipstxt/idx/a08.txt', '/tmp/nipstxt/idx/a09.txt', '/tmp/nipstxt/idx/a10.txt', '/tmp/nipstxt/idx/a11.txt', '/tmp/nipstxt/idx/a12.txt']


In [4]:
# Author name -> list of document IDs; filled in while parsing the index files.
author2doc = {}
# Counter incremented each time a new author is added to `author2doc`.
i = 0

In [5]:
# Build the author -> document-ID mapping from the per-volume index files.
# Each line is split on commas: fields 0 and 1 are joined (field 1 first) to
# form the author name, and every remaining field is a document number.
for yr in yrs:
    filename = data_dir + 'idx/a' + yr + '.txt'
    # Context manager so the index file is closed deterministically instead
    # of being left to the garbage collector.
    with open(filename, errors='ignore', encoding='utf-8') as index_file:
        for line in index_file:
            contents = line.split(',')
            author_name = (contents[1] + contents[0]).strip()
            # Drop all whitespace so the name becomes a single token
            # (raw string: '\s' is an invalid escape in a plain literal).
            author_name = re.sub(r'\s', '', author_name)
            ids = [c.strip() for c in contents[2:]]
            # Membership test rather than a truthiness check, so an author
            # whose list is still empty is not counted a second time.
            if author_name not in author2doc:
                author2doc[author_name] = []
                i += 1
            # Prefix each number with the volume to match the document ID format.
            author2doc[author_name].extend([yr + '_' + doc_num for doc_num in ids])

In [6]:
# Sanity check: number of document IDs collected while reading the papers.
print(len(doc_ids))

1740


In [7]:
# Map each document ID string to its position in `doc_ids`.
doc_id_dict = {doc_id: position for position, doc_id in enumerate(doc_ids)}

print(doc_id_dict)

{'00_1': 0, '00_9': 1, '00_22': 2, '00_31': 3, '00_41': 4, '00_52': 5, '00_62': 6, '00_72': 7, '00_82': 8, '00_95': 9, '00_103': 10, '00_114': 11, '00_127': 12, '00_137': 13, '00_144': 14, '00_154': 15, '00_164': 16, '00_174': 17, '00_184': 18, '00_192': 19, '00_201': 20, '00_211': 21, '00_219': 22, '00_223': 23, '00_233': 24, '00_242': 25, '00_249': 26, '00_262': 27, '00_270': 28, '00_278': 29, '00_290': 30, '00_297': 31, '00_301': 32, '00_310': 33, '00_317': 34, '00_338': 35, '00_348': 36, '00_358': 37, '00_367': 38, '00_377': 39, '00_387': 40, '00_397': 41, '00_402': 42, '00_412': 43, '00_422': 44, '00_432': 45, '00_442': 46, '00_457': 47, '00_467': 48, '00_474': 49, '00_485': 50, '00_495': 51, '00_505': 52, '00_515': 53, '00_524': 54, '00_534': 55, '00_544': 56, '00_554': 57, '00_564': 58, '00_573': 59, '00_584': 60, '00_592': 61, '00_602': 62, '00_612': 63, '00_622': 64, '00_632': 65, '00_642': 66, '00_652': 67, '00_662': 68, '00_674': 69, '00_683': 70, '00_693': 71, '00_701': 72,

In [8]:
# Replace the string document IDs in `author2doc` with integer indices.
# Entries that are already integers are kept as they are, so re-running this
# cell is a no-op instead of a KeyError on the already-converted values.
for a, a_doc_ids in author2doc.items():
    # Slice assignment updates the existing list object in place.
    a_doc_ids[:] = [
        doc_id if isinstance(doc_id, int) else doc_id_dict[doc_id]
        for doc_id in a_doc_ids
    ]

In [9]:
# Inspect the author -> document-index mapping (prints the entire dict).
print(author2doc)

{'YaserS.Abu-Mostafa': [0, 582, 1225], 'RobertB.Allen': [1, 176], 'JoshuaAlspector': [1, 176, 423, 535], 'JoseAmbros-Ingerson': [34], 'SuguruArimoto': [79], 'AmirF.Atiya': [2, 9], 'LesE.Atlas': [3, 55, 465], 'PierreBaldi': [4, 97, 273, 385, 664, 795, 978], 'JashojibanBanik': [10], 'AlanH.Barr': [63, 669], 'EricB.Baum': [5, 99, 267], 'WilliamBaxter': [6], 'J.Bernasconi': [7], 'LyleJ.Borg-Graham': [8], 'JamesM.Bower': [9, 10, 11, 134, 137, 139, 145, 192, 195, 199], 'DavidBrady': [39], 'D.Brandeis': [48], 'NathanH.Brown': [12], 'JehoshuaBruck': [13, 544, 1323], 'DavidJ.Burr': [14, 36], 'ZoeF.Butler': [59], 'L.RichardCarley': [15], 'H.H.Chen': [78, 231], 'JohnY.Cheung': [16], 'TzioDarChiueh': [17], 'P.A.Chou': [18], 'JoshuaChover': [19], 'D.D.Coon': [20], 'LeonN.Cooper': [69, 936, 1347], 'AmirDembo': [21], 'JohnS.Denker': [22, 53, 88, 488, 538, 741], 'MarkDerthick': [23], 'BradleyW.Dickinson': [61], 'M.Dikaiakos': [76], 'JohnP.Donoghue': [74], 'BruceDow': [6], 'G.Dreyfus': [24], 'FrankH.Ee

In [10]:
import spacy
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is used to process the raw documents.
nlp = spacy.load('en_core_web_sm')

In [11]:
%%time
# Run every raw document through the spaCy pipeline and reduce it to a list
# of lemmas followed by its multi-word named entities.
processed_docs = []
# `n_threads` is intentionally not passed: it is deprecated in spaCy 2.1+ and
# is not part of the `Language.pipe` signature in spaCy 3.
for doc in nlp.pipe(docs, batch_size=100):
    ents = doc.ents  # Named entities; read before `doc` is rebound below.

    # Keep only alphabetic tokens (no numbers, no punctuation) that are not
    # stopwords, and replace each by its lemma.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Add named entities, but only if they are a compound of more than one word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])

    processed_docs.append(doc)

Wall time: 55min 56s


In [12]:
# Preview the token list of the first processed document.
print(processed_docs[0])

['connectivity', 'versus', 'entropy', 'yaser', 'abu', 'mostafa', 'california', 'institute', 'technology', 'pasadena', 'ca', 'abstract', 'how', 'connectivity', 'neural', 'network', 'number', 'synapsis', 'neuron', 'relate', 'complexity', 'problem', 'handle', 'measure', 'entropy', 'switching', 'theory', 'suggest', 'relation', 'boolean', 'function', 'implement', 'circuit', 'low', 'connectivity', 'input', 'nand', 'gate', 'however', 'network', 'learn', 'problem', 'example', 'local', 'learning', 'rule', 'prove', 'entropy', 'problem', 'low', 'bind', 'connectivity', 'network', 'introduction', 'the', 'distinguishing', 'feature', 'neural', 'network', 'ability', 'taneously', 'learn', 'desire', 'function', 'training', 'sample', 'ability', 'program', 'clearly', 'give', 'neural', 'network', 'learn', 'function', 'restriction', 'network', 'learn', 'function', 'one', 'obvious', 'restriction', 'independent', 'learning', 'aspect', 'network', 'big', 'accommodate', 'circuit', 'ity', 'function', 'eventually'




In [13]:
# From here on `docs` holds the processed token lists instead of the raw text.
# NOTE(review): because `processed_docs` is deleted, running this cell a
# second time raises NameError; re-run the preprocessing cell first.
docs = processed_docs
del processed_docs

In [14]:
# Compute bigrams.
from gensim.models import Phrases
# Train the phrase model (min_count=20), then append every detected phrase
# to the document it was found in.
bigram = Phrases(docs, min_count=20)
for tokens in docs:
    # Tokens containing '_' are treated as bigrams. The list is built in
    # full before the document itself is extended.
    tokens.extend([tok for tok in bigram[tokens] if '_' in tok])



In [18]:
# Show the summary representation of the trained Phrases model.
print(bigram)

Phrases<1453322 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>


In [26]:
# Preview the first 100 tokens of the last processed document.
# Indexes `docs` explicitly instead of relying on a loop variable (`doc`)
# left over from an earlier cell.
print(docs[-1][0:100])

['monte', 'carlo', 'pomdps', 'sebastian', 'thrun', 'school', 'computer', 'science', 'carnegie', 'mellon', 'university', 'pittsburgh', 'pa', 'abstract', '-PRON-', 'present', 'monte', 'carlo', 'algorithm', 'learn', 'act', 'partially', 'observable', 'markov', 'decision', 'process', 'pomdps', 'real', 'value', 'state', 'action', 'space', '-PRON-', 'approach', 'use', 'importance', 'sample', 'represent', 'belief', 'monte', 'carlo', 'approximation', 'belief', 'propagation', 'a', 'reinforcement', 'learn', 'algorithm', 'value', 'iteration', 'employ', 'learn', 'value', 'function', 'belief', 'state', 'finally', 'base', 'version', 'near', 'neighbor', 'generalize', 'state', 'initial', 'empirical', 'result', 'suggest', 'approach', 'work', 'practical', 'application', 'introduction', 'pomdps', 'address', 'problem', 'act', 'optimally', 'partially', 'observable', 'dynamic', 'ment', 'in', 'pomdps', 'learner', 'interact', 'stochastic', 'environment', 'state', 'partially', 'observable', 'action', 'change', 

In [21]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.

from gensim.corpora import Dictionary
dictionary = Dictionary(docs)

# Remove rare and common tokens.
# `max_freq` is passed as `no_above` and `min_wordcount` as `no_below`.
# NOTE(review): per the gensim docs both thresholds count documents, not
# occurrences (no_above is a fraction of documents) -- confirm.
max_freq = 0.5
min_wordcount = 20
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# The model cells pass `dictionary.id2token` as `id2word`, so it must be populated.
_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.

In [24]:
# Show the summary representation of the filtered dictionary.
print(dictionary)

Dictionary(7478 unique tokens: ['0  ', 'Addison-Wesley', 'American Institute of Physics', 'C. Mead', 'CA 91125']...)


In [14]:
# Vectorize data.

# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

In [15]:
# Summary of the training data dimensions.
print('Number of authors: {:d}'.format(len(author2doc)))
print('Number of unique tokens: {:d}'.format(len(dictionary)))
print('Number of documents: {:d}'.format(len(corpus)))

Number of authors: 2479
Number of unique tokens: 7318
Number of documents: 1740


In [16]:
from gensim.models import AuthorTopicModel
# Quick timed training run with minimal settings (passes=1, iterations=1).
%time model = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token, \
                author2doc=author2doc, chunksize=2000, passes=1, eval_every=0, \
                iterations=1, random_state=1)

Wall time: 8.44 s


In [17]:
%%time
# Train five models that differ only in `random_state`, and store each one
# together with the sum of the second element of its `top_topics` entries
# (used later as the topic-coherence score).
model_list = []
for i in range(5):
    model = AuthorTopicModel(
        corpus=corpus,
        num_topics=10,
        id2word=dictionary.id2token,
        author2doc=author2doc,
        chunksize=2000,
        passes=100,
        gamma_threshold=1e-10,
        eval_every=0,
        iterations=1,
        random_state=i,
    )
    top_topics = model.top_topics(corpus)
    tc = sum(entry[1] for entry in top_topics)
    model_list.append((model, tc))

Wall time: 15min 43s


In [21]:
# Keep the (model, score) pair with the highest score.
model, tc = max(model_list, key=lambda entry: entry[1])
print('Topic coherence: {:.3e}'.format(tc))

Topic coherence: -1.063e+01


In [22]:
# Save the selected model to disk.
model.save('/tmp/model.atmodel')

In [23]:
# Load the model back from the saved file; `model` is rebound to the loaded instance.
model = AuthorTopicModel.load('/tmp/model.atmodel')

In [25]:
# Display the top words of topic 0 with their probabilities.
model.show_topic(0)

[('signal', 0.014005823991640638),
 ('chip', 0.011344156762761131),
 ('image', 0.0098473517030712061),
 ('analog', 0.0097970715824693619),
 ('circuit', 0.0091268030403021186),
 ('source', 0.008205933857070349),
 ('gradient', 0.0060581005198853625),
 ('voltage', 0.005739656243063905),
 ('filter', 0.0054425927804779752),
 ('noise', 0.0054381664508153646)]

In [26]:
# Hand-assigned names for the 10 topics, indexed by topic ID.
# NOTE(review): these labels are tied to one specific trained model. In the
# output saved with this notebook several labels do not appear to match the
# listed words (e.g. topic 1, labelled 'Neuroscience', shows
# 'recognition speech classifier ...') -- confirm and re-label after training.
topic_labels = ['Circuits', 'Neuroscience', 'Numerical optimization', 'Object recognition', \
               'Math/general', 'Robotics', 'Character recognition', \
                'Reinforcement learning', 'Speech recognition', 'Bayesian modelling']

In [27]:
# Print each topic's hand-assigned label followed by its top words.
for topic in model.show_topics(num_topics=10):
    topic_id = topic[0]
    print('Label: ' + topic_labels[topic_id])
    # Every word is followed by one space, including the last.
    words = ''.join(word + ' ' for word, prob in model.show_topic(topic_id))
    print('Words: ' + words)
    print()

Label: Circuits
Words: signal chip image analog circuit source gradient voltage filter noise 

Label: Neuroscience
Words: recognition speech classifier word layer classification class hide net character 

Label: Numerical optimization
Words: noise w generalization gaussian prediction sample optimal approximation matrix variance 

Label: Object recognition
Words: class f let theorem bound threshold node be w dimension 

Label: Math/general
Words: control layer map hide trajectory object human position target motor 

Label: Robotics
Words: cell visual response stimulus field motion frequency direction eye image 

Label: Character recognition
Words: action policy control q reinforcement optimal update reward dynamic decision 

Label: Reinforcement learning
Words: image mixture cluster likelihood density gaussian object em component matrix 

Label: Speech recognition
Words: neuron spike synaptic fire connection memory activity dynamic potential synapsis 

Label: Bayesian modelling
Words: r

In [28]:
# Indexing the model with an author name returns that author's (topic ID, probability) pairs.
model['YannLeCun']

[(1, 0.99976713055308641)]

In [29]:
from pprint import pprint

def show_author(name):
    '''Print an author's document indices and labelled topic distribution.

    `name` is looked up in the module-level `model`; topic IDs are
    translated to names through the module-level `topic_labels`.
    '''
    print('\n%s' % name)
    print('Docs:', model.author2doc[name])
    print('Topics:')
    labelled_topics = []
    for topic_id, weight in model[name]:
        labelled_topics.append((topic_labels[topic_id], weight))
    pprint(labelled_topics)

In [30]:
# Documents and labelled topics for author 'YannLeCun'.
show_author('YannLeCun')


YannLeCun
Docs: [106, 401, 410, 488, 499, 538, 579, 592, 792, 817, 1519]
Topics:
[('Neuroscience', 0.99976713055308641)]


In [31]:
# Documents and labelled topics for author 'GeoffreyE.Hinton'.
show_author('GeoffreyE.Hinton')


GeoffreyE.Hinton
Docs: [37, 106, 206, 217, 246, 474, 491, 550, 606, 700, 701, 922, 963, 1356, 1607, 1655]
Topics:
[('Neuroscience', 0.06717333978942959),
 ('Character recognition', 0.014148754482118903),
 ('Reinforcement learning', 0.91858911519729636)]


In [32]:
# Documents and labelled topics for author 'TerrenceJ.Sejnowski'.
show_author('TerrenceJ.Sejnowski')


TerrenceJ.Sejnowski
Docs: [479, 475, 508, 553, 618, 634, 690, 691, 775, 802, 856, 859, 863, 864, 901, 902, 977, 1151, 1210, 1241, 1250, 1252, 1257, 1284, 1366, 1413, 1455, 1510, 1541, 1613, 1714]
Topics:
[('Robotics', 0.99992173321837785)]


In [33]:
# Documents and labelled topics for author 'ChristofKoch'.
show_author('ChristofKoch')


ChristofKoch
Docs: [44, 194, 242, 276, 302, 332, 339, 340, 434, 443, 668, 674, 765, 773, 779, 844, 1144, 1235, 1237, 1457, 1461, 1537, 1549, 1610]
Topics:
[('Robotics', 0.99989759105599862)]


In [34]:
from gensim.models import atmodel
# Build a document -> authors mapping from the model's corpus and author2doc.
# NOTE(review): `doc2author` is not referenced again in the visible code; the
# bound computation reads `model.doc2author` instead -- confirm it is needed.
doc2author = atmodel.construct_doc2author(model.corpus, model.author2doc)

In [35]:
# Compute the per-word bound.
# Total number of words: the counts of every (id, count) pair in the corpus, summed.
corpus_words = sum(count for bow in model.corpus for _term_id, count in bow)

# Evaluate the bound on the model's own corpus and normalize by the word count.
perwordbound = model.bound(
    model.corpus,
    author2doc=model.author2doc,
    doc2author=model.doc2author,
) / corpus_words
print(perwordbound)

-7.76325519555


In [36]:
# Time a `top_topics` call on the model's own corpus.
%time top_topics = model.top_topics(model.corpus)

Wall time: 746 ms


In [37]:
%%time
from sklearn.manifold import TSNE
# Project the rows of `model.state.gamma` for the selected authors to 2-D with t-SNE.
tsne = TSNE(n_components=2, random_state=0)
smallest_author = 0  # Ignore authors with fewer documents than this.
authors = [
    author_id
    for name, author_id in model.author2id.items()
    if len(model.author2doc[name]) >= smallest_author
]
_ = tsne.fit_transform(model.state.gamma[authors, :])  # Result stored in tsne.embedding_

Wall time: 5min 32s


In [38]:
# Tell Bokeh to display plots inside the notebook, so the later `show(p)`
# call renders inline.
from bokeh.io import output_notebook
output_notebook()

In [40]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

# Coordinates of each selected author in the 2-D t-SNE embedding.
x, y = tsne.embedding_[:, 0], tsne.embedding_[:, 1]
author_names = [model.id2author[author_id] for author_id in authors]

# Radius of each point corresponds to the number of documents attributed to that author.
scale = 0.1
author_sizes = [len(model.author2doc[name]) for name in author_names]
radii = [n_docs * scale for n_docs in author_sizes]

# Columns referenced by name in the glyph and the hover tooltips.
source = ColumnDataSource(data={
    'x': x,
    'y': y,
    'author_names': author_names,
    'author_sizes': author_sizes,
    'radii': radii,
})

# Add author names and sizes to mouse-over info.
hover = HoverTool(tooltips=[
    ("author", "@author_names"),
    ("size", "@author_sizes"),
])

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [41]:
from gensim.similarities import MatrixSimilarity

# Generate a similarity object from the topic vectors of all authors
# (the model is indexed with the full list of author names).
index = MatrixSimilarity(model[list(model.id2author.values())])

# Get similarities of one author's topic vector against the index.
# NOTE(review): `sims` is computed here but never displayed in the visible code.
author_name = 'YannLeCun'
sims = index[model[author_name]]

In [42]:
# Make a function that returns similarities based on the Hellinger distance.

from gensim import matutils
import pandas as pd

# Topic distribution of every author, in the iteration order of `model.id2author`.
author_vecs = list(map(model.get_author_topics, model.id2author.values()))

def similarity(vec1, vec2):
    '''Return a Hellinger-based similarity score for two sparse topic vectors.

    Both vectors are expanded to dense vectors of length `model.num_topics`;
    the score is 1 / (1 + Hellinger distance).
    '''
    dense1 = matutils.sparse2full(vec1, model.num_topics)
    dense2 = matutils.sparse2full(vec2, model.num_topics)
    return 1.0 / (1.0 + matutils.hellinger(dense1, dense2))

def get_sims(vec):
    '''Return the similarity of `vec` to each entry of the module-level `author_vecs`.'''
    scores = []
    for other_vec in author_vecs:
        scores.append(similarity(vec, other_vec))
    return scores

def get_table(name, top_n=10, smallest_author=1):
    '''
    Build a table of the authors most similar to `name`.

    Authors with fewer than `smallest_author` documents are left out.
    Returns a dataframe with columns 'Author', 'Score' and 'Size',
    sorted by descending score and cut to the first `top_n` rows.

    '''

    # Similarity of `name` to every author, indexed by author ID.
    scores = get_sims(model.get_author_topics(name))

    # One (author name, similarity, number of documents) row per kept author.
    rows = []
    for author_id, score in enumerate(scores):
        author = model.id2author[author_id]
        n_docs = len(model.author2doc[author])
        if n_docs < smallest_author:
            continue
        rows.append((author, score, n_docs))

    # Make dataframe and retrieve top authors.
    frame = pd.DataFrame(rows, columns=['Author', 'Score', 'Size'])
    return frame.sort_values('Score', ascending=False)[:top_n]

In [43]:
# Authors most similar to 'YannLeCun' (defaults: top 10, at least 1 document).
get_table('YannLeCun')

Unnamed: 0,Author,Score,Size
2422,YannLeCun,1.0,11
104,AlexWaibel,0.999979,11
1902,RichardP.Lippmann,0.999979,9
1717,PatriceSimard,0.999975,8
2455,YoshuaBengio,0.999969,14
1636,NelsonMorgan,0.999846,5
1533,MichaelCohen,0.999618,3
910,HoracioFranco,0.999618,3
394,ChuckWooters,0.999487,2
2323,VictorAbrash,0.999385,2


In [44]:
# Authors most similar to 'JamesM.Bower' among those with at least 3 documents.
get_table('JamesM.Bower', smallest_author=3)

Unnamed: 0,Author,Score,Size
118,JamesM.Bower,1.0,10
151,K.Schulten,0.975073,3
182,MatthewA.Wilson,0.945236,3
150,K.Obermayer,0.932547,3
305,ZhaopingLi,0.90137,6
1,A.E.Friedman,0.889794,3
189,MichaelM.Merzenich,0.860342,3
73,DeLiangL.Wang,0.829488,3
155,KlausObermayer,0.800292,8
177,MarkE.Nelson,0.76968,4


In [45]:
# Timed training run with serialized=True and an explicit serialization_path.
# NOTE(review): presumably this keeps the corpus in a file on disk instead of
# in memory -- confirm against the gensim documentation.
%time model_ser = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token, \
                               author2doc=author2doc, random_state=1, serialized=True, \
                               serialization_path='/tmp/model_serialization.mm')

Wall time: 44.7 s


In [46]:

# Clean up: delete the serialization file once you're done using the model
# trained with serialized=True.
import os
os.remove('/tmp/model_serialization.mm')