## Create vocabulary

Create a character vocabulary by reading in abstracts from multiple files. This will be used
as a common vocabulary for all RNNs.


In [71]:
import json
import numpy as np
import codecs
import pickle
from collections import Counter


# Abstract collections (JSON files under ./data/) that are scanned to build
# the shared character vocabulary.
files = ["abstracts-small.json", "abstracts-2small.json"]

# Running count of every character seen across all abstracts in all files.
charvoc = Counter()

for file in files:
    # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
    # relying on the platform's locale-dependent default, so the resulting
    # character counts are the same on every machine.
    with open("./data/%s" % file, 'r', encoding='utf-8') as fp:
        abstract = json.load(fp)

    print("\n%s:" % file)
    nabs = 0
    for arxiv_id, text in abstract.items():
        # Preview only the first 10 abstracts of each file (first 80 chars).
        if nabs < 10:
            print("%s: %s..." % (arxiv_id, text[0:80]))
        nabs += 1
        # Counter.update iterates a string character by character, so no
        # intermediate list is needed; leading/trailing whitespace is
        # excluded from the counts.
        charvoc.update(text.strip())
    print("processed %d abstracts" % nabs)
        


abstracts-small.json:
1407.0001: The topic of finding effective strategy to halt virus in complex network is of c...
1407.0004: Linear precoding exploits the spatial degrees of freedom offered by multi-antenn...
1407.0016: We present completed observations of the NGC 7448 galaxy group and background vo...
1407.0017: Increasingly stringent limits from LHC searches for new physics, coupled with la...
1407.0023: Results are presented for an initial survey of the Norma Arm gathered with the f...
1407.0026: Current time-domain wide-field sky surveys generally operate with few-degree-siz...
1407.0029: Spinless fermions on a honeycomb lattice provide a minimal realization of lattic...
1407.0030: \vskip 3pt \noindent The strong CP-violating parameter is small today as indicat...
1407.0031: Cross-correlating the Planck High Frequency Instrument (HFI) maps against quasar...
1407.0033: We study low-energy effective field theories for non-Fermi liquids with Fermi su...
processed 9999 abstracts

a

In [86]:
# Keep the characters that occur more than 200 times, ordered from most to
# least frequent.
vocab_words = [char for char, count in charvoc.most_common() if count > 200]
print("vocab tokens: ", vocab_words)
print("vocab length: ", len(vocab_words))

# Special tokens are appended after the regular characters, so they receive
# the highest ids.
vocab_words.extend(['<sos>', '<eos>', '<unk>'])

# Token -> integer id. Ids start at 1, so id 0 is never assigned here
# (presumably left free for padding -- confirm against the consumers).
vocab = dict(zip(vocab_words, range(1, len(vocab_words) + 1)))
vocab_size = len(vocab)

# Persist the mapping and its size together as a two-element list.
with open('vocab.pkl', 'wb') as fp:
    pickle.dump([vocab, vocab_size], fp)


vocab tokens:  [' ', 'e', 't', 'i', 'a', 'o', 'n', 's', 'r', 'l', 'c', 'h', 'd', 'm', 'u', 'p', 'f', 'g', 'y', 'b', 'w', 'v', '.', '$', ',', '-', '\\', '}', '{', 'x', 'k', 'T', 'q', ')', '(', 'W', '1', 'S', '0', 'z', 'I', '2', 'A', 'M', 'C', '_', 'H', 'F', 'L', 'B', 'P', 'D', '^', 'R', 'G', 'N', '3', 'O', 'E', 'j', '5', "'", '4', 'K', 'V', '/', 'U', '=', '6', ':', '~', '8', '7', '9', '`', 'X', '+', ';', 'J', 'Q', 'Ã', 'Â', 'Z', ']', '[', 'Y', '"', '%', '<', '|', '*', '>', '\x90', '?', '\x91', '!', '©', '¢', '&']
vocab length:  99


[' ',
 'e',
 't',
 'i',
 'a',
 'o',
 'n',
 's',
 'r',
 'l',
 'c',
 'h',
 'd',
 'm',
 'u',
 'p',
 'f',
 'g',
 'y',
 'b',
 'w',
 'v',
 '.',
 '$',
 ',',
 '-',
 '\\',
 '}',
 '{',
 'x',
 'k',
 'T',
 'q',
 ')',
 '(',
 'W',
 '1',
 'S',
 '0',
 'z',
 'I',
 '2',
 'A',
 'M',
 'C',
 '_',
 'H',
 'F',
 'L',
 'B',
 'P',
 'D',
 '^',
 'R',
 'G',
 'N',
 '3',
 'O',
 'E',
 'j',
 '5',
 "'",
 '4',
 'K',
 'V',
 '/',
 'U',
 '=',
 '6',
 ':',
 '~',
 '8',
 '7',
 '9',
 '`',
 'X',
 '+',
 ';',
 'J',
 'Q',
 'Ã',
 'Â',
 'Z',
 ']',
 '[',
 'Y',
 '"',
 '%',
 '<',
 '|',
 '*',
 '>',
 '\x90',
 '?',
 '\x91',
 '!',
 '©',
 '¢',
 '&',
 '\x83',
 '¥',
 '\x80',
 '\xad',
 '®',
 '¾',
 '°',
 '¸',
 '½',
 '@',
 '¡',
 'µ',
 '¨',
 '«']