In [1]:
# code meaning predictor using advanced NN techniques

In [54]:
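'''Trains a GRU to predict the LDA topic distribution of a code entity's
documentation from the entity's bytecode instruction sequence.

Adapted from the Keras example that trains a LSTM on the IMDB sentiment
classification task; the remarks below are carried over from that example.

The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF+LogReg.

Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''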
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.datasets import imdb

n_topic = 40  # width of the softmax output; has to equal Y_train.shape[1]
maxlen = 200  # every instruction sequence is padded to this many time steps
batch_size = 10

# X_train / X_test / Y_train / Y_test are produced by the data-loading cell,
# which therefore has to be executed before this one.
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
# pad_sequences defaults to dtype='int32', which would truncate the
# real-valued embedding vectors to integers. Request float32 explicitly so
# the features reach the network unchanged.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, dtype='float32')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, dtype='float32')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model = Sequential()
# input_dim=8 has to match the size of the per-instruction embedding vectors.
model.add(GRU(64, dropout_W=0.25, dropout_U=0.25, input_dim=8))  # try using a GRU instead, for fun
model.add(Dense(128, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(n_topic))
model.add(Activation('softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
print(X_train.shape)
print(Y_train.shape)
# NOTE(review): the test split doubles as validation data here, so the score
# reported below is not an independent hold-out estimate.
hist = model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=200,
          validation_data=(X_test, Y_test))
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

3183 train sequences
796 test sequences
Pad sequences (samples x time)
X_train shape: (3183, 200, 8)
X_test shape: (796, 200, 8)
Build model...
Train...
(3183, 200, 8)
(3183, 40)
Train on 3183 samples, validate on 796 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epo

In [55]:
# save model
def save_model(model_filename) :
    """Write the global `model` to disk as an architecture/weights pair.

    Two files are created:
      * `<model_filename>.json`      -- the string returned by `model.to_json()`
      * `<model_filename>.weight.h5` -- written by `model.save_weights`

    model_filename -- path prefix shared by both output files.
    """
    # A context manager closes (and flushes) the JSON file immediately instead
    # of leaving that to garbage collection of an anonymous file object.
    with open(model_filename+'.json', 'w') as json_file :
        print(model.to_json(), file=json_file)
    model.save_weights(model_filename+'.weight.h5')
save_model('../model/ssn,i200,rnn64,f128tanh,f40ce,200ep')

In [62]:
# Dump the validation-loss history recorded by model.fit, one value per line.
# The `with` block makes sure temp.txt is flushed and closed once written.
with open('temp.txt', 'w') as loss_file :
    print(*hist.history['val_loss'], sep='\n', file=loss_file)

In [52]:
import numpy as np
# Settings for the shuffled train/test split performed further down.
rand_seed = 1337   # seed handed to np.random.seed before shuffling
ratio_train = 0.8  # fraction of the samples that goes to the training split

print('Loading data...')
def sparse_to_row(ivlist, size=None) :
    """Expand a sparse list of (index, value) pairs into a dense 1-D array.

    ivlist -- iterable of (index, value) pairs; positions not listed stay 0.
    size   -- length of the returned row. When omitted, the module-level
              `n_topics` is used, which is what existing callers rely on.

    Returns a float numpy array of length `size`.
    """
    if size is None :
        size = n_topics
    a = np.zeros(size)
    for i, v in ivlist :
        a[i] = v
    return a
# Targets: one dense row of topic weights per document in `corpus`.
Y = np.array([sparse_to_row(lda[c]) for c in corpus])

# Inputs: each instruction sequence becomes a 2-D array with one embedding
# vector per instruction, keeping at most the first `max_ins` instructions.
# The sequences have different lengths, so X is built explicitly as a 1-D
# object array (one 2-D array per sample) instead of relying on np.array()
# to guess what to do with ragged input, which newer NumPy releases reject.
max_ins = 200  # keep in sync with `maxlen` in the training cell
X = np.empty(len(l_ins), dtype=object)
for k, ins_seq in enumerate(l_ins) :
    X[k] = np.array([code_model[ins] for ins in ins_seq[:max_ins]])

# Seeded shuffle, then the first `ratio_train` share goes to training and the
# remainder to testing.
n = len(X)
n_train = int(ratio_train * n)
np.random.seed(rand_seed)
ind = np.random.permutation(n)
ind_train = ind[:n_train]
ind_test = ind[n_train:]
(X_train, Y_train) = X[ind_train], Y[ind_train]
(X_test, Y_test) = X[ind_test], Y[ind_test]

print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

Loading data...
X_train shape: (3183,)
Y_train shape: (3183, 40)
3183 train samples
796 test samples


In [14]:
import data
import topicmodeling as tp
import gensim as g

# Collect the entities of the libraries of interest, dropping duplicates.
l_interested = list(set(data.get_entity_list(data.libinfo.interested_libs)))
print(len(l_interested))

# For every entity fetch its code and its documentation, in the same order,
# so that l_code[k] and l_doc[k] describe the same entity.
l_code = [data.get_code(entity) for entity in l_interested]
l_doc = [data.get_doc(entity) for entity in l_interested]

# Run the documentation through the ADVANCED extraction stage.
l_doc = [data.pdoc.extract(raw_doc, stage=data.pdoc.ADVANCED) for raw_doc in l_doc]



3979


In [15]:
# --- Tokenise the documentation and drop infrequent tokens ---

documents = l_doc
stoplist = tp.read_stoplist("../SmartStoplist.txt")
texts = tp.simple_process(documents=documents, stoplist=stoplist)
texts = tp.remove_infrequent(texts, n_times=1)


# --- Build the token-id dictionary ---

id2word = g.corpora.Dictionary(texts)
#id2word.save('/tmp/deerwester.dict') # store the id2word, for future reference
first_ids = list(id2word)[:10]
print(*first_ids)


# --- Bag-of-words corpus, then its TF-IDF re-weighting ---

corpus = []
for text in texts :
    corpus.append(id2word.doc2bow(text))
#g.corpora.corpusCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use
print(*corpus[:10])
tfidf = g.models.tfidfmodel.TfidfModel(corpus)
# From here on `corpus` holds the TF-IDF weighted bags, not the raw counts.
corpus = [tfidf[bag] for bag in corpus]
print(*corpus[:10])


# In[ ]:

# load id->word mapping (the id2word), one of the results of step 2 above
#id2word = g.g.corpora.id2word.load_from_text('wiki_en_wordids.txt')
# load corpus iterator
#corpus = g.corpora.MmCorpus('/tmp/deerwester.mm')
#corpus = g.g.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output (recommended)


# --- Train the LDA topic model ---

n_topics = 40
## extract n_topics LDA topics, making 5 passes over the corpus and updating
## once every 1 chunk (10,000 documents).
## random_state pins the model's random initialisation so that re-running the
## cell yields the same topics (and therefore the same training targets).
lda = g.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics,
                                 update_every=1, chunksize=10000, passes=5,
                                 random_state=1337)
## print the most contributing words for each of the n_topics topics
l = list(lda.print_topics(n_topics))
for i, s in l :
    print(i, *s.split(' + '))

685 27 471 682 467 476 1355 1201 368 1278
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)] [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)] [(10, 2), (11, 1), (12, 1), (13, 1), (14, 1)] [(13, 1), (15, 1), (16, 1), (17, 1), (18, 1)] [(16, 1), (19, 1), (20, 1), (21, 1)] [(16, 1), (22, 1), (23, 1), (24, 1)] [(25, 1), (26, 1)] [(15, 1), (16, 1), (17, 1), (18, 1), (27, 1)] [(1, 1), (4, 1), (16, 1), (20, 1), (28, 1), (29, 1)] [(16, 1), (22, 1), (23, 1), (24, 1)]
[(0, 0.49399426709707445), (1, 0.43248535675580047), (2, 0.5739877358718446), (3, 0.44819876007159315), (4, 0.19642309132592412)] [(5, 0.46641051263332084), (6, 0.38002353185707194), (7, 0.44045878078803113), (8, 0.35597827854819025), (9, 0.563310639495286)] [(10, 0.49238052046400904), (11, 0.5162334263963622), (12, 0.5162334263963622), (13, 0.33984698427280347), (14, 0.3302598208841272)] [(13, 0.4631857093147272), (15, 0.5764470680488752), (16, 0.2193312382637294), (17, 0.41208740142097555), (18, 0.4850211932856909)] [(16, 0.3068937687430323), (19,

In [38]:
import dis
import itertools

# One list of opcode names per code entity, in the order of l_code.
l_ins = [[i.opname for i in dis.get_instructions(code)]
        for code in l_code]

# Flatten all sequences to report the total and distinct instruction counts.
all_ins = list(itertools.chain.from_iterable(l_ins))
print(len(all_ins))
all_ins_unique = set(all_ins)
print(len(all_ins_unique))

# Embed every opcode name as an 8-dimensional vector.
# workers=1: with several worker threads the training order depends on thread
# scheduling, so `seed` alone would not make the embedding reproducible.
code_model = g.models.word2vec.Word2Vec(l_ins, size=8, window=10, min_count=0, workers=1, seed=1337, iter=20)

339060
84


In [72]:
# `lll` is the model whose document representation is indexed for similarity
# queries; here it is the LDA model.
lll = lda
#index = g.similarities.SparseMatrixSimilarity(lll[corpus], num_features=22)
corpus_in_model_space = lll[corpus]
index = g.similarities.Similarity(None, corpus_in_model_space, num_features=n_topics)

In [81]:
# Query the similarity index with one document and list its 10 best matches.
i_doc = 1959
text = texts[i_doc]

# Project the query the same way the indexed corpus was built:
# bag-of-words -> TF-IDF -> model space.
query_bow = id2word.doc2bow(text)
sims = index[lll[tfidf[query_bow]]]
print(i_doc, '', text, sep=' | ')

# Highest similarity first.
ranked = sorted(enumerate(sims), key=lambda t : t[1], reverse=True)
for i, score in ranked[:10] :
    print(i, "%.3f"%score, texts[i], sep=' | ')

1959 |  | ['popul', 'random', 'type', 'initi', 'cluster', 'latin', 'hypercub', 'sampl', 'gener']
587 | 1.000 | ['comput', 'neg', 'gradient']
853 | 1.000 | ['comput', 'partial', 'fraction', 'expans']
861 | 1.000 | ['comput', 'partial', 'fraction', 'expans']
869 | 1.000 | ['comput', 'partial', 'fraction', 'expans']
876 | 1.000 | ['comput', 'partial', 'fraction', 'expans']
1613 | 1.000 | ['calcul', 'phase', 'gener', 'output']
1629 | 1.000 | ['calcul', 'phase', 'gener', 'output']
1819 | 1.000 | ['set', 'storag', 'index', 'locat', 'valu']
1959 | 1.000 | ['popul', 'random', 'type', 'initi', 'cluster', 'latin', 'hypercub', 'sampl', 'gener']
1967 | 1.000 | ['partial', 'depend', 'plot', 'featur']


In [74]:
# Positions in l_doc of every document that contains 'cluster'.
cluster_doc_ids = []
for doc_pos, doc in enumerate(l_doc) :
    if 'cluster' in doc :
        cluster_doc_ids.append(doc_pos)
print(cluster_doc_ids)

[221, 254, 265, 273, 280, 803, 814, 848, 854, 862, 870, 882, 899, 937, 945, 960, 1207, 1229, 1235, 1244, 1268, 1272, 1277, 1327, 1355, 1889, 1934, 1959, 2056, 2059, 2074, 2101, 2112, 2168, 2181, 2184, 2186, 2190, 2193, 2195, 2787, 2850, 2901, 3053, 3142, 3159, 3470, 3476, 3521, 3565, 3595, 3600, 3604, 3610, 3619, 3655, 3772, 3775, 3784, 3791, 3794, 3806, 3815, 3823, 3825, 3830, 3833, 3840]
