In [2]:
# 0. import packages
import spacy
import os
from collections import Counter
import torch
import glob
from packages.functions import parse_cnn, word_list_to_idx_list
from spacy import attrs
import numpy as np
vocab_size = 500
batch_size = 10

In [3]:
# 1. load nlp model & files to read
nlp = spacy.load('en') # loads default English object
cnn_dir = '/home/jatin/intern/internenv/cnn/stories/'
cnn_pre_dir = '/home/jatin/intern/internenv/cnn/stories_idx/'
file_list = [os.path.join(cnn_dir,file) for file in os.listdir(cnn_dir)]

In [15]:
#2. create dictionary using frequent words
body_list = []
summary_list = []
counter = Counter()
batch_no = 0
while batch_no<len(file_list[200]):
    batch = file_list[batch_no:min(batch_no+batch_size,len(file_list))]
    for file in batch:
        body_words, summary_words = parse_cnn(file,nlp)
        body_list.extend(body_words)
        summary_list.extend(summary_words)
    c = Counter(body_list+summary_list)
    counter = counter + c
    vocab_list = counter.most_common(vocab_size)
    word2idx = dict()
    word2idx['<PAD>']=0
    word2idx['<S>']=1
    word2idx['</S>']=2
    word2idx['<UNK>']=3
    idx2word = dict()
    idx2word[0] = '<PAD>'
    idx2word[1] = '<S>'
    idx2word[2] = '</S>'
    idx2word[3] = '<UNK>'
    for i,(word,_) in enumerate(vocab_list):
        if len(word2idx)>vocab_size:
            break
        word2idx[word] = i+4
        idx2word[i+4] = word
    np.save('word2idx.npy',word2idx)
    np.save('idx2word.npy',idx2word)
    batch_no+=batch_size
    print("Vocabulary created from %d/%d files, top %d words saved" 
          %(batch_no,len(file_list),len(word2idx)))

Vocabulary created from 10/92579 files, top 501 words saved
Vocabulary created from 20/92579 files, top 501 words saved
Vocabulary created from 30/92579 files, top 501 words saved
Vocabulary created from 40/92579 files, top 501 words saved
Vocabulary created from 50/92579 files, top 501 words saved
Vocabulary created from 60/92579 files, top 501 words saved
Vocabulary created from 70/92579 files, top 501 words saved
Vocabulary created from 80/92579 files, top 501 words saved
Vocabulary created from 90/92579 files, top 501 words saved


In [20]:
w2i = np.load('word2idx.npy').item()
i2w = np.load('idx2word.npy').item()
v = len(w2i)
# 3. preprocess each document in CNN so that we get a form where a text is seen in vectors
out_file_list = [os.path.join(cnn_pre_dir,file) for file in os.listdir(cnn_dir)]
in_out_zip = zip(file_list, out_file_list)
cnt = 0
for in_file, out_file in in_out_zip:
    body_words, summary_words = parse_cnn(in_file, nlp)
    body_idx = word_list_to_idx_list(body_words, w2i, v)
    body_idx = [str(x) for x in body_idx]
    summary_idx = word_list_to_idx_list(summary_words,w2i,v)
    summary_idx = [str(x) for x in summary_idx]
    out = ' '.join(body_idx)+"::"+' '.join(summary_idx)
    with open(out_file,'w') as f:
        f.write(out)
    cnt+=1
    if cnt%1000==0:
        print('%d files processed so far' %(cnt))
    # small file batch    
    if cnt>2000:
        break

1000 files processed so far
2000 files processed so far


In [21]:
summary_words

[' ',
 'michel',
 'platini',
 'reiterates',
 'his',
 'preference',
 'for',
 'extra',
 'officials',
 'over',
 'technology',
 'in',
 'football',
 '.',
 ' ',
 'uefa',
 'president',
 'says',
 'the',
 'cost',
 'of',
 'installing',
 'goal',
 '-',
 'line',
 'technology',
 'is',
 'prohibitive',
 '.',
 ' ',
 'glt',
 'was',
 'introduced',
 'in',
 'the',
 'english',
 'premier',
 'league',
 'at',
 'the',
 'start',
 'of',
 'the',
 '2013',
 '-',
 '14',
 'season',
 '.',
 ' ',
 'platini',
 'defends',
 'uefa',
 "'s",
 'decision',
 'to',
 'expand',
 'the',
 'european',
 'championships',
 'to',
 '24',
 'teams',
 '.']

In [22]:
with open(file_list[0]) as f:
    text = f.read()
    text = text.lower()
    text = text.replace('\n\n',' ')
    text = text.replace('(cnn)','')
    text = text.split("@highlight")
    body = text[0]
    body_tokens = nlp(body)
    summaries = text[1:]
    summary_tokens = nlp(' '.join([x.strip()+'.' for x in summaries])+'.')

In [23]:
w2i = dict()
w2i['<PAD>']=0
w2i['<S>']=1
w2i['</S>']=2

i2w = dict()
i2w[0]='<PAD>'
i2w[1]='<S>'
i2w[2]='</S>'

for i,word in enumerate(word2idx):
    if len(w2i)>500:
        break
    w2i[word] = i+3
    i2w[i+3] = word

In [24]:
i2w

{0: '<PAD>',
 1: '<S>',
 2: '</S>',
 3: 'like',
 4: 'small',
 5: 'year',
 6: 'equality',
 7: 'open',
 8: 'most',
 9: 'got',
 10: 'al',
 11: 'go',
 12: 'because',
 13: 'process',
 14: 'election',
 15: 'state',
 16: 'about',
 17: 'technology',
 18: 'defense',
 19: 'rights',
 20: 'called',
 21: 'show',
 22: 'today',
 23: 'as',
 24: 'days',
 25: 'be',
 26: ':',
 27: 'e',
 28: 'issues',
 29: 'act',
 30: 'under',
 31: 'expected',
 32: 'policy',
 33: 'did',
 34: 'can',
 35: 'au',
 36: 'place',
 37: 'by',
 38: 'movement',
 39: 'to',
 40: 'women',
 41: 'will',
 42: 'amazon',
 43: 'two',
 44: 'has',
 45: 'official',
 46: 'past',
 47: '2010',
 48: 'against',
 49: 'sen',
 50: 'those',
 51: 'department',
 52: 'thomas',
 53: '2008',
 54: 'held',
 55: 'day',
 56: 'university',
 57: 'of',
 58: 'million',
 59: 'stewart',
 60: 'areas',
 61: 'does',
 62: 'never',
 63: 'deal',
 64: 'such',
 65: 'forces',
 66: 'camps',
 67: 'support',
 68: 'right',
 69: 'would',
 70: 'gift',
 71: 'members',
 72: 'through',

In [25]:
w2i

{' ': 142,
 '!': 357,
 '"': 271,
 '$': 206,
 "'": 401,
 "'ll": 377,
 "'m": 106,
 "'re": 410,
 "'s": 460,
 "'ve": 126,
 '(': 243,
 ')': 219,
 ',': 385,
 '-': 120,
 '--': 295,
 '.': 235,
 '...': 199,
 '0': 361,
 '1': 163,
 '2': 205,
 '2007': 141,
 '2008': 53,
 '2010': 47,
 '2012': 234,
 ':': 26,
 ';': 216,
 '</S>': 332,
 '<PAD>': 417,
 '<S>': 309,
 '<UNK>': 211,
 '?': 178,
 'a': 98,
 'able': 241,
 'about': 16,
 'according': 441,
 'across': 351,
 'act': 29,
 'administration': 209,
 'after': 294,
 'against': 48,
 'agency': 257,
 'ago': 188,
 'al': 10,
 'all': 412,
 'along': 263,
 'already': 468,
 'also': 242,
 'amazon': 42,
 'america': 384,
 'american': 469,
 'among': 160,
 'an': 252,
 'and': 92,
 'another': 102,
 'anti': 156,
 'any': 108,
 'are': 386,
 'areas': 60,
 'around': 348,
 'as': 23,
 'asked': 378,
 'assad': 411,
 'at': 276,
 'attention': 470,
 'au': 35,
 'authorities': 101,
 'away': 292,
 'back': 240,
 'based': 123,
 'be': 25,
 'because': 12,
 'been': 75,
 'before': 267,
 'being'

In [26]:
def nlp_to_tokens(token_list,word2idx):
    out = []
    oov2idx = dict()
    oov_idx = 0
    for token in token_list:
        word = token.text
        try:
            out.append(word2idx[word])
        except KeyError:
            if word not in oov2idx:
                oov_idx+=1
                oov2idx[word]=vocab_size+oov_idx
            out.append(oov2idx[word])
    return out, oov2idx

In [27]:
out, oov2idx = nlp_to_tokens(list(body_tokens),word2idx)

In [28]:
out

[23,
 31,
 61,
 36,
 57,
 10,
 450,
 501,
 12,
 119,
 261,
 73,
 10,
 390,
 30,
 221,
 4,
 220,
 8,
 70,
 502,
 9,
 70,
 503,
 18,
 10,
 504,
 7,
 505,
 10,
 506,
 507,
 508,
 9,
 316,
 4,
 509,
 8,
 4,
 510,
 18,
 4,
 427,
 5,
 44,
 250,
 297,
 7,
 45,
 8,
 99,
 501,
 5,
 46,
 511,
 512,
 94,
 6,
 10,
 513,
 514,
 390,
 30,
 474,
 4,
 254,
 7,
 337,
 22,
 70,
 515,
 516,
 6,
 128,
 7,
 517,
 518,
 9,
 519,
 520,
 12,
 62,
 521,
 482,
 6,
 11,
 522,
 523,
 5,
 11,
 64,
 524,
 4,
 164,
 525,
 6,
 4,
 478,
 305,
 9,
 4,
 59,
 337,
 25,
 492,
 501,
 73,
 10,
 390,
 526,
 527,
 7,
 528,
 10,
 488,
 508,
 5,
 323,
 13,
 522,
 390,
 16,
 4,
 383,
 390,
 7,
 529,
 14,
 220,
 5,
 37,
 530,
 10,
 531,
 532,
 5,
 12,
 4,
 266,
 533,
 6,
 184,
 390,
 534,
 7,
 129,
 4,
 204,
 14,
 10,
 471,
 13,
 116,
 7,
 535,
 118,
 26,
 536,
 6,
 9,
 6,
 537,
 94,
 325,
 17,
 211,
 6,
 4,
 215,
 109,
 538,
 195,
 4,
 539,
 540,
 541,
 542,
 298,
 5,
 323,
 6,
 543,
 75,
 537,
 36,
 544,
 545,
 12,
 35,
 182,
 

In [35]:
doc = nlp(body)
lst = list(doc)

In [36]:
words = list(set(lst))

In [38]:
out= []
oov_dict = dict()
for x in words:
    try:
        y = word2idx[x]
        out.append(y)
    except KeyError:
        oov_dict[x]

KeyError: after

In [41]:
word2idx['small']

352

In [44]:
c.most_common(300)[0][0]

'the'

In [45]:
import numpy as np
a = list(np.arange(32))

In [49]:
word_list = []
i = 0
for file_name in file_list[0:200]:
    with open(file_name) as f:
        text = f.read()
        text = text.lower()
        text = text.replace('\n\n',' ')
        text = text.replace('(cnn)','')
        text = text.split("@highlight")
        body = text[0]
        doc = list(nlp(body))
        word_list.extend([x.text for x in doc])
    if i%1000==0:
        print(i)
    i+=1

0


In [50]:
c = c + Counter(['a','b','a','b'])

In [52]:
c.most_common(100)[0][0]

'the'

In [53]:
len(list(set(word_list)))

14454

In [54]:
from torch import nn
import numpy as np
from torch.autograd import Variable
a = Variable(torch.LongTensor(np.arange(40).reshape(4,10)))
emb = nn.Embedding(40,20)

In [57]:
help(nn.LSTM)

Help on class LSTM in module torch.nn.modules.rnn:

class LSTM(RNNBase)
 |  Applies a multi-layer long short-term memory (LSTM) RNN to an input
 |  sequence.
 |  
 |  
 |  For each element in the input sequence, each layer computes the following
 |  function:
 |  
 |  .. math::
 |  
 |          \begin{array}{ll}
 |          i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
 |          f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
 |          g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
 |          o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
 |          c_t = f_t c_{(t-1)} + i_t g_t \\
 |          h_t = o_t \tanh(c_t)
 |          \end{array}
 |  
 |  where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
 |  state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{(t-1)}`
 |  is the hidden state of the previous layer at time `t-1` or the initial hidden
 |  state at time `0`, and :math

In [58]:
lstm = nn.LSTM(hidden_size=100,input_size=20, batch_first=True)

In [60]:
A=np.arange(24).reshape(4,6)
print(A)
A=A*(-1)
print(A)
A=A+15
print(A)
A = np.maximum(A,0)
print(A)
A = Variable(torch.LongTensor(A))
print(A)

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]
[[  0  -1  -2  -3  -4  -5]
 [ -6  -7  -8  -9 -10 -11]
 [-12 -13 -14 -15 -16 -17]
 [-18 -19 -20 -21 -22 -23]]
[[15 14 13 12 11 10]
 [ 9  8  7  6  5  4]
 [ 3  2  1  0 -1 -2]
 [-3 -4 -5 -6 -7 -8]]
[[15 14 13 12 11 10]
 [ 9  8  7  6  5  4]
 [ 3  2  1  0  0  0]
 [ 0  0  0  0  0  0]]
tensor([[ 15,  14,  13,  12,  11,  10],
        [  9,   8,   7,   6,   5,   4],
        [  3,   2,   1,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0]])


In [61]:
B = A==0
B.float().data

tensor([[ 0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.,  1.,  1.]])

In [62]:
c = Variable(torch.Tensor(1,4,100))
out=lstm(emb(a[:,0].unsqueeze(1)), (c,c))

In [64]:
out[0].size()

torch.Size([4, 1, 100])

In [65]:
emb(a).size()

torch.Size([4, 10, 20])

In [66]:
c = Counter(['a','a','a','a','a','b'])

In [None]:
import numpy as np
import spacy
import os
from collections import Counter
import torch
import glob
from spacy import attrs


word2idx = np.load('word2idx.npy').item()
vocab_size = len(word2idx)
batch_size = 1000

nlp = spacy.load('en') # loads default English object
cnn_dir = '/home/jatin/intern/internenv/cnn/stories/'
cnn_pre_dir = '/home/jatin/intern/internenv/cnn/preprocessed_stories/'

file_list = [os.path.join(cnn_dir,file) for file in os.listdir(cnn_dir)]
total_files = len(file_list)
files_read = 0
count = 0
for file in file_list[0:100]:
    with open(file) as f:
        text = f.read()
#       print(text)
        text = text.lower()
        text = text.replace('\n\n',' ')
        text = text.replace('(cnn)','')
        text = text.split("@highlight")
        body = text[0]
        body_words = body.split(' ')
        summaries = ' . '.join(text[1:])+' .'
        summary_words = summaries.split(' ')
        unique_words = list(set(body_words+summary_words))
        temp_dict = dict()
        oovs = 0
        for w in unique_words:
            try:
                temp_dict[w] = word2idx[w]
            except KeyError:
                oovs+=1
                temp_dict[w] = oovs+vocab_size
        body_idx = [str(temp_dict[x]) for x in body_words]
        summary_idx = [str(temp_dict[x]) for x in summary_words]
        out = ' '.join(body_idx)+'::'+' '.join(summary_idx)
        out_file = file.replace('/stories/','/preprocessed_stories/')
    with open(out_file,'w') as f:
        f.write(out)
    count+=1
    if count%100==0:
        print(count)


# 		doc = nlp(text)


# counter = Counter()
# while (files_read<total_files):
#     word_list = []
#     batch_files = file_list[files_read:min(files_read+1000,total_files)]
#     for file_name in batch_files:
#         with open(file_name) as f:
#             text = f.read()
#             text = text.lower()
#             text = text.replace('\n\n',' ')
#             text = text.replace('(cnn)','')
#             text = text.split("@highlight")
#             body = text[0]
#             doc = list(nlp(body))
#             word_list.extend([x.text for x in doc])

#     counter = counter + Counter(word_list)
#     files_read+=len(batch_files)
#     print("%d files read so far..." % files_read)
#     word2idx = {tup[0]: i for i,tup in enumerate(counter.most_common(vocab_size))}
#     np.save('word2idx.npy',word2idx)
# print("All merged!")
# word2idx = {tup[0]: i for i,tup in enumerate(counter.most_common(vocab_size))}
# np.save('word2idx.npy',word2idx)