In [45]:
import torch
import torch.nn as nn
import numpy as np
import argparse,random,logging,numpy,os
import json
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm
from time import time
from tqdm import tqdm
import models

In [46]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s [INFO] %(message)s')
parser = argparse.ArgumentParser(description='extractive summary')
# model
parser.add_argument('-save_dir',type=str,default='checkpoints/')
parser.add_argument('-embed_dim',type=int,default=100)
parser.add_argument('-embed_num',type=int,default=100)
parser.add_argument('-pos_dim',type=int,default=50)
parser.add_argument('-pos_num',type=int,default=100)
parser.add_argument('-seg_num',type=int,default=10)
parser.add_argument('-kernel_num',type=int,default=100)
parser.add_argument('-kernel_sizes',type=str,default='3,4,5')
parser.add_argument('-model',type=str,default='RNN_RNN')
parser.add_argument('-hidden_size',type=int,default=200)
# train
parser.add_argument('-lr',type=float,default=1e-3)
parser.add_argument('-batch_size',type=int,default=32)
parser.add_argument('-epochs',type=int,default=20)
parser.add_argument('-seed',type=int,default=1)
parser.add_argument('-train_dir',type=str,default='data/train.json')
parser.add_argument('-val_dir',type=str,default='data/val.json')
parser.add_argument('-embedding',type=str,default='data/embedding.npz')
parser.add_argument('-word2id',type=str,default='data/word2id.json')
parser.add_argument('-report_every',type=int,default=5)
parser.add_argument('-seq_trunc',type=int,default=50)
parser.add_argument('-max_norm',type=float,default=1.0)
# test
parser.add_argument('-load_dir',type=str,default='checkpoints/RNN_RNN_seed_1.pt')
parser.add_argument('-test_dir',type=str,default='data/test.json')
parser.add_argument('-ref',type=str,default='outputs/ref')
parser.add_argument('-origin',type=str,default='outputs/origin')
parser.add_argument('-lables',type=str,default='outputs/lables')
parser.add_argument('-hyp',type=str,default='outputs/hyp')
parser.add_argument('-pre',type=str,default='outputs/predict')
parser.add_argument('-filename',type=str,default='x.txt') # TextFile to be summarized
parser.add_argument('-topk',type=int,default=15)
# device
parser.add_argument('-device',type=int)
# option
parser.add_argument('-test',action='store_true')
parser.add_argument('-train',action='store_true')
parser.add_argument('-debug',action='store_true')
parser.add_argument('-predict',action='store_true')
parser.add_argument('-predict_all',action='store_true') # predict all
args = parser.parse_args()

In [47]:
embed = torch.Tensor(np.load("data/embedding.npz")['embedding'])

In [48]:
embed.shape

torch.Size([153824, 100])

In [49]:
import utils

In [50]:
with open("data/word2id.json") as f:
    word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)
    print(vocab)

<utils.Vocab.Vocab object at 0x1c2703908>


In [51]:
with open(args.train_dir) as f:
    examples = [json.loads(line) for line in f]
    train_dataset = utils.Dataset(examples)

In [54]:
with open(args.val_dir) as f:
    examples = [json.loads(line) for line in f]
    val_dataset = utils.Dataset(examples)

In [55]:
args.embed_num =embed.size(0)
args.embed_num

153824

In [56]:
args.embed_dim=embed.size(1)
args.embed_dim

100

In [57]:
args.kernel_sizes=[int(ks) for ks in args.kernel_sizes.split(',')]
args.kernel_sizes

[3, 4, 5]

In [58]:
net = getattr(models,args.model)(args,embed)

In [59]:
net

RNN_RNN(
  (abs_pos_embed): Embedding(100, 50)
  (rel_pos_embed): Embedding(10, 50)
  (embed): Embedding(153824, 100, padding_idx=0)
  (word_RNN): GRU(100, 200, batch_first=True, bidirectional=True)
  (sent_RNN): GRU(400, 200, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=400, out_features=400, bias=True)
  (content): Linear(in_features=400, out_features=1, bias=False)
  (salience): Bilinear(in1_features=400, in2_features=400, out_features=1, bias=False)
  (novelty): Bilinear(in1_features=400, in2_features=400, out_features=1, bias=False)
  (abs_pos): Linear(in_features=50, out_features=1, bias=False)
  (rel_pos): Linear(in_features=50, out_features=1, bias=False)
)

In [60]:
train_iter = DataLoader(dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True)
val_iter = DataLoader(dataset=val_dataset,
            batch_size=args.batch_size,
            shuffle=False)

In [62]:
criterion = nn.BCELoss()
criterion

BCELoss()

In [63]:
params = sum(p.numel() for p in list(net.parameters())) / 1e6
print('#Params: %.1fM' % (params))

#Params: 17.0M


In [65]:
min_loss = float('inf')
optimizer = torch.optim.Adam(net.parameters(),lr=args.lr)

In [44]:
#Get reference summary by lable
for i in range(len(origins)):
    ref=[]
    name_file= origins[i].split("/")[2]
    ori = readFile(origins[i])
    lable = readFile(lables[i])
    sentent=  ori.split("\n")
    list_lables= lable.split("\n")
    for i in range(len(sentent)):
        if(list_lables[i] == '1'):
            ref.append(sentent[i])
    with open("outputs/refer/"+name_file, 'w') as f:
        for item in ref:
            f.write("%s\n" % item)
    
    
    

NameError: name 'origins' is not defined

In [48]:
from __future__ import print_function
import torch

In [3]:
x = torch.randn(3)
print(x)

tensor([-0.4390, -1.3339, -0.6605])


In [4]:
y = torch.randn(3)

In [5]:
y.size()

torch.Size([3])

In [6]:
import torch.nn as nn

In [7]:
cos = nn.CosineSimilarity(dim=0)

In [8]:
print (cos(x,y))

tensor(-0.3479)


In [9]:
z = torch.randn(400)

In [10]:
z= z.view(1,-1)

In [11]:
2*z

tensor([[-0.7233,  2.3138,  3.0266, -2.6170,  2.1044,  3.6435,  2.8276, -1.4576,
          0.1917,  2.7865, -0.9188, -1.4830, -0.9808,  1.4554,  1.6691, -0.6281,
          0.8911,  2.4359, -1.6552,  1.9109,  0.6381, -2.4139, -1.1850, -0.6237,
          0.6718,  0.2315,  0.4460, -3.3131,  0.0385,  0.8599,  0.7817, -1.4436,
         -0.5166,  0.1615, -1.2504, -0.3631, -0.7480,  1.6206,  1.8567,  0.6369,
         -1.6678, -0.6702,  1.1897,  1.9984,  0.0361,  2.3814, -1.0759,  2.9297,
          2.3735,  0.4552, -0.7786, -2.9721,  3.2385, -2.2133,  1.4469, -0.1890,
         -1.5967, -2.0535, -0.3216, -4.4474,  1.7736,  0.6548, -1.4450, -1.9984,
         -0.1543,  0.6419,  3.2017, -1.3252,  3.4700, -2.8916,  1.5636,  2.4838,
          3.3101,  0.5045,  1.9046,  0.5430, -2.4051,  0.7972, -1.5550,  5.6442,
          3.3473,  0.7052, -3.3750,  3.3435, -2.9889,  2.1393, -0.2408,  1.3990,
         -0.0066,  3.3057, -1.8161,  0.7046,  2.2865, -1.2843, -2.3651,  1.9517,
          2.6358,  0.8988,  

In [47]:
import torch.nn as nn

In [20]:
sents_i = torch.randn(39,400)

In [21]:
sents_i

tensor([[-1.7238,  1.1632, -1.3943,  ...,  0.7767, -0.2968,  0.4621],
        [-0.7577,  0.4423, -0.3501,  ...,  0.3651,  0.3558,  1.3486],
        [-0.5842, -1.3438, -1.6129,  ...,  0.9289, -0.0453,  0.2647],
        ...,
        [-1.1302,  1.3264, -0.3430,  ...,  0.5151,  1.1447, -1.0910],
        [ 1.5417,  0.1207,  0.9869,  ...,  0.6856, -0.0847,  1.1233],
        [ 0.4594,  1.2397,  0.2801,  ...,  0.9821,  0.0732,  0.0219]])

In [31]:
sents_i[0]

tensor([-1.7238,  1.1632, -1.3943,  1.4063, -0.0736,  0.0145,  0.2460,  1.0829,
         0.5990, -0.3173,  1.0523, -1.9302,  2.0329, -1.3161, -0.6439,  0.7299,
        -1.0637,  0.6751, -1.0324,  1.1895, -0.8138,  0.4279, -0.1901,  1.0942,
        -0.3916,  0.9965,  2.1399, -0.0207,  0.6334, -0.2476,  0.5143,  0.5443,
        -0.0363, -0.2521,  0.2398,  0.8012, -0.2443,  2.0065,  0.0853,  1.2304,
         2.4518,  0.1829,  0.7641, -0.2324, -0.0158, -1.0129, -1.0456, -0.2127,
        -0.8117,  0.2734, -0.0655,  0.1092, -0.1427, -0.9616, -0.1284, -0.0986,
         0.4064,  0.3429,  0.7177,  1.3544,  0.4064, -2.2046,  0.2358,  0.3073,
        -0.5063, -0.8713,  1.5288,  1.3977,  0.7927,  0.9255,  0.3852, -0.1123,
         0.0627,  0.9523,  0.7242, -0.8588,  0.0662,  0.6654, -0.8231,  1.1614,
         0.3174,  0.4534, -0.2009, -0.9590,  1.0892, -0.2580,  0.7737, -0.7711,
         0.3884,  0.6391,  0.3703, -1.7752, -1.0153, -1.8481,  1.0819, -1.3577,
        -0.6685,  1.1388,  1.2461,  0.39

In [14]:
content = nn.Linear(2 * z, 1, bias=False)

TypeError: new(): argument 'size' must be tuple of ints, but found element of type Tensor at pos 2

In [15]:
content=nn.Linear(2*5,4)

In [16]:
content

Linear(in_features=10, out_features=4, bias=True)

In [32]:
len(sents_i[:-2])

37

In [23]:
import networkx as nx

In [40]:
 def page_rank_rel(sents_i, thres=0.1):
        """
        PageRank value of the sentence based on the sentence map

        :param thres: int
            Every two sentences are regarded relevant if their similarity is above a threshold.
        :return: dict
            Dictionary of index nodes with PageRank as value.
        """
        G = nx.Graph()
        cosine = nn.CosineSimilarity(dim=0)

        # Build a sentence map.
        # Every two sentences are regarded relevant if their similarity is above a threshold.
        # Every two relevant sentences are connected with a unidirectional link.
        for i in range(len(sents_i[:-2])):
            for j in range(len(sents_i[i + 1:])):
                cosine_similarity_sentence_doc = cosine(sents_i[i], sents_i[j])
                #if cosine_similarity_sentence_doc > thres:
                G.add_edge(i, j)

        pr = nx.pagerank(G)

        return pr

In [45]:
page_rank_rel(sents_i).get(0,0)

0.05490694621554126

In [50]:
result = torch.cat(page_rank_rel(sents_i).get(0,0), dim=0)

TypeError: cat(): argument 'tensors' (position 1) must be tuple of Tensors, not float

In [51]:
input = torch.Tensor(2, 4, 3)

In [55]:
var = [] 
for I in range(0, 5):
    var.append(new_stuff.unsqueeze(0))
var_tensor = torch.cat(var)

NameError: name 'new_stuff' is not defined

In [67]:
py_list = [page_rank_rel(sents_i).get(0,0)]
pt_tensor_from_list = torch.FloatTensor(py_list)

In [68]:
pt_tensor_from_list.view(1,-1)

tensor([[0.0549]])