In [32]:
import json
import torch
import torch.nn as nn
import dgl
import networkx as nx
import numpy as np
#from torchnlp.encoders.text import whitespace_encoder 
 
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [33]:
# Load the level-1 paragraph summaries that Phase 1 (transformer summarisation)
# exported to JSON. The result maps chapter title -> list of
# [original paragraph text, summarised text] pairs.
# A context manager is used so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open('book_dict_summary.json') as book_file:
    book = json.load(book_file)
book

{'CONTENTS': [['The Alchemist Paulo Coelho Translated by Alan R. Clarke. Published 1992. ISBN 0-7225-3293-8. CONTENTS Part One Part Two Epilogue',
   'summarize: The Alchemist Paulo Coelho Translated by Alan R. Clarke. Published 1992. ISBN 0-7225-3293-8.']],
 'PART ONE': [['The boy\'s name was Santiago . Dusk was falling as the boy arrived with his herd at an abandoned church. The roof had fallen in long ago, and an enormous sycamore had grown on the spot where the sacristy had once stood. He decided to spend the night there. He saw to it that all the sheep entered through the ruined gate, and then laid some planks across it to prevent the flock from wandering away during the night. There were no wolves in the region, but once an animal had strayed during the night, and the boy had had to spend the entire next day searching for it. He swept the floor with his jacket and lay down, using the book he had just finished reading as a pillow. He told himself that he would have to start readin

In [34]:
# Show the top-level keys of the loaded book dict.
book.keys()

dict_keys(['CONTENTS', 'PART ONE', 'PART TWO', 'EPILOGUE'])

In [35]:
# Chapter-level node labels: the keys of the book dict, in insertion order.
chapter_nodes = [chapter_title for chapter_title in book]
chapter_nodes

['CONTENTS', 'PART ONE', 'PART TWO', 'EPILOGUE']

In [36]:
# Flatten the {chapter: [[full_text, summary], ...]} dict into edge lists for a
# chapter -> paragraph -> summary tree, and collect the concatenated texts.
book_nodes_tree = []       # (chapter, summary) edges
chapter_para_nodes = []    # (chapter, paragraph id) edges
para_sentence_nodes = []   # (paragraph id, summary) edges
sentence_list = []         # summaries only, in reading order

# Text fragments are gathered in lists and joined once after the loop.
# The previous version called ``str.join`` and discarded the return value
# (strings are immutable), so both text variables silently stayed ''.
complete_parts = []
summary_parts = []

counter = 0  # running paragraph number across ALL chapters -> ids 'p1', 'p2', ...
for key, paragraphs in book.items():
    for paragraph in paragraphs:
        counter += 1
        full_text, summary = paragraph[0], paragraph[1]
        para_id = 'p' + str(counter)

        complete_parts.append(full_text)
        summary_parts.append(summary)
        book_nodes_tree.append((key, summary))
        chapter_para_nodes.append((key, para_id))
        para_sentence_nodes.append((para_id, summary))
        sentence_list.append(summary)

book_complete_texts = ' '.join(complete_parts)
book_summarised_texts = ' '.join(summary_parts)

'''
chapter_node
    paragraph_no_node
        sentence_node
'''

'\nchapter_node\n    paragraph_no_node\n        sentence_node\n'

In [37]:
# Peek at the first five (chapter, paragraph id) edges.
chapter_para_nodes[:5]

[('CONTENTS', 'p1'),
 ('PART ONE', 'p2'),
 ('PART ONE', 'p3'),
 ('PART ONE', 'p4'),
 ('PART ONE', 'p5')]

In [38]:
# Peek at the first five (paragraph id, summarised text) edges.
para_sentence_nodes[:5]

[('p1',
  'summarize: The Alchemist Paulo Coelho Translated by Alan R. Clarke. Published 1992. ISBN 0-7225-3293-8.'),
 ('p2',
  'Santiago spent the night at an abandoned church with his herd. He swept the floor with his jacket and lay down, using the book he had just finished reading as a pillow. He had had the same dream that night as a week ago, and once again he had awakened before it ended.'),
 ('p3',
  'The boy and his family are traveling to a village. They are going to meet the daughter of a merchant. The merchant was the proprietor of a dry goods shop, and he always demanded that the sheep be sheared in his presence.'),
 ('p4',
  '"I didn\'t know shepherds knew how to read," said a girl\'s voice behind him. "Well, usually I learn more from my sheep than from books," he answered. The girl was typical of the region of Andalusia with flowing black hair, and eyes that vaguely recalled the Moorish conquerors.'),
 ('p5',
  '"I know other girls in other places," he said to his sheep. 

In [39]:
# Keep only the paragraph id ('p1', 'p2', ...) from each (id, summary) pair.
para_nodes_list = [para_id for para_id, _summary in para_sentence_nodes]
para_nodes_list[:5]

['p1', 'p2', 'p3', 'p4', 'p5']

In [43]:
# Load the tokenizer and seq2seq model from the same pretrained checkpoint.
checkpoint = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Encode the paragraph id labels as a padded/truncated batch of PyTorch tensors.
para_nodes_tokens = tokenizer(
    para_nodes_list,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
para_nodes_tokens

{'input_ids': tensor([[    0,   642,   134,     2],
        [    0,   642,   176,     2],
        [    0,   642,   246,     2],
        [    0,   642,   306,     2],
        [    0,   642,   245,     2],
        [    0,   642,   401,     2],
        [    0,   642,   406,     2],
        [    0,   642,   398,     2],
        [    0,   642,   466,     2],
        [    0,   642,   698,     2],
        [    0,   642,  1225,     2],
        [    0,   642,  1092,     2],
        [    0,   642,  1558,     2],
        [    0,   642,  1570,     2],
        [    0,   642,   996,     2],
        [    0,   642,  1549,     2],
        [    0,   642,  1360,     2],
        [    0,   642,  1366,     2],
        [    0,   642,  1646,     2],
        [    0,   642,   844,     2],
        [    0,   642,  2146,     2],
        [    0,   642,  2036,     2],
        [    0,   642,  1922,     2],
        [    0,   642,  1978,     2],
        [    0,   642,  1244,     2],
        [    0,   642,  2481,     2]

In [46]:
# Forward pass over the encoded paragraph ids.
# The whole encoding is unpacked so the attention_mask travels with input_ids;
# the batch was built with padding=True, and without the mask any pad
# positions would be attended to like real tokens.
# no_grad: this call is for inspection only, so no autograd graph is needed.
with torch.no_grad():
    para_nodes_output = model(**para_nodes_tokens)
para_nodes_output

Seq2SeqLMOutput(loss=None, logits=tensor([[[ 1.0549e+01,  7.4387e-01,  3.2516e+00,  ...,  5.7467e-01,
           4.4127e-01,  3.3234e-01],
         [ 1.0549e+01,  7.4387e-01,  3.2516e+00,  ...,  5.7466e-01,
           4.4127e-01,  3.3234e-01],
         [-5.6332e-01,  6.8606e-01,  3.0796e+00,  ...,  6.5707e-01,
           3.7168e-01,  6.2994e-01],
         [-3.2926e+00, -3.6146e-02,  6.6308e+00,  ...,  2.2226e-01,
          -4.3229e-01,  3.4812e-03]],

        [[ 1.0797e+01,  7.2164e-01,  3.2840e+00,  ...,  5.5690e-01,
           4.3518e-01,  3.1876e-01],
         [ 1.0797e+01,  7.2164e-01,  3.2840e+00,  ...,  5.5690e-01,
           4.3518e-01,  3.1876e-01],
         [-1.8039e-01,  6.2079e-01,  2.9191e+00,  ...,  4.6676e-01,
           2.7982e-01,  3.6204e-01],
         [-2.5774e+00, -2.2652e-01,  7.5040e+00,  ...,  5.3335e-02,
          -6.8023e-01, -1.1983e-01]],

        [[ 1.0369e+01,  6.7868e-01,  3.1721e+00,  ...,  5.3400e-01,
           3.3986e-01,  3.3300e-01],
         [ 1.0369

In [45]:
# Encode the chapter titles as one padded batch of PyTorch tensors
# (shorter titles are padded up to the longest one).
chapter_nodes_tokens = tokenizer(
    chapter_nodes,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
chapter_nodes_tokens

{'input_ids': tensor([[    0, 46897, 29203,     2,     1,     1],
        [    0, 41623, 19551,     2,     1,     1],
        [    0, 41623, 34302,     2,     1,     1],
        [    0,  9662,  3063, 10207,  9162,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1]])}

In [None]:
# Forward pass over the encoded chapter titles.
# The encoding must be unpacked with ** so input_ids and attention_mask are
# passed as keyword arguments; handing the whole encoding object over
# positionally makes the model treat it as the input_ids tensor.
# no_grad: this call is for inspection only, so no autograd graph is needed.
with torch.no_grad():
    chapter_nodes_output = model(**chapter_nodes_tokens)
chapter_nodes_output

In [None]:
# Encode the (paragraph id, summary) tuples as a padded/truncated batch.
# NOTE(review): the input is a list of 2-tuples, which the tokenizer presumably
# encodes as text pairs (id + summary together) - confirm this is intended;
# `sentence_list` holds the summaries on their own.
para_sentence_nodes_token = tokenizer(
    para_sentence_nodes,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
para_sentence_nodes_token

In [None]:
# Shape of the token-id row for the first encoded entry.
para_sentence_nodes_token.get('input_ids')[0].shape

In [None]:
# Token-id matrix for the chapter titles (one row per title).
chapter_nodes_tokens.get('input_ids')

In [None]:
# Token ids for the first chapter title only.
chapter_nodes_tokens.get('input_ids')[0]

In [None]:
# Display the loaded model object (bare expression -> its repr).
model

In [None]:
# Build a DGL graph whose source and destination node ids are both the
# flattened chapter-title token ids; since the two sequences are identical,
# every edge connects a node to itself.
chapter_token_ids = torch.flatten(chapter_nodes_tokens.get('input_ids'))
dgl.graph((chapter_token_ids, chapter_token_ids))

In [48]:
# From the DGL tutorial: when a graph has no node features, a common practice
# is to give every node a learnable embedding before training.
# num_embeddings is a placeholder here - it should equal the number of nodes.
n_embed = nn.Embedding(num_embeddings=10, embedding_dim=3)

In [49]:
# Inspect the embedding table's weight parameter.
n_embed.weight

Parameter containing:
tensor([[-0.1929, -1.4352,  0.2694],
        [ 0.1583,  1.5303, -1.2957],
        [-0.1501,  0.7145,  1.7232],
        [-0.4000,  0.0812,  1.5697],
        [-0.5139,  0.3980, -1.0717],
        [-0.0657, -0.8058,  1.0126],
        [-1.2782, -2.2439,  1.4022],
        [ 0.4192,  1.4506, -0.4621],
        [-1.8659,  1.2447,  0.6979],
        [-0.6714, -0.0165, -0.2389]], requires_grad=True)

In [51]:
# Re-initialise the embedding table in place with Xavier/Glorot uniform values.
# Uses the trailing-underscore variant: the un-suffixed ``xavier_uniform`` is
# deprecated and emits a warning each time it is called.
nn.init.xavier_uniform_(n_embed.weight)

  nn.init.xavier_uniform(n_embed.weight)


Parameter containing:
tensor([[-0.3094,  0.2012,  0.1212],
        [ 0.0898,  0.6416, -0.4411],
        [-0.4324,  0.5715, -0.1998],
        [ 0.5071,  0.4990,  0.3110],
        [ 0.0160, -0.0930, -0.1378],
        [ 0.0920,  0.1256, -0.4895],
        [-0.0146,  0.2954, -0.6102],
        [-0.3967, -0.6280,  0.3653],
        [ 0.0612, -0.5646,  0.2377],
        [-0.5821,  0.0865,  0.1691]], requires_grad=True)

In [53]:
# Interactive help lookup for the Adam optimizer's signature (IPython `?` syntax).
# NOTE(review): leftover exploration aid - consider removing from the final notebook.
?torch.optim.Adam

[0;31mInit signature:[0m
[0mtorch[0m[0;34m.[0m[0moptim[0m[0;34m.[0m[0mAdam[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mparams[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbetas[0m[0;34m=[0m[0;34m([0m[0;36m0.9[0m[0;34m,[0m [0;36m0.999[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meps[0m[0;34m=[0m[0;36m1e-08[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweight_decay[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mamsgrad[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mforeach[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmaximize[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcapturable[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [