In [1]:
!which python

/home/ec2-user/anaconda3/envs/python3/bin/python


In [2]:
!conda install cudatoolkit=10.0 -y
!pip install git+https://github.com/allenai/longformer.git

Collecting package metadata (current_repodata.json): done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/noarch::tqdm==4.62.3=pyhd8ed1ab_0
  - conda-forge/noarch::black==21.11b1=pyhd8ed1ab_0
  - conda-forge/linux-64::conda-package-handling==1.7.3=py38h497a2fe_1
  - conda-forge/noarch::dask-core==2021.11.2=pyhd8ed1ab_0
  - conda-forge/noarch::imageio==2.9.0=py_0
  - conda-forge/linux-64::pytest==6.2.5=py38h578d9bd_1
  - conda-forge/linux-64::watchdog==2.1.6=py38h578d9bd_1
  - conda-forge/linux-64::aiohttp==3.8.1=py38h497a2fe_0
  - conda-forge/linux-64::astropy==5.0=py38h6c62de6_0
  - conda-forge/linux-64::bokeh==2.4.2=py38h578d9bd_0
  - conda-forge/linux-64::distributed==2021.11.2=py38h578d9bd_0
  - conda-forge/noarch::flask==2.0.2=pyhd8ed1ab_0
  - conda-forge/linux-64::matplotlib-base==3.5.0=py38hf4fb855_0
  - conda-forge/noarch::nbformat==5.1.3=pyhd8ed1ab_0
  - cond

In [3]:
import torch
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer

config = LongformerConfig.from_pretrained('longformer-base-4096/') 
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': for regular n2 attantion
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'

model = Longformer.from_pretrained('longformer-base-4096/', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document

input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
# model = model.cuda(); input_ids = input_ids.cuda()

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21,]] =  2  # Set global attention based on the task. For example,
                                     # classification: the <s> token
                                     # QA: question tokens

# padding seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

output = model(input_ids, attention_mask=attention_mask)[0]

In [4]:
!pip install datasets 

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


# prepare data

In [72]:
#preprocess
import json
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [73]:
def load_json(json_file):
    # Opening JSON file 
    f = open(json_file) 

    # returns JSON object as  
    # a dictionary 
    data = json.load(f) 
    content_ls = [' '.join(data['content'][str(i)]['word_list']) for i in range(len(data['content']))]
    #label_ls = [data['content'][str(i)]['dialogue_label'] for i in range(len(data['content']))]
    label_ls = [int(data['content'][str(i)]['ner_label'][0]) for i in range(len(data['content']))]
    role_dict = data['role_id']
    return content_ls,label_ls,role_dict

def edit_b(x,role):
    res = []
    for i in range(len(x)):
        res_str = "center: "+x[i] + " roles: "+str(list(role.values())) + " paragraph: " + ','.join(x)
        res.append(res_str) 
    return res

In [74]:
#load one book
def load_book(book_path,tag):
    chapter_ls = os.listdir(book_path)
    cut=int(len(chapter_ls)*0.9)
    if tag is True:
        chapter_ls = chapter_ls[:cut]
    else:
        chapter_ls = chapter_ls[cut:]
        
    print ("<<< books: ", chapter_ls)
    res = []
    for i in chapter_ls:
        if i[-4:]=="json":
            json_file = os.path.join(book_path,i)
            content_ls, label_ls, role_dict = load_json(json_file)
            content_ls = edit_b(content_ls,role_dict)
            
            df_res = pd.DataFrame({'sentence1_key':content_ls,'label':label_ls})
            df_res = df_res[df_res['label']!=0]
            df_res["label"] = df_res["label"].map(lambda x: role_dict[str(x)])
            df_res["label"] = df_res["label"].map(lambda x: x+" said the sentence")

            df_res['sentence1_key'] = df_res['sentence1_key'].map(lambda x: x.replace('“','"'))
            df_res['sentence1_key'] = df_res['sentence1_key'].map(lambda x: x.replace('”','"'))
            res.append(df_res)
    res_table = pd.concat(res)

    return res_table

In [75]:
book_path = 'stary/new_example_ten_json'
train = load_book(book_path,True)
test = load_book(book_path,False)

<<< books:  ["20%(2271377The Mafia's Good Wife)(1).json", '80%(2059119Heart of Freeman).json', '60%_(2040244Lunar wolvesHis to own Book 1Complete).json', '30%_(2165912The Curse Of Violet Wraith).json', '60%_(2144894A Moonlit Encounter).json', '.ipynb_checkpoints', '20%_(1993322ASHER RICK).json', '20%_(2070697Revenge on my Ex-Husband).json', '60%_(2164082New Husband For My Wife) .json']
<<< books:  ['80%(2192588love&mate) (1).json', '85%_(2061307His Ruthless Assistant (completed )).json']


In [80]:
train['len'] = train['sentence1_key'].map(lambda x: len(x))

In [82]:
train['len'].describe()

count      1993.000000
mean     102760.614150
std       23012.371911
min       62817.000000
25%       68870.000000
50%      117798.000000
75%      122920.000000
max      123295.000000
Name: len, dtype: float64

In [83]:
test1, test2 = train_test_split(test,test_size=0.1,random_state=0)
train = pd.concat([train,test1])
print ("train size {}, test size{}".format(train.shape,test2.shape))

train size (2611, 3), test size(69, 2)


In [84]:
import os

path = "stary/model_b_longdata"

# Check whether the specified path exists or not
isExist = os.path.exists(path)

if not isExist:
    os.makedirs(path)
    print("The new directory is created!")
    
train[["label","sentence1_key"]].to_csv('stary/model_b_longdata/train.csv',index=False,encoding='utf-8')
test[["label","sentence1_key"]].to_csv('stary/model_b_longdata/test.csv',index=False,encoding='utf-8')

The new directory is created!


# train

In [3]:
python -u longformer/scripts/summarization_opt.py \
--model_path 'longformer-encdec-base-16384' \
--tokenizer 'longformer-encdec-base-16384' \
--epochs 5 \
--max_input_len 10000 \
--batch_size 1 \
--dataset_name "test" \
--train_file 'stary/model_b_longdata/train.csv' \
--test_file 'stary/model_b_longdata/test.csv' \
--validation_file 'stary/model_b_longdata/test.csv' 

OSError: Background processes not supported.

In [None]:
from collections import OrderedDict   #导入此模块
base_weights = torch.load(ckpt_pth)['state_dict']
new_state_dict = OrderedDict()

for k, v in base_weights.items():
    #print (k)
    if k=='model.final_logits_bias':
        new_state_dict['final_logits_bias'] = v 
        new_state_dict[k] = v 
    else:
        new_state_dict[k] = v 


In [44]:
import torch 
from longformer import LongformerEncoderDecoderForConditionalGeneration, LongformerEncoderDecoderConfig
from transformers import AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#config = LongformerEncoderDecoderConfig.from_pretrained('/home/ec2-user/SageMaker/summarization/test/_ckpt_epoch_0_v0.ckpt')
config = LongformerEncoderDecoderConfig.from_pretrained('longformer-encdec-base-16384')
#config.attention_dropout = self.args.attention_dropout
#config.gradient_checkpointing = self.args.grad_ckpt
config.attention_mode = 'sliding_chunks'
#config.attention_window = [self.args.attention_window] * config.encoder_layers

model.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained('longformer-encdec-base-16384',config = config)
#ckpt_pth = '/home/ec2-user/SageMaker/summarization/test/_ckpt_epoch_0_v2.ckpt'
#x = torch.load(ckpt_pth)['state_dict']
#x['model.model.final_logits_bias'] = x['model.final_logits_bias']

model.load_state_dict(new_state_dict)
#model = torch.load(ckpt_pth,map_location=device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained('longformer-encdec-base-16384')

tokenizer.model_max_length = 4000

In [63]:

SAMPLE_TEXT = ' '.join(['Hello world! '] * 200)  # long input document

input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)

attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
attention_mask[input_ids == tokenizer.pad_token_id] = 0
attention_mask[:, 0] = 2 
half_padding_mod = model.config.attention_window[0]
input_ids, attention_mask = pad_to_window_size(  # ideally, should be moved inside the LongformerModel
                input_ids, attention_mask, half_padding_mod, tokenizer.pad_token_id)



In [65]:
generated_ids =  model.model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                            use_cache=True, max_length=256,
                                            num_beams=1)

In [67]:
generated_str = tokenizer.batch_decode(generated_ids.tolist(), skip_special_tokens=True)

In [68]:
generated_str

['Bro said the sentence']