In [21]:

import transformers
from transformers import pipeline, set_seed
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    BertTokenizer,
    GPT2Tokenizer
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers import GPT2LMHeadModel, AutoTokenizer, AutoModelForMaskedLM

# Local checkpoint directory (the path suggests GPT-2 large).
model_name_or_path = "/home2/zhanghanqing/pretrained_model/gpt2/large"

# Tokenizer and LM-head model are both loaded from the same checkpoint directory.
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)


In [23]:
# Probe: what does the tokenizer return for an empty string?
a = ''
tokenizer(a)

{'input_ids': [], 'attention_mask': []}

In [40]:
# Probe: f-string interpolation of a list length directly after a literal "a".
print(f"a{len([1,3,4])}")

a3


In [108]:
# Batched generation check: two prompts of different length are padded into one batch.
# The tokenizer has no pad token of its own here, so EOS is reused; padding goes on the
# left so that each row ends with the prompt's last real token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token  # to avoid an error

sentences = ["Hello, my dog is a little",
            "Text and words include: costume, dance, perform, stage, wear. Start generation:"
            ]
inputs = tokenizer(sentences, return_tensors="pt", padding=True)

output_sequences = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    num_beams=3,
    do_sample=False,  # disable sampling to test if batching affects output
    max_length=30,
    # Passed explicitly: leaving it unset makes generate() fall back to eos_token_id
    # and log "Setting `pad_token_id` to `eos_token_id`" on every call.
    pad_token_id=tokenizer.pad_token_id,
)
print(output_sequences)

# One decoded line per generated row (special tokens are kept so padding stays visible).
for sequence in output_sequences:
    print(tokenizer.decode(sequence))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 15496,    11,   616,  3290,   318,   257,  1310,  1643,   286,
           257,  2356,   287,   262,   840,    11,   475,   339,   338,   257],
        [ 8206,   290,  2456,  2291,    25, 16569,    11,  9280,    11,  1620,
            11,  3800,    11,  5806,    13,  7253,  5270,    25,   352,    12,
            17,  2745,    13,   198,   198,    16,    12,    17,  2745,    13]])
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Hello, my dog is a little bit of a pain in the ass, but he's a
Text and words include: costume, dance, perform, stage, wear. Start generation: 1-2 weeks.

1-2 weeks.


In [9]:
# %%
import os
import json
import spacy
import nltk
from nltk.tokenize import sent_tokenize
import random
from glob import glob
from pathlib import Path
from multiprocessing import Pool
from tqdm import tqdm
import pandas as pd


def encoder_data_target(data):
    """Turn multi-sentence examples into content/target/keywords records.

    Args:
        data: iterable of sequences of strings. All strings but the last are
            joined with single spaces to form the "content"; the last string
            is the "target".

    Returns:
        A list of dicts with keys "content", "target" and "keywords", where
        "keywords" is a '#'-joined string of lemmas taken from the target.

    An example is skipped when its target has fewer than 6 whitespace-separated
    words or yields fewer than 4 distinct keywords. When more than 5 keywords
    are found, 5 of them are sampled at random; the kept keywords are shuffled.
    The spaCy model is loaded on every call.
    """
    keep_pos = ('VERB', 'NOUN')
    nlp = spacy.load('en_core_web_sm')  # load the pretrained pipeline
    records = []

    for example in data:
        context_text = ' '.join(example[:-1])
        target = example[-1]

        if len(target.split()) < 6:
            continue

        # Tokenise, POS-tag and lemmatise the lower-cased target, keeping verb/noun
        # lemmas of at least 3 characters that contain neither '-' nor '.'.
        candidates = []
        for token in nlp(target.lower()):
            if token.pos_ not in keep_pos:
                continue
            lemma = str(token.lemma_)
            if len(lemma) < 3 or '-' in lemma or '.' in lemma:
                continue
            candidates.append(lemma)

        keywords = list(set(candidates))  # de-duplicate

        if len(keywords) < 4:
            continue
        if len(keywords) > 5:
            keywords = random.sample(keywords, 5)
        random.shuffle(keywords)

        records.append({
            "content": context_text,
            "target": target,
            "keywords": '#'.join(keywords),
        })

    return records


def encoder_data(data):
    """Turn raw text lines into keyword-annotated sentence records.

    Args:
        data: iterable of strings, one candidate sentence per element.

    Returns:
        A list of dicts with keys "content" (always the empty string),
        "target" (the stripped line) and "keywords" (a '#'-joined string of
        lemmas taken from the lower-cased line).

    A line is skipped when, after stripping, it is shorter than 3 characters,
    starts with a lower-case character, contains the substring "amp", or
    yields fewer than 4 distinct keywords. When more than 5 keywords are
    found, a random 4 or 5 of them are kept; the kept keywords are shuffled.
    The spaCy model is loaded on every call.
    """
    keep_pos = ('VERB', 'NOUN')
    nlp = spacy.load('en_core_web_sm')  # load the pretrained pipeline
    records = []

    for raw_line in data:
        sentence = raw_line.strip(' \n\t\r),\\')

        # NOTE(review): the "amp" test is a plain substring match, so it also rejects
        # ordinary words such as "example"; presumably meant for "&amp;" residue - confirm.
        if len(sentence) < 3 or sentence[0].islower() or "amp" in sentence:
            continue

        # Tokenise, POS-tag and lemmatise the lower-cased sentence, keeping verb/noun
        # lemmas of at least 3 characters that contain neither '-' nor '.'.
        candidates = []
        for token in nlp(sentence.lower()):
            if token.pos_ not in keep_pos:
                continue
            lemma = str(token.lemma_)
            if len(lemma) < 3 or '-' in lemma or '.' in lemma:
                continue
            candidates.append(lemma)

        keywords = list(set(candidates))  # de-duplicate

        if len(keywords) < 4:
            continue
        if len(keywords) > 5:
            keywords = random.sample(keywords, random.randint(4, 5))
        random.shuffle(keywords)

        records.append({
            "content": '',
            "target": sentence,
            "keywords": '#'.join(keywords),
        })

    return records

# Directory that holds the pre-formatted Wikipedia text files.
json_path = '/home2/zhanghanqing/formatted_wikipedia'

# Output files; they are left open here because the records are written later in this cell.
out_file_train = os.path.join('./data/', 'wiki_train.json')
out_file_train = open(out_file_train, 'w', encoding='utf8')


out_file_val = os.path.join('./data/', 'wiki_val.json')
out_file_val = open(out_file_val, 'w', encoding='utf8')


#######################################wiki dataset########################################

print("reading data from %s ..." % json_path)

filenames = glob(os.path.join(json_path,'wiki**.format'))

# Keep only the last 40000 lines of every input file. The context manager closes each
# file right after it has been read instead of leaving the handle open.
data = []
for data_file in filenames:
    with Path(data_file).open() as fin:
        data += fin.readlines()[-40000:]

# Per-length-bucket counters (bucket = number of whitespace-separated words in a line).
d_0_15 = 0   # lines with 10-14 words, capped once the counter exceeds 20000
d_15_20 = 0  # lines with 15-20 words, capped once the counter exceeds 100000
d_20 = 0     # bucket for longer lines is disabled below, so this stays 0

filter_data = []

for d in data:
    l_d = len(d.split())

    # Stop as soon as both active buckets are full. d_20 is never incremented while
    # its branch is disabled, so including it here would make this exit unreachable.
    if d_0_15>20000 and d_15_20>100000:
        break

    if l_d>=10 and l_d<15  and d_0_15<=20000:
        d_0_15 +=1
        filter_data.append(d)

    elif l_d>=15 and l_d<=20 and d_15_20<=100000:
        d_15_20 +=1
        filter_data.append(d)

    # elif l_d>20 and l_d<=32 and d_20<=40000:
    #     d_20 += 1
    #     filter_data.append(d)

print(d_0_15,d_15_20, d_20)

data = filter_data
print("data num is:", len(data))

# Split the lines into chunks for the worker pool. The chunk size is at least 1 so that
# fewer than 48 lines (or none) do not produce a zero step for range().
n = max(1, len(data) // 48)
data_list =[data[i:i + n] for i in range(0, len(data), n)]

with Pool(48) as p:

    data =list(tqdm(p.imap(encoder_data, data_list), total=len(data_list)))

# Flatten the per-chunk result lists into one list of records.
record = [item for subl in data for item in subl]

print("the record is:",len(record))

#######################################roc dataset########################################
# roc = pd.read_csv("../data/roc/roc.csv")
# data = []
# for row in roc.itertuples():
#     # if random.randint(1,5)%5==1:
#     cut_pos = random.randint(3,5)
#     # else:
#     #     cut_pos = 5
#     context = [row[i+3] for i in range(cut_pos)]
#     data.append(context)
    
# random.shuffle(data)


# n = int(len(data)/48)
# data_list =[data[i:i + n] for i in range(0, len(data), n)]

# with Pool(48) as p:

#     data =list(tqdm(p.imap(encoder_data_target, data_list), total=len(data_list)))

# record += [item for subl in data for item in subl]

#######################################hc dataset########################################
# data_file = "../data/hc/train.src"
# data =[eval(x.strip(' \n\t\r),\\')) for x in Path(data_file).open().readlines()]

# data_file = "../data/hc/valid.src"
# data +=[eval(x.strip(' \n\t\r),\\')) for x in Path(data_file).open().readlines()]

# print("data num is:", len(data))

# n = int(len(data)/48)
# data_list =[data[i:i + n] for i in range(0, len(data), n)]

# with Pool(48) as p:

#     data =list(tqdm(p.imap(encoder_data, data_list), total=len(data_list)))

# record += [item for subl in data for item in subl]

#######################################commonsense dataset########################################

# json_path = "../data/commongen.train.jsonl"
# with open(json_path) as out:
#     lines = out.readlines()

#     for l in tqdm(lines):
#         item = json.loads(l.strip())
#         concept_set = item['concept_set']
#         for c in item['scene']:
#             c = c.strip()
#             # if c.endswith('.'):
#             #     c= c[:-1].strip()
#             # if c[0].islower():
#             #     continue
#             record.append({
#                             "content":"",
#                             "target":c,
#                             "keywords":concept_set})

#######################################end########################################
random.shuffle(record)

print("total data number is:", len(record))

# 99% train / 1% validation. Both slices use the same cut index so they are disjoint;
# slicing validation from the 1% mark would put almost every training record into it.
split_at = int(len(record)*0.99)

out_file_train.write(json.dumps(record[:split_at])+'\n')
out_file_train.close()  # close() also flushes, and releases the handle

out_file_val.write(json.dumps(record[split_at:])+'\n')
out_file_val.close()

reading data from /home2/zhanghanqing/formatted_wikipedia ...
20001 100001 0
data num is: 120002


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [00:42<00:00,  1.15it/s]


the record is: 98516
total data number is: 98516


In [33]:
# Collect CommonGen scenes as records: one record per scene that does not start with a
# lower-case character, with the item's concept set stored as the keywords.
json_path = "../data/commongen.train.jsonl"
data = []
with open(json_path) as fin:
    lines = fin.readlines()

for l in tqdm(lines):
    l = l.strip()
    if not l:
        # A blank line is not a JSON document; json.loads would raise on it.
        continue
    item = json.loads(l)
    concept_set = item['concept_set']
    for c in item['scene']:
        c = c.strip()
        # Empty scenes are skipped as well: indexing c[0] on "" raises IndexError.
        if not c or c[0].islower():
            continue
        data.append({
                        "content":"",
                        "target":c,
                        "keywords":concept_set})

100%|█████████████████████████████████| 32651/32651 [00:00<00:00, 277457.89it/s]


In [34]:
# Number of records currently held in `data`.
len(data)


15179

In [31]:
# Probe: splitting on '.' and taking the first piece returns the whole string
# when the string contains no period.
a ="ni hao hello"
c = a.split('.')[0]
print(c)

ni hao hello


In [10]:
# Reload the training records and tidy each target that ends with a period: the text
# before the final '.' is stripped of surrounding whitespace and the '.' is re-attached.
json_path = os.path.join('./data/', 'wiki_train.json')
data = []
with open(json_path) as out:
    lines = json.load(out)

for item in tqdm(lines):
    c = item["target"]
    c = c[:-1].strip()+'.' if c.endswith('.') else c
    data.append(dict(content=item["content"], target=c, keywords=item["keywords"]))

print(len(data))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 158344/158344 [00:00<00:00, 1023837.44it/s]

158344





In [11]:
# Write the records to a new JSON file. The with-block closes (and thereby flushes)
# the file even if serialisation fails, instead of leaving the handle open.
out_path = os.path.join('./data/', 'n_wiki_train.json')
print("total data number is:", len(data))
with open(out_path, 'w', encoding='utf8') as out_file_train:
    out_file_train.write(json.dumps(data)+'\n')

total data number is: 158344


In [23]:
# %%
import os
import json
import spacy
import nltk
from nltk.tokenize import sent_tokenize
import random
from glob import glob
from pathlib import Path
from multiprocessing import Pool
from tqdm import tqdm
import pandas as pd




def encoder_data(data):
    """Extract shuffled verb/noun keywords for each usable input line.

    Args:
        data: iterable of strings, one candidate sentence per element.

    Returns:
        A list of dicts with keys "content" (always the empty string),
        "target" (the stripped line) and "keywords" (lemmas joined by '#').

    Lines are dropped when the stripped text has fewer than 3 characters,
    begins with a lower-case character, contains the substring "amp", or
    produces fewer than 4 distinct keywords. If more than 5 keywords are
    found, a random 4 or 5 are kept. The kept keywords are shuffled.
    The spaCy model is loaded on every call.
    """
    wanted_pos = ('VERB', 'NOUN')
    nlp = spacy.load('en_core_web_sm')  # load the pretrained pipeline
    results = []

    for raw in data:
        text = raw.strip(' \n\t\r),\\')

        if len(text) < 3:
            continue
        if text[0].islower():
            continue
        # NOTE(review): plain substring match - it also rejects words like "example";
        # presumably intended for "&amp;" residue - confirm.
        if "amp" in text:
            continue

        # Verb/noun lemmas of the lower-cased text, at least 3 characters long and
        # free of '-' and '.'.
        lemmas = [
            str(tok.lemma_)
            for tok in nlp(text.lower())
            if tok.pos_ in wanted_pos
            and len(str(tok.lemma_)) >= 3
            and '-' not in str(tok.lemma_)
            and '.' not in str(tok.lemma_)
        ]
        unique = list(set(lemmas))

        if len(unique) < 4:
            continue
        if len(unique) > 5:
            unique = random.sample(unique, random.randint(4, 5))
        random.shuffle(unique)

        results.append({"content": '', "target": text, "keywords": '#'.join(unique)})

    return results

# Directory that holds the pre-formatted Wikipedia text files, and the output path.
json_path = '/home2/zhanghanqing/formatted_wikipedia'
out_path = os.path.join('./data/', 'wiki_sen.src')

#######################################wiki dataset########################################

print("reading data from %s ..." % json_path)

filenames = glob(os.path.join(json_path,'wiki**.format'))

# Keep only the last 50000 lines of every input file; each file is closed after reading.
data = []
for data_file in filenames:
    with Path(data_file).open() as fin:
        data += fin.readlines()[-50000:]

d_20 = 0  # number of kept lines with 26-32 whitespace-separated words

filter_data = []

for d in data:
    l_d = len(d.split())

    if l_d>25 and l_d<=32 and d_20<=100000:
        d_20 += 1
        filter_data.append(d)

    # Stop once the counter exceeds 100000.
    if d_20>100000:
        break

data = filter_data
print("data num is:", len(data))

random.shuffle(data)
# NOTE(review): substring match - this also removes lines containing words such as
# "example"; presumably intended for "&amp;" residue - confirm.
data = list(filter(lambda x: "amp" not in x ,data))
print("total data number is:", len(data))

# The lines still carry their trailing newline, so writelines() emits one line each.
# Writing inside a with-block guarantees the buffer is flushed and the file closed;
# without it the data may stay in the buffer and never reach disk.
with open(out_path, 'w', encoding='utf8') as out_file_train:
    out_file_train.writelines(data)

reading data from /home2/zhanghanqing/formatted_wikipedia ...
data num is: 100001
total data number is: 95038


19409

In [20]:
# Preview the first ten entries of `data`.
data[:10]

['Of particular note in the Queen Mary Bedroom are two chairs covered with needlework created by Albert, who was once the chairman of the Royal School of Needlework.\n',
 'Formerly known as the Patchwork Bedroom, the Ante Room was established by Charlotte, Countess Spencer and her sister during the Victorian period, and today forms part of a suite of state chambers.\n',
 "two are Grade II* listed, including the Stable Block and Gardener's House, Althorp, and the remainder have a Grade II designation, mainly garden screens, gates and gateways aside from the planting stones.\n",
 'The mustard-yellow Grade II* listed Stable Block, designed by architect Roger Morris with a Palladian influence, was ordered by Charles, Fifth Earl of Sutherland in the early 1730s.\n',
 "Morris designed the building with a clear Tuscan architectural design, drawing upon earlier inspiration from his stables at Inigo Jones's St Paul's Church in Covent Garden.\n",
 "Several rooms were built within the stable bloc

In [17]:
# %%
import os
import json
import spacy
import nltk
from nltk.tokenize import sent_tokenize
import random
from glob import glob
from pathlib import Path
from multiprocessing import Pool
from tqdm import tqdm
import pandas as pd


def encoder_data_target(data):
    """Turn multi-sentence examples into content/target/keywords records.

    Args:
        data: iterable of sequences of strings. All strings but the last are
            joined with single spaces to form the "content"; the last string
            is the "target".

    Returns:
        A list of dicts with keys "content", "target" and "keywords", where
        "keywords" is a '#'-joined string of lemmas taken from the target.

    An example is skipped when its target has fewer than 6 whitespace-separated
    words or yields fewer than 3 distinct keywords. When more than 5 keywords
    are found, 5 of them are sampled at random; the kept keywords are shuffled.
    The spaCy model is loaded on every call.
    """
    keep_pos = ('VERB', 'NOUN')
    nlp = spacy.load('en_core_web_sm')  # load the pretrained pipeline
    records = []

    for example in data:
        context_text = ' '.join(example[:-1])
        target = example[-1]

        if len(target.split()) < 6:
            continue

        # Tokenise, POS-tag and lemmatise the lower-cased target, keeping verb/noun
        # lemmas of at least 3 characters that contain neither '-' nor '.'.
        candidates = []
        for token in nlp(target.lower()):
            if token.pos_ not in keep_pos:
                continue
            lemma = str(token.lemma_)
            if len(lemma) < 3 or '-' in lemma or '.' in lemma:
                continue
            candidates.append(lemma)

        keywords = list(set(candidates))  # de-duplicate

        if len(keywords) < 3:
            continue
        if len(keywords) > 5:
            keywords = random.sample(keywords, 5)
        random.shuffle(keywords)

        records.append({
            "content": context_text,
            "target": target,
            "keywords": '#'.join(keywords),
        })

    return records



def encoder_data(data):
    """Split each line into a leading context and a trailing target with keywords.

    Args:
        data: iterable of strings, one text line per element.

    Returns:
        A list of dicts with keys "content", "target" and "keywords"
        (lemmas joined by '#').

    For every line a cut length between 8 and 25 words is drawn at random. The
    last ``cut`` whitespace-separated words form the target and the words
    before them the content (empty when the line has no more than ``cut``
    words). Lines containing the substring "amp", or whose target yields fewer
    than 3 distinct keywords, are skipped. If more than 5 keywords are found,
    a random 3 to 5 of them are kept. The kept keywords are shuffled.
    The spaCy model is loaded on every call.
    """
    keep_pos = ('VERB', 'NOUN')
    nlp = spacy.load('en_core_web_sm')  # load the pretrained pipeline
    records = []

    for raw_line in data:
        # Drawn before any filtering, so every input line consumes one random number.
        cut = random.randint(8, 25)
        stripped = raw_line.strip(' \n\t\r),\\')

        # NOTE(review): plain substring match - it also rejects words like "example";
        # presumably intended for "&amp;" residue - confirm.
        if "amp" in stripped:
            continue

        words = stripped.split()
        context_text = ' '.join(words[:-cut])
        target = ' '.join(words[-cut:])

        # Tokenise, POS-tag and lemmatise the lower-cased target, keeping verb/noun
        # lemmas of at least 3 characters that contain neither '-' nor '.'.
        candidates = []
        for token in nlp(target.lower()):
            if token.pos_ not in keep_pos:
                continue
            lemma = str(token.lemma_)
            if len(lemma) < 3 or '-' in lemma or '.' in lemma:
                continue
            candidates.append(lemma)

        keywords = list(set(candidates))  # de-duplicate

        if len(keywords) < 3:
            continue
        if len(keywords) > 5:
            keywords = random.sample(keywords, random.randint(3, 5))
        random.shuffle(keywords)

        records.append({
            "content": context_text,
            "target": target,
            "keywords": '#'.join(keywords),
        })

    return records

# Directory that holds the pre-formatted Wikipedia text files.
json_path = '/home2/zhanghanqing/formatted_wikipedia'

# Output files; they are left open here because the records are written later in this cell.
out_file_train = os.path.join('./data/', 'wiki_train.json')
out_file_train = open(out_file_train, 'w', encoding='utf8')


out_file_val = os.path.join('./data/', 'wiki_val.json')
out_file_val = open(out_file_val, 'w', encoding='utf8')


#######################################wiki dataset########################################

print("reading data from %s ..." % json_path)

filenames = glob(os.path.join(json_path,'wiki**.format'))

# Keep only the last 3000000 lines of every input file. The context manager closes each
# file right after it has been read instead of leaving the handle open.
data =  []

for data_file in filenames:
    with Path(data_file).open() as fin:
        data += fin.readlines()[-3000000:]
random.shuffle(data)

# Per-length-bucket counters (bucket = number of whitespace-separated words in a line).
d_0_50 = 0   # fewer than 50 words, capped once the counter exceeds 80000
d_50_90 = 0  # 50-89 words, capped once the counter exceeds 40000
d_90 = 0     # 90 words or more, capped once the counter exceeds 10000


filter_data = []

for d in data:
    l_d = len(d.split())

    # Stop scanning once every bucket is full.
    if d_0_50>80000 and d_50_90>40000 and d_90>10000:
        break

    if l_d<50 and d_0_50<=80000:
        d_0_50 +=1
        filter_data.append(d)


    elif l_d>=50 and l_d<90 and d_50_90<=40000:
        d_50_90 +=1
        filter_data.append(d)

    elif l_d>=90 and d_90<=10000:
        d_90+=1
        filter_data.append(d)

print(d_0_50,d_50_90, d_90)

data = filter_data
print("data num is:", len(data))

# Split the lines into chunks for the worker pool. The chunk size is at least 1 so that
# fewer than 48 lines (or none) do not produce a zero step for range().
n = max(1, len(data) // 48)
data_list =[data[i:i + n] for i in range(0, len(data), n)]

with Pool(48) as p:

    data =list(tqdm(p.imap(encoder_data, data_list), total=len(data_list)))

# Flatten the per-chunk result lists into one list of records.
record = [item for subl in data for item in subl]

print("the record is:",len(record))

#######################################roc dataset########################################
# Build one example per CSV row: a random 3 to 5 consecutive fields starting at tuple
# position 3 of the itertuples() row (position 0 is the index).
# NOTE(review): presumably positions 3.. hold the story sentences - confirm against roc.csv.
roc = pd.read_csv("../data/roc/roc.csv")
data = []
for row in roc.itertuples():
    cut_pos = random.randint(3,5)
    data.append([row[3 + offset] for offset in range(cut_pos)])

random.shuffle(data)

# Chunk the examples for the worker pool and extract keywords in parallel.
n = len(data) // 48
data_list = [data[start:start + n] for start in range(0, len(data), n)]

with Pool(48) as p:
    data = list(tqdm(p.imap(encoder_data_target, data_list), total=len(data_list)))

# Append the flattened per-chunk results to the records collected so far.
for subl in data:
    record.extend(subl)

#######################################hc dataset########################################
# data_file = "../data/hc/train.src"
# data =[eval(x.strip(' \n\t\r),\\')) for x in Path(data_file).open().readlines()]

# data_file = "../data/hc/valid.src"
# data +=[eval(x.strip(' \n\t\r),\\')) for x in Path(data_file).open().readlines()]

# print("data num is:", len(data))

# n = int(len(data)/48)
# data_list =[data[i:i + n] for i in range(0, len(data), n)]

# with Pool(48) as p:

#     data =list(tqdm(p.imap(encoder_data, data_list), total=len(data_list)))

# record += [item for subl in data for item in subl]

#######################################commonsense dataset########################################


# json_path = "../data/commongen.train.jsonl"
# with open(json_path) as out:
#     lines = out.readlines()

#     for l in tqdm(lines):
#         item = json.loads(l.strip())
#         concept_set = item['concept_set']
#         for c in item['scene']:
#             c = c.strip()
#         record.append({
#                         "content":"",
#                         "target":c,
#                         "keywords":concept_set})

#######################################end########################################


random.shuffle(record)

print("total data number is:", len(record))

# 95% train / 5% validation, cut at a single shared index.
split_at = int(len(record)*0.95)

out_file_train.write(json.dumps(record[:split_at])+'\n')
out_file_train.close()  # close() also flushes, and releases the handle

out_file_val.write(json.dumps(record[split_at:])+'\n')
out_file_val.close()

reading data from /home2/zhanghanqing/formatted_wikipedia ...
80001 40001 10001
data num is: 130003


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [00:48<00:00,  1.01it/s]


the record is: 104569


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [00:16<00:00,  2.94it/s]


total data number is: 139128
