In [1]:
# !usr/bin/env python
# -*- coding:utf-8 _*-
"""
@Author: Huiqiang Xie
@File: text_preprocess.py
@Time: 2021/3/31 22:14
"""
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 30 16:44:08 2020

@author: hx301
"""
import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--input-data-dir', default='F:/jupyter/semantic/Attack-DeepSC/data/txt/DeepSC.pkl', type=str)
parser.add_argument('--input-data-flickr-dir', default='F:/jupyter/semantic/Attack-DeepSC/data/txt/flick/flick30k.pkl', type=str)
parser.add_argument('--output-train-dir', default='txt/train_data.pkl', type=str)
parser.add_argument('--output-test-dir', default='txt/test_data.pkl', type=str)
parser.add_argument('--output-vocab', default='F:/jupyter/semantic/Attack-DeepSC/data/txt/combined_vocab.json', type=str)

SPECIAL_TOKENS = {
  '<PAD>': 0,
  '<START>': 1,
  '<END>': 2,
  '<UNK>': 3,
}

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add white space before !.?
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    # change to lower letter
    s = s.lower()
    return s

def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    cutted_lines = list()
    for line in cleaned:
        length = len(line.split())
        if length > MIN_LENGTH and length < MAX_LENGTH:
            line = [word for word in line.split()]
            cutted_lines.append(' '.join(line))
    return cutted_lines

def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)

def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    sentences = raw_data.strip().split('\n')
    raw_data_input = [normalize_string(data) for data in sentences]
    raw_data_input = cutted_data(raw_data_input)
    fop.close()

    return raw_data_input


def tokenize(s, delim=' ',  add_start_token=True, add_end_token=True,
             punct_to_keep=None, punct_to_remove=None):
    """
    Tokenize a sequence, converting a string s into a list of (string) tokens by
    splitting on the specified delimiter. Optionally keep or remove certain
    punctuation marks and add start and end tokens.
    """
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))

    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')

    tokens = s.split(delim)
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens


def build_vocab(sequences, token_to_idx = { }, min_token_count=1, delim=' ',
                punct_to_keep=None, punct_to_remove=None, ):
    token_to_count = {}

    for seq in sequences:
      seq_tokens = tokenize(seq, delim=delim, punct_to_keep=punct_to_keep,
                      punct_to_remove=punct_to_remove,
                      add_start_token=False, add_end_token=False)
      for token in seq_tokens:
        if token not in token_to_count:
          token_to_count[token] = 0
        token_to_count[token] += 1

    for token, count in sorted(token_to_count.items()):
      if count >= min_token_count:
        token_to_idx[token] = len(token_to_idx)

    return token_to_idx


def encode(seq_tokens, token_to_idx, allow_unk=False):
    seq_idx = []
    for token in seq_tokens:
      if token not in token_to_idx:
        if allow_unk:
          token = '<UNK>'
        else:
          raise KeyError('Token "%s" not in vocab' % token)
      seq_idx.append(token_to_idx[token])
    return seq_idx


def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
    tokens = []
    for idx in seq_idx:
      tokens.append(idx_to_token[idx])
      if stop_at_end and tokens[-1] == '<END>':
        break
    if delim is None:
      return tokens
    else:
      return delim.join(tokens)


In [2]:
args = parser.parse_known_args()[0]

print(args.input_data_dir)
sentences = []
#预处理
print('Preprocess Raw Text')
#提取欧洲会议数据集
with open(args.input_data_dir, "rb") as f:
    sentences = pickle.load(f)
#提取flickr数据集的内容并处理
with open(args.input_data_flickr_dir, "rb") as f:
    f_sentences = pickle.load(f)    
f_s = []
for seq in tqdm(f_sentences):
    words = tokenize(seq, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
    words = list(filter(lambda x : x != ",", words))#去掉逗号和空格
    words = list(filter(lambda x : x != "", words))
    dig = False
    error_list = []
    for i in range(len(words)):
        words[i] = words[i].lower()#改小写
        
    words = words[1:-1]#去掉start和end
    for i in range(len(words)):
        if not words[i].isidentifier():
            error_list.append(words[i])#要改的词存在列表里
            dig = True
    if dig:
        for e in error_list:
            words.remove(e)#去掉非字母
    ss = ' '.join(words)
    f_s.append(ss)


F:/jupyter/semantic/Attack-DeepSC/data/txt/DeepSC.pkl
Preprocess Raw Text


100%|██████████████████████████████████████████████████████████████████████| 158915/158915 [00:01<00:00, 141856.13it/s]


In [3]:
for s in f_s:
    sentences.append(s)
print(len(sentences))


257450


In [4]:
# remove the same sentences

a = {}
for set in sentences:
    if set not in a:
        a[set] = 0
    a[set] += 1
sentences = list(a.keys())
print('Number of sentences: {}'.format(len(sentences)))
#建立vocab（词表）
print('Build Vocab')
token_to_idx = build_vocab(
    sentences, SPECIAL_TOKENS,
    punct_to_keep=[';', ','], punct_to_remove=['?', '.']
)

vocab = {'token_to_idx': token_to_idx}
print('Number of words in Vocab: {}'.format(len(token_to_idx)))

# save the vocab
if args.output_vocab != '':
    with open(args.output_vocab, 'w') as f:
        json.dump(vocab, f)
"""
print('Start encoding txt')
results = []
count_len = []
for seq in tqdm(sentences):
    words = tokenize(seq, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])#tokenize令牌化，将句子中加入令牌<>。
    tokens = [token_to_idx[word] for word in words]
    count_len.append(len(tokens))
    results.append(tokens)


print('Writing Data')
train_data = results[: round(len(results) * 0.9)]
test_data = results[round(len(results) * 0.9):]

with open(args.output_train_dir, 'wb') as f:
    pickle.dump(train_data, f)
with open(args.output_test_dir, 'wb') as f:
    pickle.dump(test_data, f)
"""

Number of sentences: 231766
Build Vocab
Number of words in Vocab: 33163


"\nprint('Start encoding txt')\nresults = []\ncount_len = []\nfor seq in tqdm(sentences):\n    words = tokenize(seq, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])#tokenize令牌化，将句子中加入令牌<>。\n    tokens = [token_to_idx[word] for word in words]\n    count_len.append(len(tokens))\n    results.append(tokens)\n\n\nprint('Writing Data')\ntrain_data = results[: round(len(results) * 0.9)]\ntest_data = results[round(len(results) * 0.9):]\n\nwith open(args.output_train_dir, 'wb') as f:\n    pickle.dump(train_data, f)\nwith open(args.output_test_dir, 'wb') as f:\n    pickle.dump(test_data, f)\n"

In [6]:
for seq in tqdm(sentences[0:3]):
    words = tokenize(seq, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
    tokens = [token_to_idx[word] for word in words]
    print(tokens)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<?, ?it/s]

[1, 19903, 9520, 17299, 838, 13710, 6, 12737, 17412, 18249, 2]
[1, 19968, 10626, 640, 9883, 157, 21933, 19903, 15480, 19901, 21715, 9124, 732, 21078, 4, 2]
[1, 19895, 22135, 13088, 17861, 9628, 18052, 5902, 18477, 8605, 4, 9993, 10661, 10626, 15969, 9883, 10989, 21933, 19903, 15175, 19968, 9520, 9099, 732, 349, 4, 2]



