In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [8]:
finance_news = pd.read_excel('finance_data/SmoothNLP专栏资讯数据集样本10k.xlsx')
finance_news = finance_news.drop(columns = ['Unnamed: 0'])

In [9]:
finance_news.columns

Index(['title', 'content', 'pub_ts'], dtype='object')

In [10]:
finance_news['content'].head()

0    文/易北辰\n在新零售业态当中，无人货架启动和运营成本貌似最低，主要面向2亿白领人群的上班时...
1    \n\n\n\n导读：2018年是网易游戏发展进程当中最重要的一年，甚至比之2015年全面进...
2    \n \n　　文/于斌\n　　其实这句话是个伪命题。按理说，我们早已进入了“扫码取款”的时代...
3    \n      67岁的王石，终究还是回到了他熟悉的商业领域这一战场。在此前一段时间，王石似...
4    \n\n文 | 草原骑士\n来源 | 智能相对论（aixdlun）\n\n1954年，美国成...
Name: content, dtype: object

In [None]:
for index, content in enumerate(finance_news['content']):
    if '三个第一的背后：“移动搜索 应用商店”双核驱动模式' in content:
        print(content)
        print(index)
        break

In [None]:
finance_news['content'].iloc[9]

In [12]:
import jieba
jieba.enable_paddle()

Paddle enabled successfully......


In [13]:
tokens = jieba.cut("所以，整个过程有点像勘探。首先是有人发起，然后有千百人买了镐和铁锹跟进。有人会满载而归，但看走眼的人也会空手回来。"
                   ,cut_all=False)

In [14]:
from zhon.hanzi import punctuation as punctuation_zh
from string import punctuation
import collections
import re

In [18]:
def read_stop_words():
    stop_words = set()
    with open('words_utils/stopwords','r', encoding='UTF-8') as file:
        for word in file:
            word = word.strip()
            stop_words.add(word)
    return stop_words

In [19]:
stop_words = read_stop_words()

In [20]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
class JiebaTokenizer():
    def __init__(self, stop_words, punctuations, vocab = None):
        self.stop_words = stop_words
        self.punctuation = set()
        for punctuation in punctuations:
            self.punctuation.update(set(punctuation))
        self.vocab = self.init_vocab(vocab)
            
    def init_vocab(self, vocab):
        init_vocab = {'<pad>':0,
                 '<unk>':1,
                 'CLS':2,
                 'SEP':3,
                 'MASK':4}
        init_vocab = collections.OrderedDict(init_vocab)
        if vocab is not None:
            for word in vocab:
                init_vocab[word] = len(init_vocab)
        return init_vocab
    
    def tokenize(self, text):
        tokens = []
        for token in jieba.cut(text,cut_all=False):
            if token in self.stop_words:
                continue
            if token in self.punctuation:
                continue
            if token == ' ' or token == '\n' or token[0].isnumeric():
                continue
            if len(token) == 1 and token[0].isalpha():
                continue
            word = token.lower()
            tokens.append(word)
            self.vocab[word] = len(self.vocab)
        return tokens

In [22]:
tokenizer = JiebaTokenizer(stop_words, [punctuation, punctuation_zh])

In [140]:
def clean_text(content):
    content = re.sub('\xa0 ?', '', content)
    content = re.sub('\u3000+', '', content)
    content = content.replace(' \n','\n')
    content = re.sub('\n+', '\n', content)
    content = content.strip('\n')
    return content

In [141]:
def write_data_txt(finance_news):
    with open('finance_data/data.txt','w', encoding='UTF-8') as file:
        for index, row in finance_news.iterrows():
            content = row['content']
            if not isinstance(content, str):
                continue
            
            if len(content) == 0:
                continue
            
            content = clean_text(content)
            try:                    
                file.write(content)
                file.write('\n\n')
            except Exception as e:
                print(content, e)
                continue

In [142]:
write_data_txt(finance_news)

In [91]:
tokenizer = JiebaTokenizer(stop_words, [punctuation, punctuation_zh])

In [93]:
tokenizer.tokenize('这里藏匿着BAT之间最狠的撕逼、最野的巨头八卦、最不为人知的行业潜规则、各个职位上的奇人... ')

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.708 seconds.
Prefix dict has been built successfully.


['藏匿', 'bat', '之间', '最狠', '最野', '巨头', '不为人知', '行业', '潜规则', '职位', '奇人', '\xa0']

In [89]:
import json
from albertlib import albert_model, tokenization
from albertlib.create_pretraining_data import create_training_instances, write_instance_to_example_files
from albertlib.albert import AlbertConfig
from albertlib.input_pipeline import create_pretrain_dataset
import random
from absl import logging, flags
import tensorflow as tf

In [90]:
train_meta_data = {
    "max_seq_length": 50,
    "max_predictions_per_seq": 10
}

In [75]:
logging.set_verbosity(logging.INFO)
FLAGS = flags.FLAGS
FLAGS.meta_data_file_path = 'processed_data/train_meta_data'
FLAGS.input_file = 'finance_data/data.txt'
FLAGS.max_seq_length = train_meta_data['max_seq_length']
FLAGS.max_predictions_per_seq = train_meta_data['max_predictions_per_seq']
FLAGS.output_file = 'processed_data/train.tf_record'
FLAGS.dupe_factor = 40
FLAGS.masked_lm_prob = 0.15
FLAGS.short_seq_prob = 0.1
FLAGS.mark_as_parsed()

tokenizer = JiebaTokenizer(stop_words, [punctuation, punctuation_zh])

input_files = []
for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.io.gfile.glob(input_pattern))

logging.info("*** Reading from input files ***")
for input_file in input_files:
    logging.info("  %s", input_file)

rng = random.Random(FLAGS.random_seed)
instances = create_training_instances(
  input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
  FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
  rng)

logging.info("number of instances: %i", len(instances))

output_files = FLAGS.output_file.split(",")
logging.info("*** Writing to output files ***")
for output_file in output_files:
    logging.info("  %s", output_file)

write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                              FLAGS.max_predictions_per_seq, output_files)

INFO:absl:*** Reading from input files ***
INFO:absl:  finance_data/data.txt
  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 