In [3]:
# Display the value of every top-level expression in a cell, not just the last one.
%config ZMQInteractiveShell.ast_node_interactivity = "all"
# Toggle IPython pretty-printing (the recorded output below shows it being turned OFF).
%pprint

Pretty printing has been turned OFF


In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Read the tab-separated training file and preview its first rows.
train_df = pd.read_csv(
    "../data/train_set.csv",
    sep="\t",
    index_col=False,
)
train_df.head()

Unnamed: 0,label,text
0,2,2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1,11,4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2,3,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3,2,7159 948 4866 2109 5520 2490 211 3956 5520 549...
4,3,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...


In [6]:
# Space-delimited tokenisation of the text column, grouped by class label
class SegmentVocab(object):
    """
    Tokenise a labelled text dataset and build per-class token collections.

    Parameters
    ----------
    data :
        Table with a 'label' column and a 'text' column. It must support
        ``data['label'].unique()`` and ``data.loc[mask, 'text']`` (the
        notebook passes a pandas DataFrame).
    min_seq :
        Frequency threshold. When ``min_seq > 0`` only tokens whose corpus-wide
        count is strictly greater than ``min_seq`` are kept; otherwise no
        filtering is done.

    Attributes
    ----------
    data :
        The dataset passed in.
    class_sentence_seg_dict : dict
        ``{label: [[tok, tok, ...], [tok, ...], ...]}`` - one token list per
        sentence, grouped by label.
    vocab_dict : dict
        ``{token: count}`` over the whole dataset. Counts are taken before
        low-frequency filtering, so filtered-out tokens still appear here.
    class_seg_dict : dict
        ``{label: [tok, tok, ...]}`` - all tokens of a label in one flat list.
    common_vocab : set
        Tokens that occur in every label (low discriminative power).
    """

    def __init__(self, data, min_seq):
        self.data = data
        (self.class_sentence_seg_dict,
         self.vocab_dict,
         self.class_seg_dict) = self.class_segment(min_seq)
        self.common_vocab = self.get_common_vocab()

    def class_segment(self, min_seq):
        """
        Split every sentence on single spaces, grouped by label, and
        optionally drop low-frequency tokens.

        Parameters
        ----------
        min_seq :
            Frequency threshold; see the class docstring.

        Returns
        -------
        tuple
            ``(class_sentence_seg_dict, vocab_dict, class_seg_dict)`` as
            described in the class docstring.
        """
        # Collect the raw sentences of each distinct label.
        categories = self.data['label'].unique()
        vocab = {}
        for category in categories:
            mask = self.data['label'] == category
            vocab[category] = self.data.loc[mask, 'text'].tolist()

        print("segmentation start...")
        # vocab_lcut:     {label: [[tok, ...], [tok, ...]]}
        # vocab_lcut_sum: {label: [tok, tok, ...]}
        # words_dict:     {token: corpus-wide count}
        vocab_lcut, vocab_lcut_sum, words_dict = {}, {}, {}
        for key, value in tqdm(vocab.items()):
            sentence_tokens, flat_tokens = [], []
            for line in value:
                try:
                    segs = line.split(" ")
                except (AttributeError, TypeError):
                    # Best-effort: a non-string cell (e.g. a missing value)
                    # cannot be split; report it and move on.
                    print(line)
                    continue
                sentence_tokens.append(segs)
                # Count every token; used for filtering and returned as the vocabulary.
                for seg in segs:
                    words_dict[seg] = words_dict.get(seg, 0) + 1
                flat_tokens.extend(segs)
            vocab_lcut[key] = sentence_tokens
            vocab_lcut_sum[key] = flat_tokens
        print("segmentation finish...\n")

        # Drop low-frequency tokens.
        if min_seq > 0:
            print("filter low-tf vocab start...")
            high_tf_dict, high_tf_sum_dict = {}, {}
            for key, value in tqdm(vocab_lcut.items()):
                high_tf_list, high_tf_sum_list = [], []
                for line in value:
                    kept = [tok for tok in line if words_dict[tok] > min_seq]
                    # Sentences that lose all their tokens are dropped entirely.
                    if kept:
                        high_tf_list.append(kept)
                        high_tf_sum_list.extend(kept)
                high_tf_dict[key] = high_tf_list
                # Store the flat token list (not the dict itself) for this label.
                high_tf_sum_dict[key] = high_tf_sum_list

            vocab_lcut = high_tf_dict
            vocab_lcut_sum = high_tf_sum_dict
            print("filter low-tf vocab finish...\n")

        return vocab_lcut, words_dict, vocab_lcut_sum

    def get_common_vocab(self):
        """
        Return the set of tokens present in every label's token list.

        These tokens discriminate poorly between classes and are meant to be
        discarded when building features. Returns an empty set when there are
        no labels.
        """
        token_sets = [set(tokens) for tokens in self.class_seg_dict.values()]
        if not token_sets:
            return set()
        return set.intersection(*token_sets)

In [None]:
# Build the per-class vocabularies for the training set.
seg_vocab = SegmentVocab(data=train_df, min_seq=0)

segmentation start...


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))