# 文本特征提取
## 什么是特征提取

特征提取是一个对文档进行降维的过程。首先明确几个概念：特征提取、特征选择（又称特征选取，Feature Selection）和特征抽取（Feature Extraction）。一般来说，特征提取实际上有两大类方法。一类称为特征选择，指的是从原有的特征（那许多有用无用混在一起的词汇）中提取出少量的、具有代表性的特征，但特征的类型没有变化（原来是一堆词，特征提取后仍是一堆词，只是数量大大减少了而已）。另一类称为特征抽取，它则有所不同：它从原有的特征中重构出新的特征（原来是一堆词，重构后变成了别的，例如LSI将其转为矩阵，文档生成模型将其转化为某个概率分布的一些参数），新的特征具有更强的代表性，并耗费更少的计算资源。

## 向量空间模型VSM

向量空间模型是由Salton等人于20世纪60年代末提出，是一种简便、高效的文本表示模型，其理论基础是代数学。向量空间模型把用户的查询要求和数据库文档信息表示成由检索项构成的向量空间中的点，通过计算向量之间的距离来判定文档和查询之间的相似程度。然后，根据相似程度排列查询结果。向量空间模型的关键在于特征向量的选取和特征向量的权值计算两个部分。

## sklearn

sklearn提供了两种向量化器来实现文本特征的提取：

- `CountVectorizer`：仅考虑每个词汇在文本中出现的次数（词频）。
- `TfidfVectorizer`：除了考虑词频之外，还考虑逆文档频率（IDF，即包含该词汇的文档所占比例的倒数再取对数），可以理解为抑制那些在每个样本中都经常出现的词汇。

## 文本预处理

对文本进行预处理，以方便对文本的分析和特征的提取

In [None]:
import re
# Sample text mixing English, digits, ASCII punctuation, Chinese and
# full-width punctuation.
content = 'The 123456 is my one phone number.今天很残酷，明天更残酷！'

# Keep only Chinese characters (U+4E00..U+9FA5).
chinese_only = re.sub(r'[^\u4e00-\u9fa5]', "", content)

# Keep Chinese characters, digits and the listed punctuation marks.
# The class used to start with `\0-9`, which `re` parses as the range
# NUL..'9' and therefore also kept spaces and most ASCII punctuation; it is
# now the plain digit range `0-9`. The blanks that separated the paired
# quotation marks/brackets were removed too, since they made the class keep
# spaces.
chinese_punct_digits = re.sub(
    r'[^0-9.\u4e00-\u9fa5,。？！，、；：“”‘’（）《》〈〉]', "", content)

# Keep Chinese characters, ASCII letters and digits (the ASCII comma is part
# of the class as well). `result` ends up holding this last value, exactly
# as before.
result = re.sub(r'[^\u4e00-\u9fa5,A-Za-z0-9]', "", content)

### 去停用词

In [None]:
def text_to_wordlist(text, stopword_path="stopword.txt"):
    """Segment the Chinese part of *text* with jieba and drop stop words.

    Args:
        text: Raw input string. Every character outside U+4E00..U+9FA5 is
            removed before segmentation.
        stopword_path: UTF-8 file holding whitespace-separated stop words.
            Defaults to ``stopword.txt`` in the current working directory
            (previously spelled ``".\\stopword.txt"``, which only resolved
            on Windows and relied on an invalid ``\\s`` escape).

    Returns:
        The segmented words, in order, that are not stop words.
    """
    import codecs  # local import: the surrounding cell never imports it

    chinese_only = re.sub(r'[^\u4e00-\u9fa5]', "", text)

    # The context manager closes the file even if reading fails.
    with codecs.open(stopword_path, "r", "utf-8") as stop_file:
        # A set gives constant-time membership tests for every token.
        stop_words = set(stop_file.read().split())

    # TODO: load a custom jieba dictionary to cover words jieba does not
    # know, which should improve segmentation accuracy.
    return [word for word in jieba.cut(chinese_only) if word not in stop_words]

## 计算TF-IDF

TF-IDF是一种统计方法，用以评估一字词对于一个文件集或一个语料库中的其中一份文件的重要程度。字词的重要性随着它在文件中出现的次数成正比增加，但同时会随着它在语料库中出现的频率成反比下降。TF-IDF加权的各种形式常被搜索引擎应用，作为文件与用户查询之间相关程度的度量或评级。除了TF-IDF以外，因特网上的搜索引擎还会使用基于链接分析的评级方法，以确定文件在搜寻结果中出现的顺序。
通过计算TF-IDF来做到对文本的特征值的提取

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
def cutWord():
    """Segment three fixed sample sentences with jieba.

    Returns:
        A 3-tuple of strings, one per sentence in the order listed below;
        each string is that sentence's tokens joined by single spaces.
    """
    sentences = (
        "今天很残酷，明天更残酷！",
        "生活不止眼前和苟且，还有诗与远方！",
        "心有多大，舞台就有多大！",
    )
    first, second, third = (
        ' '.join(jieba.cut(sentence)) for sentence in sentences
    )
    return first, second, third
def tfidfvect():
    """Fit a TF-IDF model on the sample sentences and print the result.

    Prints the learned vocabulary as a list, followed by the dense
    document-term matrix (one row per sentence returned by ``cutWord``).
    """
    tf = TfidfVectorizer()
    c1, c2, c3 = cutWord()
    data = tf.fit_transform([c1, c2, c3])

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older releases.
    # tolist() keeps the printed form a plain list of str.
    if hasattr(tf, "get_feature_names_out"):
        feature_names = tf.get_feature_names_out().tolist()
    else:
        feature_names = tf.get_feature_names()

    # NOTE(review): single-character words do not show up in the vocabulary;
    # this looks like the vectorizer's default token pattern (2+ characters)
    # - confirm against the scikit-learn documentation.
    print(feature_names)
    print(data.toarray())
# Run the demo: prints the vocabulary and the TF-IDF matrix shown below.
tfidfvect()

['不止', '今天', '心有', '明天', '残酷', '生活', '眼前', '舞台', '苟且', '还有', '远方']
[[0.         0.40824829 0.         0.40824829 0.81649658 0.
  0.         0.         0.         0.         0.        ]
 [0.40824829 0.         0.         0.         0.         0.40824829
  0.40824829 0.         0.40824829 0.40824829 0.40824829]
 [0.         0.         0.70710678 0.         0.         0.
  0.         0.70710678 0.         0.         0.        ]]


In [4]:
# coding=utf-8
from __future__ import division

import collections
import os

import config
from codecs import open
# Input/output locations, all supplied by the project-local `config` module.
stopwords_path = config.stopwords_path
segmented_path = config.segmented_path  # default corpus directory of analysis_word()
sentence_symbol_path = config.sentence_symbol_path  # read by analysis_word() to count sentences
test_path = config.test_path
raw_path = config.raw_path
result_path = config.result_path  # directory analysis_word() writes its reports into


# (tag, Chinese label, match-anywhere flag). The label is used both for the
# summary header ("<label>数", "<label>数占比") and for the section titles of
# the per-file report. Tokens are matched with ``endswith(tag)`` unless the
# flag is set, in which case ``tag in token`` is used (person names).
_POS_CATEGORIES = (
    ('/n', "名词", False),
    ('/v', "动词", False),
    ('/a', "形容词", False),
    ('/m', "数词", False),
    ('/r', "代词", False),
    ('/q', "量词", False),
    ('/d', "副词", False),
    ('/p', "介词", False),
    ('/c', "连词", False),
    ('/u', "助词", False),
    ('/e', "叹词", False),
    ('/o', "拟声词", False),
    ('/w', "标点", False),
    ('/nh', "人名", True),
)

# Summary columns that follow the per-category columns.
_SUMMARY_TAIL_COLUMNS = (
    "词总数", "字总数", "平均词长", "一字词数", "一字词占比",
    "二字词数", "二字词占比", "三字词数", "三字词占比", "四字词数",
    "四字词占比", "段落数", "段落平均字数",
    "句子数（长句）", "句子平均字数（长句）", "句子数（短句）", "句子平均字数（短句）",
)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def _header_line():
    """Build the tab-separated header of the summary file (ends with tab + newline)."""
    columns = ["文件名"]
    for _tag, label, _anywhere in _POS_CATEGORIES:
        columns.append(label + "数")
        columns.append(label + "数占比")
    columns.extend(_SUMMARY_TAIL_COLUMNS)
    return "\t".join(columns) + "\t\n"


def _count_and_share(count, total):
    """Return the cells [count, count / total], or ["0", "0"] when count is 0/None."""
    if count:
        return [str(count), str(_safe_ratio(count, total))]
    return ["0", "0"]


def _count_and_average(count, num_char):
    """Return the cells [count, num_char / count], or ["0", "0"] when count is 0."""
    if count > 0:
        return [str(count), str(float(num_char) / count)]
    return ["0", "0"]


def _analyse_file(file_path, corpus_path, result_file, long_symbols, all_symbols):
    """Write the per-file word report and append one summary row to *result_file*."""
    # Strip directory and extension portably (the old '/'-split plus [:-4]
    # broke on Windows separators and on names without a 3-letter extension).
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    with open(file_path, encoding='utf-8') as source:
        content = source.read()

    num_paragraph = content.count('\n')  # paragraphs are counted as newlines
    text_list = content.split()          # whitespace-separated "word/tag" tokens
    num_word = len(text_list)
    print('paragraphs:', num_paragraph)
    print('num_word:', num_word)
    print(" ", corpus_path, " text_list:", num_word)

    # Diagnostic output kept from the original: noun count, noun frequencies
    # and the two most frequent nouns.
    n_list = [w for w in text_list if w.endswith('/n')]
    print(len(n_list))
    freq = collections.Counter(n_list)
    print(freq)
    print(freq.most_common(2))

    row = [base_name]
    detail_path = result_path + '/' + base_name + "_word_result.txt"
    with open(detail_path, "w+", encoding='utf-8') as f_word_result:
        for index, (tag, label, anywhere) in enumerate(_POS_CATEGORIES):
            if anywhere:
                tagged = sorted(w for w in text_list if tag in w)
            else:
                tagged = sorted(w for w in text_list if w.endswith(tag))
            print(len(tagged))
            row.append(str(len(tagged)))
            # _safe_ratio avoids ZeroDivisionError for an empty input file.
            row.append(str(_safe_ratio(len(tagged), num_word)))
            # Sections after the first are separated by a blank line.
            f_word_result.write(("" if index == 0 else "\n\n") + label + ":" + "\n")
            # Sorting first makes ties in most_common() come out alphabetically.
            write_word(f_word_result, collections.Counter(tagged).most_common(100))

    words = [w.split('/')[0] for w in text_list]
    word_lengths = [len(word) for word in words]
    num_sentence_long = sum(1 for word in words if word in long_symbols)
    num_sentence_short = sum(1 for word in words if word in all_symbols)

    num_char = sum(word_lengths)
    average_word_len = _safe_ratio(num_char, num_word)
    print("word average length: ", str(average_word_len))
    row.extend([str(num_word), str(num_char), str(average_word_len)])

    # Number and share of words that are 1, 2, 3 and 4 characters long.
    len_counts = collections.Counter(word_lengths)
    for length in (1, 2, 3, 4):
        row.extend(_count_and_share(len_counts.get(length), num_word))

    row.extend(_count_and_average(num_paragraph, num_char))
    row.extend(_count_and_average(num_sentence_long, num_char))
    row.extend(_count_and_average(num_sentence_short, num_char))

    # Every cell, including the last one, is followed by a tab.
    result_file.write("\t".join(row) + "\t\n")


def analysis_word(path=segmented_path):
    """Compute part-of-speech and length statistics for segmented text files.

    Every file returned by ``get_files(path)`` is read as whitespace-separated
    ``word/tag`` tokens. For each file, the 100 most frequent words of every
    category in ``_POS_CATEGORIES`` are written to
    ``<result_path>/<name>_word_result.txt``, and one tab-separated row
    (category counts and shares, word/character totals, word-length
    distribution, paragraph and sentence averages) is appended to
    ``<result_path>/word_result.txt``.

    Args:
        path: Directory holding the segmented files; defaults to
            ``segmented_path``.
    """
    result_file_path = result_path + '/word_result.txt'
    # Guard against a None result so a missing directory yields a
    # header-only report instead of a TypeError.
    files = get_files(path) or []

    # Sentence delimiters are loaded once instead of once per input file.
    # The first six entries mark "long" sentences, the full list "short" ones.
    sentence_symbol = []
    if files:
        with open(sentence_symbol_path, encoding='utf-8') as symbol_file:
            sentence_symbol = symbol_file.read().split()
    long_symbols = set(sentence_symbol[:6])
    all_symbols = set(sentence_symbol)

    # The context manager closes the summary file even if a file fails.
    with open(result_file_path, "w+", encoding='utf-8') as result_file:
        result_file.write(_header_line())
        for f in files:
            _analyse_file(f, path, result_file, long_symbols, all_symbols)


def get_files(path, recursive=False):
    """Collect the paths of the files found under *path*.

    Args:
        path: Directory to scan.
        recursive: When False (the default, matching the historical
            behaviour) only files directly inside *path* are returned; when
            True, files in all sub-directories are included as well.

    Returns:
        A list of ``os.path.join(parent, filename)`` paths. The list is
        empty when *path* does not exist or contains no files (previously
        ``None`` was returned for a missing directory).
    """
    files = []
    # os.walk yields (parent directory, sub-directory names, file names).
    for parent, _dirnames, filenames in os.walk(path):
        files.extend(os.path.join(parent, filename) for filename in filenames)
        if not recursive:
            # The walk is top-down, so the first tuple is *path* itself.
            break
    return files


def write_word(f_word_result, tops):
    """Write one ``word<TAB>count`` line per entry of *tops*.

    Args:
        f_word_result: Writable text file object.
        tops: Iterable of ``(token, count)`` pairs; everything from the first
            ``/`` of the token onwards is dropped before writing.
    """
    for entry in tops:
        bare_word = entry[0].split('/')[0]
        occurrences = str(entry[1])
        f_word_result.write(bare_word + "\t" + occurrences + "\n")


# Script entry point: run the analysis on the default corpus directory.
if __name__ == '__main__':
    analysis_word()

paragraphs: 637
num_word: 44953
  data/segmented  text_list: 44953
5348
Counter({'人/n': 227, '底/n': 88, '奴隶/n': 85, '生活/n': 83, '朋友/n': 80, '心/n': 78, '时候/n': 72, '父亲/n': 65, '事/n': 63, '东西/n': 55, '母亲/n': 54, '声音/n': 46, '话/n': 46, '手/n': 46, '船/n': 43, '地方/n': 41, '狗/n': 41, '眼睛/n': 40, '信/n': 39, '世界/n': 38, '眼泪/n': 38, '祖父/n': 38, '头/n': 37, '句/n': 34, '事情/n': 33, '脸/n': 32, '钱/n': 31, '主人/n': 28, '船夫/n': 27, '件/n': 27, '先生/n': 26, '人们/n': 26, '书/n': 25, '身子/n': 23, '青年/n': 23, '字/n': 22, '最后/n': 22, '眼光/n': 22, '道/n': 21, '缘故/n': 21, '生命/n': 21, '神/n': 21, '光明/n': 20, '大声/n': 19, '带/n': 19, '脑子/n': 19, '结果/n': 19, '学校/n': 19, '月光/n': 17, '勇气/n': 17, '样子/n': 16, '路/n': 16, '点/n': 16, '家/n': 16, '哥哥/n': 16, '时间/n': 15, '办法/n': 15, '心情/n': 14, '声/n': 14, '女人/n': 14, '身体/n': 14, '少女/n': 14, '故事/n': 14, '胸膛/n': 14, '鱼/n': 14, '思想/n': 13, '工作/n': 13, '妻/n': 13, '祖先/n': 13, '本/n': 13, '消息/n': 13, '经历/n': 13, '贩子/n': 13, '弟弟/n': 13, '钟/n': 12, '机会/n': 12, '意思/n': 12, '问题/n': 12, '门/n': 12

2940
Counter({'人/n': 108, '友/n': 45, '教士/n': 45, '事情/n': 39, '女人/n': 38, '姊/n': 29, '时候/n': 27, '钱/n': 27, '兵/n': 25, '眼睛/n': 23, '地方/n': 22, '话/n': 22, '声音/n': 22, '长/n': 21, '身子/n': 18, '东西/n': 18, '带/n': 18, '卢布/n': 17, '修士/n': 16, '好友/n': 15, '件/n': 15, '礼拜堂/n': 15, '联队/n': 15, '头/n': 14, '保姆/n': 14, '总督/n': 14, '圣/n': 14, '亲爱的/n': 13, '工作/n': 13, '手/n': 13, '笔/n': 13, '病人/n': 13, '小姐/n': 12, '土地/n': 12, '间/n': 12, '囚犯/n': 12, '矿坑/n': 12, '堡垒/n': 11, '皇后/n': 11, '儿/n': 11, '农民/n': 11, '犯人/n': 11, '男人/n': 10, '血/n': 10, '瓦/n': 10, '阳台/n': 10, '修道院/n': 10, '心/n': 9, '罪/n': 9, '审判官/n': 9, '生活/n': 9, '衣服/n': 9, '丈夫/n': 9, '脸/n': 9, '信/n': 8, '狗/n': 8, '村庄/n': 8, '点/n': 8, '母亲/n': 8, '学校/n': 8, '一对/n': 8, '事/n': 8, '马车/n': 8, '声/n': 8, '身体/n': 8, '党/n': 7, '叔父/n': 7, '花园/n': 7, '花/n': 7, '根/n': 7, '阳光/n': 7, '监房/n': 7, '朋友/n': 7, '妻子/n': 7, '眼泪/n': 7, '理想/n': 7, '父亲/n': 7, '思想/n': 7, '外国人/n': 7, '军队/n': 7, '烧酒/n': 7, '科/n': 7, '皮鞭/n': 7, '伤痕/n': 7, '名字/n': 7, '船/n': 7, '贵族/n': 6, '时间/n'

In [10]:
# coding:utf-8
"""
@author:XuMing
"""

from gensim import corpora, models
import config
import jieba
import jieba.analyse
import train
from codecs import open

# Locations supplied by the project-local `config` module.
stopwords_path = config.stopwords_path  # stop-word list main() passes to extract_theme()
segmented_path = config.segmented_path
test_path = config.test_path
raw_path = config.raw_path  # directory main() scans for input files
result_path = config.result_path  # directory main() writes theme_result.txt into
topic_num = 20  # maximum number of distinct topic words main() writes per file


def get_stopwords_set(file_name):
    """Load stop words from a UTF-8 text file with one entry per line.

    Returns:
        A set holding every line with surrounding whitespace stripped.
    """
    stop_words = set()
    with open(file_name, 'r', encoding='utf-8') as stop_file:
        for raw_line in stop_file:
            stop_words.add(raw_line.strip())
    return stop_words


def get_words_list(file_name, stop_word_file):
    """Segment a UTF-8 text file line by line and drop stop words.

    Args:
        file_name: Text file to segment.
        stop_word_file: Stop-word file handed to ``get_stopwords_set``.

    Returns:
        A list with one entry per line of *file_name*; each entry is the list
        of jieba tokens (precise mode, ``cut_all=False``) of the stripped
        line that are not stop words.
    """
    stop_words = get_stopwords_set(stop_word_file)
    with open(file_name, 'r', encoding='utf-8') as source:
        return [
            [token
             for token in jieba.cut(raw_line.strip(), cut_all=False)
             if token not in stop_words]
            for raw_line in source
        ]


def extract_theme(raw_file, stop_word_file, num_topics=10):
    """Train an LDA model on *raw_file* and return one word per topic.

    Args:
        raw_file: Text file; each line is treated as one document.
        stop_word_file: Stop-word file used while tokenising.
        num_topics: Number of LDA topics to train and to report.

    Returns:
        A list with one element per entry returned by ``show_topics``
        (requested with ``num_words=1``), taken as ``entry[1][0][0]``.
    """
    # One token list per input line, stop words already removed.
    documents = get_words_list(raw_file, stop_word_file)
    # Dictionary mapping every distinct token to an integer id.
    vocabulary = corpora.Dictionary(documents)
    # Bag-of-words representation of every document.
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in documents]
    lda = models.ldamodel.LdaModel(
        corpus=bow_corpus, id2word=vocabulary, num_topics=num_topics, alpha='auto')
    shown_topics = lda.show_topics(
        num_topics=num_topics, num_words=1, formatted=False)
    # presumably each entry is (topic_id, [(word, weight), ...]) - confirm
    # against the gensim documentation.
    return [topic[1][0][0] for topic in shown_topics]


def main():
    """Extract topic words for every raw file and save them in one report.

    For each file returned by ``train.get_files(raw_path)`` an LDA model with
    100 topics is trained via ``extract_theme``; up to ``topic_num`` distinct
    topic words are written under the file's base name in
    ``<result_path>/theme_result.txt``.
    """
    import os  # local import: this script does not import os at module level

    # Guard against a None result so a missing directory yields a
    # header-only report.
    files = train.get_files(raw_path) or []
    file_name = result_path + "/theme_result.txt"
    # The context manager closes the report even if extraction fails.
    with open(file_name, "w+", encoding='utf-8') as f_word_result:
        f_word_result.write("主题词提取" + "\n")
        for f in files:
            # Portable base name; splitting on "\\" only worked for Windows
            # paths and [:-4] assumed a three-letter extension.
            title = os.path.splitext(os.path.basename(f))[0]
            f_word_result.write('\n' + title + ":\n")
            topics = extract_theme(f, stopwords_path, 100)
            topic_list = []
            for t in topics:
                if len(topic_list) >= topic_num:
                    break  # enough distinct words collected for this file
                if t not in topic_list:
                    topic_list.append(t)
                    f_word_result.write(t + '\n')
            print(f + ' save to: ' + file_name + " ok.")


# Script entry point: build the topic-word report.
if __name__ == '__main__':
    main()
# Print the generated report. The ``with`` block closes the handle after
# reading (it used to stay open); the name ``file`` is kept unchanged.
with open("data/result/theme_result.txt", "r", encoding='UTF-8') as file:
    print(file.read())

data/raw\A-光明.txt save to: data/result/theme_result.txt ok.
data/raw\A-利娜.txt save to: data/result/theme_result.txt ok.
主题词提取

A-光明:
说
苏堤
一定
父亲
奴隶
知道
船夫
夜深
吃
生活
我要
人们
里
做
教育
罢
一天
放在
已经
看见

A-利娜:
波利
想
听
饥饿
知道
波兰人
封信
爱
里
房里
罪
说
女人
教士
几个
利娜
囚犯
声音
收入
德国人

