###   文本特征
#### 词袋模型

In [32]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import nltk
import re
import warnings
warnings.filterwarnings("ignore")

In [39]:
words = [
    'Today is beautiful day day',
    'Love this day is beautiful',
    'This is test code',
    'The day is very beautiful today',
    'The cat is beautiful',
    'The dog is lazy',
]
labels = ['weather','weather','code','weather','animal','animal']
words = np.array(words)
words_df = pd.DataFrame({'Document':words,
                         'Labels':labels})
words_df

Unnamed: 0,Document,Labels
0,Today is beautiful day day,weather
1,Love this day is beautiful,weather
2,This is test code,code
3,The day is very beautiful today,weather
4,The cat is beautiful,animal
5,The dog is lazy,animal


In [40]:
# 加载停用词
wpt = nltk.WhitespaceTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

# 数据清洗
def clean_words(doc):
    docs = []
    for strs in doc:
        # 去掉特殊字符
        doc = re.sub(r'[^a-zA-Z0-9\s]','',strs,re.I)
        # 转换为小写
        doc = doc.lower()
        doc = doc.strip()
        # 分词
        tokens = wpt.tokenize(strs)
        # 停用词
        filter_words = [token for token in tokens if token not in stop_words]
        
        # 重新合成文章
        doc = ' '.join(filter_words)
        docs.append(doc)
    return np.array(docs)

In [41]:
norm_words = clean_words(words)
norm_words

array(['Today beautiful day day', 'Love day beautiful', 'This test code',
       'The day beautiful today', 'The cat beautiful', 'The dog lazy'],
      dtype='<U23')

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0.,max_df=1.)
cv.fit(norm_words)
print(cv.get_feature_names())
cv_matrix = cv.fit_transform(norm_words)
cv_matrix = cv_matrix.toarray()
cv_matrix

['beautiful', 'cat', 'code', 'day', 'dog', 'lazy', 'love', 'test', 'the', 'this', 'today']


array([[1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [43]:
vocab = cv.get_feature_names()
'Today beautiful day'
pd.DataFrame(cv_matrix,columns=vocab)

Unnamed: 0,beautiful,cat,code,day,dog,lazy,love,test,the,this,today
0,1,0,0,2,0,0,0,0,0,0,1
1,1,0,0,1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0,1,0,1,0
3,1,0,0,1,0,0,0,0,1,0,1
4,1,1,0,0,0,0,0,0,1,0,0
5,0,0,0,0,1,1,0,0,1,0,0


### TF-IDF 
TF-IDF = 词频（TF） × 逆文档频率
词频（TF） = 某个词再文章中出现的次数/文章的总词数
逆文档频率（IDF） = log(语料库的文档总数/包含该词的文档总数+1)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0.,max_df=1.,use_idf=True)
tv_matrix = tv.fit_transform(norm_words)
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix,2),columns=vocab)

Unnamed: 0,beautiful,cat,code,day,dog,lazy,love,test,the,this,today
0,0.35,0.0,0.0,0.81,0.0,0.0,0.0,0.0,0.0,0.0,0.48
1,0.44,0.0,0.0,0.51,0.0,0.0,0.74,0.0,0.0,0.0,0.0
2,0.0,0.0,0.58,0.0,0.0,0.0,0.0,0.58,0.0,0.58,0.0
3,0.42,0.0,0.0,0.49,0.0,0.0,0.0,0.0,0.49,0.0,0.58
4,0.44,0.74,0.0,0.0,0.0,0.0,0.0,0.0,0.51,0.0,0.0
5,0.0,0.0,0.0,0.0,0.64,0.64,0.0,0.0,0.44,0.0,0.0
