In [74]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import brown
from text_chunker import TextChunker

In [75]:
def chunk_text(text, N):
    """
    将文本分块
    :param text: 待分块的文本
    :param N: 块的大小
    :return: 块的列表
    """
    input_words = text.split(' ')
    output = []
    curr_chunks = []
    count = 0
    for word in input_words:
        count += 1
        if count == N:
            output.append(' '.join(curr_chunks))
            curr_chunks = []
            count = 0
        curr_chunks.append(word)
    if curr_chunks:
        output.append(' '.join(curr_chunks))
    
    return output

In [76]:
words = brown.words()[:5600]
input_data = ' '.join(words)
chunk_size = 900
chunker = TextChunker(maxlen=chunk_size)
text_chunks = chunk_text(input_data,chunk_size)
chunks = []
corpos = []
for i, chunk in enumerate(text_chunks):
    chunks.append({'index': i, 'text': chunk})
    corpos.append(chunk)
    print(f"Chunk {i+1}: {chunk[:100]}")


Chunk 1: The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produ
Chunk 2: Rd. Aj . Henry L. Bowden was listed on the petition as the mayor's attorney . Hartsfield has been ma
Chunk 3: House in a privilege resolution to `` endorse increased federal support for public education , provi
Chunk 4: to provide special schooling for more deaf students in the scholastic age at a reduced cost to the s
Chunk 5: that a water development bill passed by the Texas House of Representatives was an effort by big citi
Chunk 6: preserving family unity . Research projects as soon as possible on the causes and prevention of depe
Chunk 7: dismiss them . Washington , Feb. 9 -- President Kennedy today proposed a mammoth new medical care pr


In [77]:
vecotorizer = CountVectorizer(min_df=7,max_df=18)
X = vecotorizer.fit_transform(corpos)
print(X.shape)
vecabulary = vecotorizer.get_feature_names_out()
Arr = X.toarray().T
m,n = Arr.shape

(7, 18)


In [79]:
stemmer_names = ["Chunk " + str(i) for i in range(1,8)]

formatted_text = '{:>10}' * (len(stemmer_names) + 1)
print('\n',formatted_text.format('Words', *stemmer_names),'\n','*' * 80)

for i in range(m):
    word = vecabulary[i]
    counts = Arr[i].tolist()
    counts = [str(count) for count in counts]
    print('\n',formatted_text.format(word, *counts))


      Words   Chunk 1   Chunk 2   Chunk 3   Chunk 4   Chunk 5   Chunk 6   Chunk 7 
 ********************************************************************************

        and        27         5        15         8        16        17         6

        are         2         2         1         1         4         1         1

         as         6         4         4         2         9         3         3

         be         6        11         5        10         2         3         2

         by         3         5         4        10        10         7         3

        for         9        12         5        13         7         5         2

        his         4         5         5         2         2         5         1

         in        15        16        12        14        19        20         3

         is         3         7         5         1         7         5         1

         it         9         6        12         6         1         3         2

   