# NLTK

## 1. NLTK基本操作

In [1]:
import nltk

  _nan_object_mask = _nan_object_array != _nan_object_array


In [None]:
# Fetch NLTK data packages (corpora, tokenizer models, ...).
# NOTE(review): called without arguments this presumably opens NLTK's
# interactive downloader rather than fetching a specific package — confirm
# against the NLTK documentation.
nltk.download()

In [2]:
import nltk
from nltk.corpus import brown # requires the brown corpus to be downloaded
# Use the Brown University corpus

In [3]:
# List the categories contained in the corpus
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
# Inspect the Brown corpus: print its sentence count, then its word count,
# each on its own line.
print(
    '共有{}个句子'.format(len(brown.sents())),
    '共有{}个单词'.format(len(brown.words())),
    sep='\n'
)

共有57340个句子
共有1161192个单词


In [5]:
# Show the first 10 tokens of the corpus
brown.words()[:10]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [6]:
# Word frequency statistics: build a frequency distribution over every
# token of the Brown corpus.
from nltk import FreqDist
dist = FreqDist(brown.words())

In [7]:
# Summarise the distribution: number of distinct words, the first 10 keys
# in the distribution's own iteration order, and how often 'the' occurs.
vocabulary = list(dist)
print('非重复单词个数：', len(vocabulary))
print('前10个单词：', vocabulary[:10])
print('the出现的个数：', dist['the'])

非重复单词个数： 56057
前10个单词： ["Yorker's", 'carted', 'Paulah', 'Wires', 'grated', 'oops', 'friezes', 'save', 'wiggle', 'furnishes']
the出现的个数： 62713


In [8]:
# Keep the words that are longer than 5 characters and occur more than 500 times
freq_words = [word for word, count in dist.items() if count > 500 and len(word) > 5]

In [9]:
# Display the selected long, high-frequency words
freq_words

['should',
 'around',
 'another',
 'against',
 'people',
 'between',
 'American',
 'without',
 'before',
 'through',
 'because',
 'himself',
 'thought',
 'little']

## 2. 分词

### 2.1 NLTK英文分词

In [None]:
# Split an English sentence into word tokens and print them
sentence = "Python is a widely used high-level programming language for general-purpose programming."
tokens = nltk.word_tokenize(sentence) # requires the punkt tokenizer model to be downloaded
print(tokens)

### 2.2 分句

In [None]:
# Split a multi-sentence paragraph into individual sentences
texts = 'Python is a widely used high-level programming language for general-purpose programming, created by Guido van Rossum and first released in 1991. An interpreted language, Python has a design philosophy that emphasizes code readability (notably using whitespace indentation to delimit code blocks rather than curly brackets or keywords), and a syntax that allows programmers to express concepts in fewer lines of code than might be used in languages such as C++ or Java.[23][24] The language provides constructs intended to enable writing clear programs on both a small and large scale.'
sentences = nltk.sent_tokenize(texts)

In [None]:
# Number of sentences produced by the sentence tokenizer
len(sentences)

In [None]:
# Display the list of sentences
sentences

### 2.3 中文结巴分词

In [None]:
# Install with: pip install jieba
import jieba

# Segment the same text twice: first in full mode (cut_all=True), then in
# precise mode (cut_all=False), printing the pieces joined by "/ ".
for label, cut_all in (("全模式: ", True), ("精确模式: ", False)):
    seg_list = jieba.cut("欢迎来到小象学院", cut_all=cut_all)
    print(label + "/ ".join(seg_list))

## 3. 词形归一化

### 3.1 词干提取(stemming)

In [None]:
# PorterStemmer
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
# Print the stem of each sample word, one per line
for sample in ('looked', 'went'):
    print(porter_stemmer.stem(sample))

In [None]:
# Lower-case the phrase, split it on single spaces, and stem every word;
# the resulting list is the cell's displayed value.
input1 = 'List listed lists listing listings'
words1 = input1.lower().split(' ')
list(map(porter_stemmer.stem, words1))

In [None]:
# SnowballStemmer
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')
# Print the stem of each sample word, one per line
for sample in ('looked', 'looking'):
    print(snowball_stemmer.stem(sample))

In [None]:
# LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer

lancaster_stemmer = LancasterStemmer()
# Print the stem of each sample word, one per line
for sample in ('looked', 'looking'):
    print(lancaster_stemmer.stem(sample))

### 3.2 词形归并(lemmatization)

In [None]:
from nltk.stem import WordNetLemmatizer # requires the wordnet corpus to be downloaded

wordnet_lematizer = WordNetLemmatizer()
# Lemmatize each sample without specifying a part of speech and print the
# result, one per line.
for sample in ('cats', 'boxes', 'are', 'went'):
    print(wordnet_lematizer.lemmatize(sample))

In [None]:
# Specifying the part of speech gives a more accurate lemma;
# lemmatize treats words as nouns by default, so pass pos='v' for verbs.
for sample in ('are', 'went'):
    print(wordnet_lematizer.lemmatize(sample, pos='v'))

## 4. 词性标注 (Part-Of-Speech)

In [None]:
import nltk

# Tokenize a sentence, then print each token paired with its part-of-speech tag
words = nltk.word_tokenize('Python is a widely used programming language.')
#print(words)
print(nltk.pos_tag(words)) # requires averaged_perceptron_tagger to be downloaded

## 5. 去除停用词

In [None]:
from nltk.corpus import stopwords # requires the stopwords corpus to be downloaded

# Build the English stopword set once. The previous version called
# stopwords.words('english') again for every token and then scanned the
# returned list linearly for each membership test.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not stopwords (comparison is case-sensitive,
# exactly as before).
filtered_words = [word for word in words if word not in english_stopwords]
print('原始词：', words)
print('去除停用词后：', filtered_words)

## 6. 典型的文本预处理流程

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Raw text
raw_text = 'Life is like a box of chocolates. You never know what you\'re gonna get.'

# Tokenization
raw_words = nltk.word_tokenize(raw_text)

# Lemmatization (no part of speech is passed, so the lemmatizer's default is used)
wordnet_lematizer = WordNetLemmatizer()
words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words]

# Stopword removal. The stopword list is loaded once into a set; the previous
# version called stopwords.words('english') again for every token and scanned
# the returned list linearly. The comparison stays case-sensitive, as before.
english_stopwords = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in english_stopwords]

print('原始文本：', raw_text)
print('预处理结果：', filtered_words)