# P053 文本数据-CountVectorizer向量化

In [3]:
import numpy as np
import pandas as pd

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Toy corpus: four short English sentences, one document per list entry.
documents = [
    'python is a programming language',
    'python is popular',
    'programming in python',
    'object-oriented programming in python',
]

In [9]:
# Build a count vectorizer with no arguments, i.e. all default settings.
vectorizer = CountVectorizer()

In [11]:
# Fit the vectorizer on the corpus and show the result as the cell output;
# the return value is not stored in a variable here.
vectorizer.fit_transform(documents)

<4x8 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [15]:
# Fit on the corpus, convert the result to a dense array, and label each
# column with the feature name reported by the fitted vectorizer.
df = pd.DataFrame(
    data=vectorizer.fit_transform(documents).toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [17]:
# Show the DataFrame as the cell output.
df

Unnamed: 0,in,is,language,object,oriented,popular,programming,python
0,0,1,1,0,0,0,1,1
1,0,1,0,0,0,1,0,1
2,1,0,0,0,0,0,1,1
3,1,0,0,1,1,0,1,1


# P054 文本数据-计数向量化并配置停用词

In [20]:
# 问题：配置english，即使用内置的英语停用词列表，去除停用词

In [22]:
# stop_words='english' selects the built-in English stop-word list, so
# those words are dropped from the vocabulary.
vectorizer = CountVectorizer(stop_words='english')

In [24]:
# Re-fit with the stop-word-filtering vectorizer and tabulate the counts,
# one row per document and one column per feature name.
df = pd.DataFrame(
    data=vectorizer.fit_transform(documents).toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# Show the table as the cell output.
df

Unnamed: 0,language,object,oriented,popular,programming,python
0,1,0,0,0,1,1
1,0,0,0,1,0,1
2,0,0,0,0,1,1
3,0,1,1,0,1,1


# P055 文本数据-计数向量化并配置n-gram

In [29]:
# n-gram是一种基于统计语言模型的算法。基本思想是对文本进行大小为n的滑动窗口操作，形成长度为n的片段序列；此处CountVectorizer以词（word）为单位切分，ngram_range=(1, 2)表示同时提取单个词（1-gram）和相邻两个词的组合（2-gram）

In [33]:
# Keep the English stop-word filter and set ngram_range=(1, 2) so that both
# single words and two-word sequences become features.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))

In [35]:
# Re-fit with the n-gram vectorizer; the columns now hold both the
# single-word and the two-word features.
df = pd.DataFrame(
    data=vectorizer.fit_transform(documents).toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# Show the table as the cell output.
df

Unnamed: 0,language,object,object oriented,oriented,oriented programming,popular,programming,programming language,programming python,python,python popular,python programming
0,1,0,0,0,0,0,1,1,0,1,0,1
1,0,0,0,0,0,1,0,0,0,1,1,0
2,0,0,0,0,0,0,1,0,1,1,0,0
3,0,1,1,1,1,0,1,0,1,1,0,0


# P056 文本数据-TFIDF实现文本向量化

In [38]:
# TF-IDF(term frequency-inverse document frequency)是一种用于信息检索与数据挖掘的常用加权技术。
# TF是词频（Term Frequency），IDF是逆文本频率指数（Inverse Document Frequency）

In [40]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# Corpus for the TF-IDF examples: four short English sentences.
documents = [
    'python is a programming language',
    'python is popular',
    'programming in python',
    'object-oriented programming in python',
]

In [44]:
# Build a TF-IDF vectorizer with no arguments, i.e. all default settings.
tfidf_vectorizer = TfidfVectorizer()

In [46]:
# Fit the TF-IDF vectorizer on the corpus, convert the weights to a dense
# array, and use the fitted feature names as column labels.
df = pd.DataFrame(
    tfidf_vectorizer.fit_transform(documents).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [48]:
# Show the TF-IDF DataFrame as the cell output.
df

Unnamed: 0,in,is,language,object,oriented,popular,programming,python
0,0.0,0.519714,0.659191,0.0,0.0,0.0,0.420753,0.343993
1,0.0,0.572892,0.0,0.0,0.0,0.726641,0.0,0.379192
2,0.691131,0.0,0.0,0.0,0.0,0.0,0.55953,0.457453
3,0.433919,0.0,0.0,0.550372,0.550372,0.0,0.351295,0.287207


# P057 文本数据-TFIDF向量化增加停用词

In [51]:
# Pass an explicit stop-word list instead of a built-in one: 'is' and 'in'
# are excluded from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words=['is', 'in'])

In [53]:
# Re-fit with the custom stop-word list and tabulate the TF-IDF weights,
# one row per document and one column per remaining feature.
df = pd.DataFrame(
    tfidf_vectorizer.fit_transform(documents).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [55]:
# Show the stop-word-filtered TF-IDF DataFrame as the cell output.
df

Unnamed: 0,language,object,oriented,popular,programming,python
0,0.771579,0.0,0.0,0.0,0.492489,0.402642
1,0.0,0.0,0.0,0.886548,0.0,0.462637
2,0.0,0.0,0.0,0.0,0.774191,0.632952
3,0.0,0.610878,0.610878,0.0,0.389916,0.318782
