In [None]:
import sklearn

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from pprint import pprint

In [None]:
# Load the "train" subset of the 20 Newsgroups dataset (all topics).
# NOTE(review): scikit-learn presumably downloads and caches the corpus on first use — confirm.
newsgroup_train = fetch_20newsgroups(subset="train")

In [None]:
# Summarise the training set that is already loaded in `newsgroup_train`
# instead of calling fetch_20newsgroups() a second time and dumping the whole
# Bunch (every raw document) into the cell output.
print("fields:", list(newsgroup_train.keys()))
print("documents:", len(newsgroup_train.data))

In [None]:
# The full list of topics (newsgroup names) available in the dataset.
all_topic_names = list(newsgroup_train.target_names)
pprint(all_topic_names)

In [None]:
# Restrict the experiment to four topics.
categories = [
    "alt.atheism",
    "comp.graphics",
    "sci.med",
    "soc.religion.christian",
]

In [None]:
# Fetch the training documents that belong to the selected topics only.
twenty_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
)

In [None]:
# Tokenize the training documents and convert them into term-count vectors.
# `count_vect` is fitted here and reused later to vectorize the new test documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [None]:
# Apply the TF-IDF transform to the count vectors.
# The fitted transformer is kept so the test documents can be transformed the same way.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

## Rocchio程式範例

In [None]:
# Import NearestCentroid from the public `sklearn.neighbors` package.
# The `sklearn.neighbors.nearest_centroid` module path was deprecated in
# scikit-learn 0.22 and later removed, so importing from it fails on current
# releases; the public path works on both old and new versions.
from sklearn.neighbors import NearestCentroid

In [None]:
# Train the classifier on the TF-IDF features together with the matching
# topic ids stored in twenty_train.target.
clf = NearestCentroid().fit(
    X=X_train_tfidf,
    y=twenty_train.target,
)

In [None]:
# Build the test set (one document per entry), then vectorize it and apply
# the TF-IDF transform using the objects already fitted on the training data
# (transform only — no re-fitting).
docs_new = [
    "religious ",
    "OpenGL on the GPU is fast",
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [None]:
# Predict a label for each new document; the following cell uses each label
# as an index into twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)

In [None]:
# Show each test document next to the name of its predicted topic.
for doc, label_idx in zip(docs_new, predicted):
    topic_name = twenty_train.target_names[label_idx]
    print("%r => %s" % (doc, topic_name))

## 貝氏分類

In [None]:
# Scikit-learn 提供幾種貝氏分類，其中多項式貝氏最適合做文字分類
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Train the multinomial naive Bayes classifier on the TF-IDF features
# together with the matching topic ids stored in twenty_train.target.
clf = MultinomialNB().fit(
    X=X_train_tfidf,
    y=twenty_train.target,
)

In [None]:
# Build the test set (one document per entry), then vectorize it and apply
# the TF-IDF transform using the objects already fitted on the training data
# (transform only — no re-fitting).
docs_new = [
    "religious ",
    "OpenGL on the GPU is fast",
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [None]:
# Predict a label for each new document; the following cell uses each label
# as an index into twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)

In [None]:
# Show each test document next to the name of its predicted topic.
for doc, label_idx in zip(docs_new, predicted):
    topic_name = twenty_train.target_names[label_idx]
    print("%r => %s" % (doc, topic_name))

## K-近鄰演算法

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Use the 15 most similar articles (nearest neighbours) for each prediction.
# Train on the TF-IDF features together with the matching topic ids stored
# in twenty_train.target.
clf = KNeighborsClassifier(n_neighbors=15).fit(
    X=X_train_tfidf,
    y=twenty_train.target,
)

In [None]:
# Build the test set (one document per entry), then vectorize it and apply
# the TF-IDF transform using the objects already fitted on the training data
# (transform only — no re-fitting).
docs_new = [
    "religious ",
    "OpenGL on the GPU is fast",
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [None]:
# Predict a label for each new document; the following cell uses each label
# as an index into twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)

In [None]:
# Show each test document next to the name of its predicted topic.
for doc, label_idx in zip(docs_new, predicted):
    topic_name = twenty_train.target_names[label_idx]
    print("%r => %s" % (doc, topic_name))

## SVM

In [None]:
from sklearn import svm

In [None]:
# Train on the TF-IDF features together with the matching topic ids stored
# in twenty_train.target.
# A linear-kernel support vector classifier is used; per the original author's
# note, it gives better results for article classification.
clf = svm.SVC(kernel='linear').fit(
    X=X_train_tfidf,
    y=twenty_train.target,
)

In [None]:
# Build the test set (one document per entry), then vectorize it and apply
# the TF-IDF transform using the objects already fitted on the training data
# (transform only — no re-fitting).
docs_new = [
    "religious ",
    "OpenGL on the GPU is fast",
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [None]:
# Predict a label for each new document; the following cell uses each label
# as an index into twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)

In [None]:
# Show each test document next to the name of its predicted topic.
for doc, label_idx in zip(docs_new, predicted):
    topic_name = twenty_train.target_names[label_idx]
    print("%r => %s" % (doc, topic_name))