### Text classification

In [None]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Restrict the corpus to four newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Read the training and testing folders with the same category filter,
# decoding the files as latin-1
twenty_train_small = load_files(
    'data/newsgroup/20news-bydate-train/',
    encoding='latin-1',
    categories=categories,
)
twenty_test_small = load_files(
    'data/newsgroup/20news-bydate-test/',
    encoding='latin-1',
    categories=categories,
)

# Show the loaded training set object as the cell output
twenty_train_small

In [None]:
# Vectorize the training documents with TF-IDF; min_df=2 is passed so that
# (per the scikit-learn docs) terms seen in fewer than two documents are dropped
vectorizer = TfidfVectorizer(min_df=2)
y_train = twenty_train_small.target
X_train = vectorizer.fit_transform(twenty_train_small.data)

# Train a multinomial naive Bayes model and report its score on the data it was fit on
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
train_score = classifier.score(X_train, y_train)
print("Training score: {0:.1f}%".format(train_score * 100))

# Vectorize the test documents with the already-fitted vectorizer
# (transform only, no refit) and report the score on them
y_test = twenty_test_small.target
X_test = vectorizer.transform(twenty_test_small.data)
test_score = classifier.score(X_test, y_test)
print("Testing score: {0:.1f}%".format(test_score * 100))

In [None]:
# Get the analyzer callable of a default-configured TfidfVectorizer and run
# it on a sample sentence to inspect the tokens it produces
default_vectorizer = TfidfVectorizer()
analyzer = default_vectorizer.build_analyzer()
analyzer("I love scikit-learn: this is a cool Python lib!")

In [None]:
# Customized analyzer: the preprocessor hands the text back untouched, and the
# token pattern matches runs of word characters and hyphens, so a hyphenated
# word such as "scikit-learn" can stay a single token
custom_vectorizer = TfidfVectorizer(
    token_pattern=r'(?u)\b[\w-]+\b',
    preprocessor=lambda text: text,
)
analyzer = custom_vectorizer.build_analyzer()

# Same sample sentence as before, for comparison
analyzer("I love scikit-learn: this is a cool Python lib!")

In [None]:
# Rebuild the vectorizer with an identity preprocessor and a token pattern
# that also accepts hyphens, then refit everything.
# Note: this rebinds vectorizer, X_train, classifier and X_test from the
# earlier cell (and no min_df is passed this time).
vectorizer = TfidfVectorizer(
    token_pattern=r'(?u)\b[\w-]+\b',
    preprocessor=lambda text: text,
)
train_docs, y_train = twenty_train_small.data, twenty_train_small.target
X_train = vectorizer.fit_transform(train_docs)

# Fit a fresh naive Bayes classifier and print its score on the training vectors
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
train_pct = classifier.score(X_train, y_train) * 100
print("Training score: {0:.1f}%".format(train_pct))

# Transform (without refitting) the test documents and print the score on them
test_docs, y_test = twenty_test_small.data, twenty_test_small.target
X_test = vectorizer.transform(test_docs)
test_pct = classifier.score(X_test, y_test) * 100
print("Testing score: {0:.1f}%".format(test_pct))

In [None]:
from sklearn.metrics import classification_report

# Predict labels for the test vectors, then print scikit-learn's
# classification report with one row per newsgroup name
predicted = classifier.predict(X_test)
report = classification_report(
    twenty_test_small.target,
    predicted,
    target_names=twenty_test_small.target_names,
)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Tabulate the true test labels against the predicted ones and show the matrix
cm = confusion_matrix(y_true=twenty_test_small.target, y_pred=predicted)
cm

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Draw the confusion matrix as a heat map. The y axis is labelled with the
# true class and the x axis with the predicted class.
target_names = twenty_test_small.target_names
tick_marks = np.arange(len(target_names))

# Use an explicit Figure/Axes pair instead of the implicit pyplot state
fig, ax = plt.subplots()
image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)

# Colour scale so the shading of each cell can be read back as a count
fig.colorbar(image, ax=ax)

# One tick per class, labelled with the newsgroup name
ax.set_xticks(tick_marks)
ax.set_xticklabels(target_names, rotation=60)
ax.set_yticks(tick_marks)
ax.set_yticklabels(target_names)

ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
ax.set_title('Confusion matrix')

# Render the figure explicitly so the cell does not end on a Text(...) repr
plt.show()