In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
 
DATA_DIR = "./bbc/"

In [2]:
data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace")
# calculate count of each category
labels, counts = np.unique(data.target, return_counts=True)
# convert data.target_names to np array for fancy indexing
labels_str = np.array(data.target_names)[labels]
print(dict(zip(labels_str, counts)))


{'business': 510, 'entertainment': 386, 'politics': 417, 'sport': 511, 'tech': 401}


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
list(t[:80] for t in X_train[:10])

['Cheaper chip for mobiles\n\nA mobile phone chip which combines a modem and a compu',
 'Ukraine strikes Turkmen gas deal\n\nUkraine has agreed to pay 30% more for natural',
 'Mexicans tracking unhappy Juninho\n\nMexican outfit Red Sharks Veracruz hope to si',
 'Clarke faces ID cards rebellion\n\nCharles Clarke faces his first real test as hom',
 'Harinordoquy suffers France axe\n\nNumber eight Imanol Harinordoquy has been dropp',
 'House prices drop as sales slow\n\nHouse prices fell further in November and prope',
 'Tories leave door open for Archer\n\nThe Conservative Party would deal "sympatheti',
 'Talks aim to avert pension strike\n\nTalks aimed at averting a series of national ',
 "Gazprom 'in $36m back-tax claim'\n\nThe nuclear unit of Russian energy giant Gazpr",
 'Mourinho takes swipe at Arsenal\n\nChelsea boss Jose Mourinho has attempted to pil']

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000, decode_error="ignore")
vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [7]:
vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)

In [8]:
from sklearn.naive_bayes import MultinomialNB
cls = MultinomialNB()
# transform the list of text to tf-idf before passing it to the model
cls.fit(vectorizer.transform(X_train), y_train)
 
from sklearn.metrics import classification_report, accuracy_score
 
y_pred = cls.predict(vectorizer.transform(X_test))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9605026929982047
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       133
           1       0.97      0.97      0.97       101
           2       0.95      0.95      0.95        99
           3       0.98      1.00      0.99       119
           4       0.96      0.90      0.93       105

   micro avg       0.96      0.96      0.96       557
   macro avg       0.96      0.96      0.96       557
weighted avg       0.96      0.96      0.96       557



In [9]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score
 
# start with the classic
# with either pure counts or tfidf features
sgd = Pipeline([
        ("count vectorizer", CountVectorizer(stop_words="english", max_features=3000)),
        ("sgd", SGDClassifier(loss="modified_huber"))
    ])
sgd_tfidf = Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=3000)),
        ("sgd", SGDClassifier(loss="modified_huber"))
    ])
 
svc = Pipeline([
        ("count_vectorizer", CountVectorizer(stop_words="english", max_features=3000)),
        ("linear svc", SVC(kernel="linear"))
    ])
svc_tfidf = Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=3000)),
        ("linear svc", SVC(kernel="linear"))
    ])
   
all_models = [
    ("sgd", sgd),
    ("sgd_tfidf", sgd_tfidf),
    ("svc", svc),
    ("svc_tfidf", svc_tfidf),
    ]
 
unsorted_scores = [(name, cross_val_score(model, X_train, y_train, cv=2).mean()) for name, model in all_models]
scores = sorted(unsorted_scores, key=lambda x: -x[1])
print(scores)  



[('svc_tfidf', 0.9682189043281983), ('svc', 0.9514466864590148), ('sgd', 0.9460402124921825), ('sgd_tfidf', 0.944836856898448)]


In [10]:
model = svc_tfidf
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9838420107719928
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       133
           1       0.98      1.00      0.99       101
           2       0.98      0.97      0.97        99
           3       0.99      0.99      0.99       119
           4       0.99      0.99      0.99       105

   micro avg       0.98      0.98      0.98       557
   macro avg       0.98      0.98      0.98       557
weighted avg       0.98      0.98      0.98       557

