# Objective
- use sklearn's Pipeline to "group" the components of the model and use GridSearchCV to find best model parameters
- classify news articles into different categories

Download the dataset from http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip and extract.
The dataset consists of 2225 documents and 5 categories: business, entertainment, politics, sport, tech

In [1]:
import numpy as np
from sklearn.datasets import load_files

In [2]:
DATA_DIR = "./bbc/"

In [3]:
%%cmd 
ls ./bbc

Microsoft Windows [Version 10.0.16299.371]
(c) 2017 Microsoft Corporation. All rights reserved.

E:\Playground\blog\bbc_classification>ls ./bbc
business
entertainment
politics
README.TXT
sport
tech

E:\Playground\blog\bbc_classification>

In [4]:
data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace")

In [5]:
# calculate number of articles for each category
labels, counts = np.unique(data.target, return_counts=True)
labels_str = np.array(data.target_names)[labels]
print(dict(zip(labels_str, counts)))

{'politics': 417, 'entertainment': 386, 'business': 510, 'tech': 401, 'sport': 511}


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
len(X_train), len(X_test)

(1668, 557)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words="english", )),
        ('classifier', LogisticRegression())
    ])

pipeline.fit(X_train, y_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [15]:
pipeline.score(X_test, y_test)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.9784560143626571

In [17]:
parameters = {
    'vectorizer__max_features': (None, 1000, 1000),
    'vectorizer__max_df': (0.5, 0.7, 1.0),
    'classifier__penalty': ('l1', 'l2')
}

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(pipeline, parameters)
grid_search.fit(X_train, y_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'classifier__penalty': ('l1', 'l2'), 'vectorizer__max_features': (None, 1000, 1000), 'vectorizer__max_df': (0.5, 0.7, 1.0)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
print("Best score %0.3f" % grid_search.best_score_)

Best score 0.969


In [19]:
print("Best parameters = ")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

Best parameters = 
classifier__penalty: 'l2'
vectorizer__max_df: 0.5
vectorizer__max_features: None


In [20]:
grid_search.best_estimator_.score(X_test, y_testst)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.9784560143626571