In [136]:
### Template for NLP / Text Analytics tasks

In [137]:
from __future__ import unicode_literals, print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from time import time
import numpy as np

In [139]:
# Pipeline: bag-of-words token counts fed into a random forest classifier.
# The step names ('vect', 'rf') are the prefixes used by the grid-search
# parameter keys, so they must stay as they are.
text_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    # Fixed seed: without it every fit builds a differently randomised forest,
    # so cross-validation scores change from one run of the notebook to the next.
    ('rf', RandomForestClassifier(random_state=42)),
])

In [140]:
# Fetch the four newsgroups used by this template; the `remove` argument asks
# the loader to drop headers, footers and quoted replies from the posts.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [169]:
# Info about the data: corpus size, label summary and a small sample.
print("Total %d documents" % len(data.filenames))
print("Total %d categories" % len(data.target_names))
print("Label/Targets shape %d" % len(data.target))
print("Label/Targets first 10 values = %s" % data.target[:10])
# Show only the first three documents (matching the label slice on the next
# line) instead of dumping the whole corpus into the cell output.
print("3 records of text :", data.data[:3])
print("labels :", data.target[:3])

Total 3759 documents
Total 4 categories
Label/Targets shape 3759
Label/Targets first 10 values = [2 3 3 1 2 3 2 1 1 1]
labels : [2 3 3]


In [88]:
# Hold out a fraction of the documents (and their labels) for final evaluation.
# The seed makes the random partition repeatable.
test_size = 0.3
seed = 42
train, test, train_label, test_label = train_test_split(
    data.data,
    data.target,
    test_size=test_size,
    random_state=seed,
)

In [131]:
# Report how many documents ended up in each split
print("train = %d documents" % (len(train),))
print("test = %d documents" % (len(test),))

train = 2631 documents
test = 1128 documents


In [132]:
# Search space for hyper-parameter tuning.
# Keys follow the "<pipeline step>__<parameter>" convention, so 'vect__...'
# targets the CountVectorizer step and 'rf__...' the random forest step.
param_grid = {
    'rf__n_estimators': (5, 10, 15),
    'vect__ngram_range': ((1, 1), (1, 2)),
}

In [153]:
# k-fold cross-validation + grid search over param_grid.
# The folds are contiguous slices of the training split (shuffle=False).
# KFold only consults random_state when shuffle=True, so no seed is passed:
# supplying one here would be a no-op that merely suggests randomised folds.
kfold = KFold(len(train_label), n_folds=10, shuffle=False)
grid_search = GridSearchCV(text_pipeline, param_grid, cv=kfold, scoring="f1_weighted", n_jobs=-1)
# Time the whole search so the cost of the cell is visible to the reader.
t_start = time()
# The value returned by fit() is kept as cvmodel and used for prediction later.
cvmodel = grid_search.fit(train, train_label)
print("Grid search processing time %0.3fs" % (time() - t_start))
print("Best F1-score: %0.3f" % grid_search.best_score_)

Grid search processing time 91.039s
Best F1-score: 0.680


In [134]:
# Predict on Test data
# cvmodel is the value returned by grid_search.fit(train, train_label);
# test is the held-out list of documents produced by train_test_split.
test_predicted = cvmodel.predict(test)

In [135]:
# Summarise the search: the best cross-validated score, followed by the
# selected value of every hyper-parameter that was part of the grid.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.683
Best parameters set:
	rf__n_estimators: 15
	vect__ngram_range: (1, 1)


In [128]:
# Display the true labels of the held-out test documents
test_label

array([2, 0, 0, ..., 3, 2, 1])

In [129]:
# Display the labels predicted for the held-out test documents
test_predicted

array([2, 0, 3, ..., 3, 2, 1])