# Building a Basic NLP Pipeline w/Scikit-Learn

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Only import data from 4 categories
categories = ['alt.atheism', 
              'soc.religion.christian', 
              'comp.graphics', 
              'sci.med']
# Fetch our preprocessed training data from scikit-learn
twenty_train = fetch_20newsgroups(subset='train', 
                                  categories=categories, 
                                  shuffle=True, 
                                  random_state=1)

In [3]:
print("First lines of the first loaded file:")
print("\n".join(twenty_train.data[0].split("\n")[:3]))

First lines of the first loaded file:
From: jaeger@buphy.bu.edu (Gregg Jaeger)
Subject: Re: The Inimitable Rushdie (Re: An Anecdote about Islam
Organization: Boston University Physics Department


In [4]:
print(f"First 10 target indices: {twenty_train.target[:10]}")
print(f"Target names: {twenty_train['target_names']}")

First 10 target indices: [0 2 3 1 2 1 1 1 2 3]
Target names: ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Initialize a vectorizer object
count_vect = CountVectorizer()
# Fit the data set into our count vectorizer
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Confirms that there are 2257 sparse matrices with 35788 elements converted
# into Compressed Sparse Row format
X_train_counts.shape

(2257, 35788)

In [6]:
# Print index value of count_vect vocab entry for "physics" 
# This is directly linked to the frequency of the word in the whole training corpus
count_vect.vocabulary_.get('physics')

25121

'''

Occurrence count is a good start but there is an issue: 
longer documents will have higher average count values than shorter 
documents, even though they might talk about the same topics.

To avoid these potential discrepancies it suffices to divide the number 
of occurrences of each word in a document by the total number of words 
in the document: these new features are called tf for Term Frequencies.

Another refinement on top of tf is to downscale weights for words that 
occur in many documents in the corpus and are therefore less informative 
than those that occur only in a smaller portion of the corpus.

This downscaling is called tf–idf for 
“Term Frequency times Inverse Document Frequency”.

'''

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
# Save word occurence counts into its own object
X_train_counts = CountVectorizer().fit_transform(twenty_train.data)

# Initialize a Term Frequency object & fit with counts
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

# Transform the counts data into term frequencies data
X_train_tf = tf_transformer.transform(X_train_counts)
print(f"Shape of TF data: {X_train_tf.shape}")

# Initialize a TF times IDF object
# (weights are downscaled for rare words)
tfidf_transformer = TfidfTransformer()
# Fit & transform the counts data into TFID data
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(f"Shape of TFIDF data: {X_train_tfidf.shape}")


Shape of TF data: (2257, 35788)
Shape of TFIDF data: (2257, 35788)


'''

Now that we have our features, we can train a classifier to try to predict 
the category of a post. Let’s start with a naïve Bayes classifier, which 
provides a nice baseline for this task.

'''

In [8]:
from sklearn.naive_bayes import MultinomialNB
# Initialize a naïve Bayes classifier & fit with TFIDF data & target indices
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [9]:
# Initialize occurency count vectorizer object
count_vect = CountVectorizer()
# Fit object with our training data
count_vect.fit(twenty_train.data)

# Two documents to test classifying based on training data
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Use fitted count vectorizer to transform new documents into likewise vectors
X_new_counts = count_vect.transform(docs_new)
# Convert those into TFIDF vectors
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# Use our naïve Bayes classifier to predict tags based on TDIDF vectors
predicted = clf.predict(X_new_tfidf)

# Pretty print our results
for doc, category in zip(docs_new, predicted):
    print(f"{doc} => {twenty_train.target_names[category]}")

God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics


'''

Now that we understand the modular components of text classification,
we can construct a pipeline to simplify the procedure.

'''

In [10]:
from sklearn.pipeline import Pipeline
# We specify the word2vec obect as CountVectorizer(),
# the conversion of vectors to more useful vectors with TfidfTransformer(),
# & the classifier object as MultinomialNB()
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# The names 'vect','tfidf', & 'clf' are abitrary but useful when we perform
# a Grid Search for suitable hyperparameters

In [11]:
# Visualize our Pipeline easily
text_clf.fit(twenty_train.data, twenty_train.target)

In [12]:
import numpy as np
# Let's evaluate the predictive accuracy of the Pipeline

# Fetch the preprocessed scikit-learn dataset
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)
# Load the documents into a data variable
docs_test = twenty_test.data
# Use the Pipeline object to process the docs & predict their tags
predicted = text_clf.predict(docs_test)
# Conduct a mean function across two arrays to find the amount of 
# correct predictions over the total predictions
print(f"Predictive Accuracy: {round((np.mean(predicted == twenty_test.target))*100,2)}")

Predictive Accuracy: 83.49


In [13]:
from sklearn.linear_model import SGDClassifier
# Lets try to improve the accuracy of our model with linear SVM

# Redefine our pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])
# Fit Pipeline with our data & targets
text_clf.fit(twenty_train.data, twenty_train.target)
# Load the documents into a data variable
docs_test = twenty_test.data
# Use the Pipeline object to process the docs & predict their tags
predicted = text_clf.predict(docs_test)
# Conduct a mean function across two arrays to find the amount of 
# correct predictions over the total predictions
print(f"Predictive Accuracy: {round((np.mean(predicted == twenty_test.target))*100,2)}")

Predictive Accuracy: 90.81


In [14]:
from sklearn import metrics
# Classification reports are useful for visualizing the important
# metrics of any model with respect to the targets
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))
# The confusion matrix is what the classification report is based on
metrics.confusion_matrix(twenty_test.target, predicted)

                        precision    recall  f1-score   support

           alt.atheism       0.96      0.80      0.87       319
         comp.graphics       0.86      0.98      0.92       389
               sci.med       0.94      0.88      0.91       396
soc.religion.christian       0.89      0.95      0.92       398

              accuracy                           0.91      1502
             macro avg       0.91      0.90      0.91      1502
          weighted avg       0.91      0.91      0.91      1502



array([[256,  11,  14,  38],
       [  2, 381,   3,   3],
       [  4,  38, 350,   4],
       [  5,  12,   4, 377]], dtype=int64)

'''

Our biggest sources of error come from the model predicting posts
from news-groups on atheism as from Christian groups as well as the model
predicting that posts from news-groups on the medical world as from 
the graphic design world

'''

# Parameter Tuning

In [15]:
# Lets revisit our pipeline and perform a search for best parameters for
# these data

from sklearn.model_selection import GridSearchCV

parameters = {
    # Try out all classifiers on either words or bigrams
    'vect__ngram_range': [(1, 1), (1, 2)],
    # Try out all classifiers with IDF or without IDF
    'tfidf__use_idf': (True, False),
    # Try out all classifiers with a penalty of 0.01 or 0.001 for SVM
    'clf__alpha': (1e-2, 1e-3),
    'clf__loss': ('squared_error', 'log_loss', 'perceptron', 'hinge', 'epsilon_insensitive', 'huber', 'squared_epsilon_insensitive', 'squared_hinge', 'modified_huber'),
}

In [16]:
# Initializes a symbolic GridSearch object
gs_clf = GridSearchCV(
    text_clf, 
    parameters, 
    cv=5, 
    # Grid Search will detect how many CPUs are installed and use them all
    n_jobs=-1)


In [17]:
# Fit the GridSearch object with a subset of our data to reduce time
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
# Print a prediction for visual confirmation
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [18]:
# The objects best parameters & corresponding score can found with 
# the respective attributes
print(f"Best Predictive Accuracy using GridSearch: {round((gs_clf.best_score_ * 100),2)}")
for param_name in sorted(parameters.keys()):
    print(f"Param {param_name} : {gs_clf.best_params_[param_name]}")

Best Predictive Accuracy using GridSearch: 97.25271275238899
Param clf__alpha : 0.001
Param clf__loss : modified_huber
Param tfidf__use_idf : True
Param vect__ngram_range : (1, 1)


In [19]:
import pandas as pd
# Using pandas, we can easily visualize the full GridSearch results
pd.DataFrame(gs_clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_clf__loss,param_tfidf__use_idf,param_vect__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.292487,0.031227,0.265880,0.020184,0.01,squared_error,True,"(1, 1)","{'clf__alpha': 0.01, 'clf__loss': 'squared_err...",0.913717,0.904867,0.909091,0.893570,0.866962,0.897641,0.016730,41
1,5.468441,0.154198,0.695859,0.044665,0.01,squared_error,True,"(1, 2)","{'clf__alpha': 0.01, 'clf__loss': 'squared_err...",0.909292,0.889381,0.900222,0.875831,0.847007,0.884346,0.021755,42
2,1.377472,0.042754,0.295891,0.039603,0.01,squared_error,False,"(1, 1)","{'clf__alpha': 0.01, 'clf__loss': 'squared_err...",0.809735,0.763274,0.800443,0.769401,0.756098,0.779790,0.021285,62
3,5.128292,0.174773,0.584978,0.043430,0.01,squared_error,False,"(1, 2)","{'clf__alpha': 0.01, 'clf__loss': 'squared_err...",0.823009,0.767699,0.798226,0.773836,0.749446,0.782443,0.025593,61
4,1.330197,0.080082,0.229736,0.025333,0.01,log_loss,True,"(1, 1)","{'clf__alpha': 0.01, 'clf__loss': 'log_loss', ...",0.818584,0.807522,0.853659,0.784922,0.756098,0.804157,0.032709,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,4.786297,0.154798,0.585395,0.053550,0.001,squared_hinge,False,"(1, 2)","{'clf__alpha': 0.001, 'clf__loss': 'squared_hi...",0.825221,0.798673,0.800443,0.789357,0.769401,0.796619,0.018058,59
68,1.217457,0.029295,0.254512,0.029695,0.001,modified_huber,True,"(1, 1)","{'clf__alpha': 0.001, 'clf__loss': 'modified_h...",0.975664,0.975664,0.975610,0.968958,0.966741,0.972527,0.003883,1
69,4.632079,0.085069,0.435684,0.059945,0.001,modified_huber,True,"(1, 2)","{'clf__alpha': 0.001, 'clf__loss': 'modified_h...",0.977876,0.966814,0.968958,0.962306,0.962306,0.967652,0.005729,6
70,1.167883,0.062004,0.224977,0.020820,0.001,modified_huber,False,"(1, 1)","{'clf__alpha': 0.001, 'clf__loss': 'modified_h...",0.960177,0.933628,0.960089,0.937916,0.929047,0.944171,0.013331,17


# Constructing the Best Model

In [22]:
from sklearn.linear_model import SGDClassifier
# Lets try to improve the accuracy with the best parameters

# Redefine our pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                          alpha=0.001, random_state=1,
                          max_iter=5, tol=None)),
                        ])

# Fit Pipeline with our data & targets
text_clf.fit(twenty_train.data, twenty_train.target)
# Load the documents into a data variable
docs_test = twenty_test.data
# Use the Pipeline object to process the docs & predict their tags
predicted = text_clf.predict(docs_test)
# Conduct a mean function across two arrays to find the amount of 
# correct predictions over the total predictions
print(f"Predictive Accuracy: {round((np.mean(predicted == twenty_test.target)*100),2)}")

Predictive Accuracy: 91.94
