In [26]:
# Scikit learn: Working with text data
# tutorial: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# github source: https://github.com/scikit-learn/scikit-learn/tree/master/doc/tutorial/text_analytics
from sklearn import datasets
from sklearn import metrics

#data/model prep - module for preprocessing & model prep - includes scaling, centering, normalization, binarization and imputation methods.
from sklearn import preprocessing 
#used for removing stop words and obtaining feature extraction from text
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer  #bag-of-words vectorication for LDA model

#import sklearn libraries for NLP model building & validation
from sklearn.metrics import accuracy_score  #used for model evaluation - https://scikit-learn.org/stable/modules/model_evaluation.html#model-evaluation
from sklearn.decomposition import LatentDirichletAllocation #model for NLP topic extraction, similar to gensim LDA
from sklearn.datasets import make_multilabel_classification #create random test dataset
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier 
from sklearn.model_selection import train_test_split

In [39]:
# Load the training split of the 20 newsgroups dataset: a collection of
# approximately 20,000 newsgroup documents, partitioned (nearly) evenly
# across 20 different newsgroups.
from sklearn.datasets import fetch_20newsgroups

# Work with a partial dataset: only 4 of the 20 available categories.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# shuffle + fixed random_state keep the document order reproducible between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.


In [20]:
# Data Understanding/Exploration - Tableau + Summary Stats/Graphs
# Preview the first loaded post: show at most its first 20 lines.
first_post_lines = twenty_train.data[0].split("\n")
print("\n".join(first_post_lines[:20]))

In [42]:
#Data Prep - Preprocessing - Text Cleaning, Model Prep - Vectorization
#Text preprocessing, tokenizing and filtering of stopwords are all included in CountVectorizer,
#which builds a dictionary of features and transforms documents to feature vectors.
#For additional text extraction options, see:
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

#stop_words accepts the string 'english', a list of words, or None. Putting 'english' inside a
#collection only filters the literal token "english" and does NOT enable the built-in English
#stop-word list, so merge the built-in list with the custom words explicitly and pass a list.
#Add further words (e.g. 'algorithm') to the custom set below to filter them as well.
custom_stop_words = sorted(ENGLISH_STOP_WORDS.union({'science', 'math', 'beer'}))
count_vect = CountVectorizer(stop_words=custom_stop_words)

#create the model building "Train" vectorized dataset (documents x vocabulary count matrix)
X_train_counts = count_vect.fit_transform(twenty_train.data)

#CountVectorizer supports counts of N-grams of words or consecutive characters.
#Once fitted, the vectorizer has built a dictionary mapping each term to its feature (column) index.
#vocabulary_.get returns that column index - not an occurrence count - or None when the term
#is not in the vocabulary (for example because it was removed as a stop word).
print(count_vect.vocabulary_.get(u'algorithm'))

None


In [None]:
# Model Prep - Text Vectorization
# Option 1 - term frequency (tf) only: rescale the raw word counts without
# applying inverse-document-frequency weighting (use_idf=False).
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)

# Option 2 - tf-idf: Term Frequency times Inverse Document Frequency.
# Fit on the training counts first, then re-weight those same counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [22]:
# Model Building/Training - Classification
# Train a multinomial naive Bayes classifier that predicts the category of a
# post from its tf-idf weighted features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

In [24]:
# Make predictions on new documents with the trained MultinomialNB classifier.
# The feature extraction chain is almost the same as for training; the
# difference is that transform is called instead of fit_transform, because
# the vectorizer and the tf-idf transformer were already fit to the training set.
docs_new = ['God is love', 'OpenGL on the GPU is fast']

X_new_counts = count_vect.transform(docs_new)            # raw term counts
X_new_tfidf = tfidf_transformer.transform(X_new_counts)  # tf-idf weighting
predicted = clf.predict(X_new_tfidf)

# Each prediction is an integer index into twenty_train.target_names.
for idx, category in enumerate(predicted):
    doc = docs_new[idx]
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [29]:
# scikit-learn's Pipeline class bundles the vectorizer => transformer => classifier
# chain into a single object that behaves like a compound classifier.
from sklearn.pipeline import Pipeline

# Pipeline 1: word counts -> tf-idf -> multinomial naive Bayes classifier
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf1 = Pipeline(steps=nb_steps)

# Pipeline 2: word counts -> tf-idf -> SVM classifier (SGDClassifier with hinge loss).
# random_state is fixed so repeated runs give the same model.
svm_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
svm_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_classifier),
]
text_clf2 = Pipeline(steps=svm_steps)

In [37]:
# Model Selection - fit each pipeline on the training split and report its
# accuracy (fraction of correctly classified posts) on the held-out test split.
import numpy as np

twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Evaluated in order: naive Bayes first, then the SVM.
# Note: naive Bayes achieved 83.5% accuracy. The linear support vector machine (SVM)
# is tried next to see if it can do better; it is widely regarded as one of the best
# text classification algorithms.
# After the loop, `predicted` holds the predictions of the last pipeline (the SVM).
pipelines_to_score = (
    ("NB Model Classification %: ", text_clf1),
    ("SVM Model Classification %: ", text_clf2),
)
for heading, pipeline in pipelines_to_score:
    pipeline.fit(twenty_train.data, twenty_train.target)
    predicted = pipeline.predict(docs_test)
    print(heading)
    print(np.mean(predicted == twenty_test.target))

NB Model Classification %: 
0.8348868175765646




SVM Model Classification %: 
0.9127829560585885


In [38]:
# Detailed model performance results: per-category precision, recall, f1-score
# and support, computed from whatever `predicted` currently holds compared
# against the true test labels.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

             micro avg       0.91      0.91      0.91      1502
             macro avg       0.92      0.91      0.91      1502
          weighted avg       0.92      0.91      0.91      1502

