In [1]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

#Training set 
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)

#Print 10 target names
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])


comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [2]:
#-> Example of CountVectorizer 1
tags = [
  "python, tools",
  "linux, tools, ubuntu",
  "distributed systems, linux, networking, tools",
]

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
data = vec.fit_transform(tags).toarray()
print(data)

#-> Vocabulary in CountVectorize

vocab = CountVectorizer(vocabulary=['hot', 'cold', 'old'])
cv_words_in_vocab = vocab.fit_transform(['pease porridge hot', 'pease porridge cold', 'pease porridge in the pot', 'nine days old']).toarray()
print(cv_words_in_vocab)

[[0 0 0 1 0 1 0]
 [0 1 0 0 0 1 1]
 [1 1 1 0 1 1 0]]
[[1 0 0]
 [0 1 0]
 [0 0 0]
 [0 0 1]]


In [3]:
#Tokenizing text with count vecorize (document to feature vectors)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [4]:
#->Frequency
algo_freq = count_vect.vocabulary_.get(u'algorithm')
is_freq = count_vect.vocabulary_.get(u'is')

print("algorithm",algo_freq,
      "\nis ", is_freq)

algorithm 4690 
is  18474


In [5]:
#Prob: Longer documents have words occuring more freq
#Soln: Divide by Total no of words in doc(TF)

#Prob: Commoning occuring words have higher freq 
#Soln Mulitphy with incerve document frequency

from sklearn.feature_extraction.text import TfidfTransformer
#fit the data 
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
#Transform count-matrix to tf-idf representation
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

#Before tf-idf
print("No if", X_train_counts.toarray()[:100])

#After tf-idf 
print("After tf ",X_train_tf.toarray()[:100])

No if [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
After tf  [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.01590913 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [6]:
#Combining the last 2(fit+transform)into one 

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


(2257, 35788)

In [7]:
#Naive Bayes - Multinomial

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [8]:
#Predict outcome with docs_new 

#Transform docs_new
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

#Predict uisng Multinomial NB

predicted = clf.predict(X_new_tfidf)
print("Predicted ", predicted)
#print(twenty_train.target_names)

for doc, category in zip(docs_new, predicted):
#    print("doc ", doc, "category ", category)
    print('%r => %s' % (doc, twenty_train.target_names[category]))


Predicted  [3 1]
'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [9]:
#Using a Pipeline

# 1.Vectorize
# 2.Transform
# 3.Classify

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [10]:
#Fit MNB 
text_clf.fit(twenty_train.data, twenty_train.target)


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [11]:
#Predict Multinomial NB on Test data

import numpy as np

#Test set
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data

#Predict on test data
mnb_predicted = text_clf.predict(docs_test)  

In [12]:
#Accuracy on test data

multi_nb_accuracy = np.mean(mnb_predicted == twenty_test.target)
print("Accuracy on test data \n"
      "Multinomial Naive Bayes ",
      multi_nb_accuracy*100)

Accuracy on test data 
Multinomial Naive Bayes  83.48868175765645


In [13]:
# SVM Classifier (loss=hinge)

from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
])

In [14]:
#Fitting training data using svm
text_clf.fit(twenty_train.data, twenty_train.target)  

#Prediction on test data
svm_predicted = text_clf.predict(docs_test)

In [15]:
#Accuracy of SVM

svm_accuracy = np.mean(svm_predicted == twenty_test.target)
print("Accuracy on test data")
print("Multinomial Naive bayes ", multi_nb_accuracy*100)
print("SVM ", svm_accuracy*100)

Accuracy on test data
Multinomial Naive bayes  83.48868175765645
SVM  91.27829560585884


In [16]:
#Performance - mnb

#todo: pretty print

from sklearn import metrics
print(metrics.classification_report(twenty_test.target, mnb_predicted,
    target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502



In [17]:
# performance - svm
print(metrics.classification_report(twenty_test.target, svm_predicted,
    target_names=twenty_test.target_names))


                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [18]:
# Confusion matrix - mnb

metrics.confusion_matrix(twenty_test.target, mnb_predicted)

array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]], dtype=int64)

In [19]:
#Confusion - svm

metrics.confusion_matrix(twenty_test.target, svm_predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]], dtype=int64)

In [20]:
# Grid search - Search best parameter of grid

#TODO: 
# 1. word or bigrams??
# 2. Put range of alpha

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

In [21]:
#Grid Classifier
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

#Fit the classifier on subset of categories
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])
gs_clf


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (0.01, 0.001)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
#Complete training set

gs_clf_full = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf_full = gs_clf_full.fit(twenty_train.data, twenty_train.target)
gs_clf_full

print(gs_clf_full.best_score_)            

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf_full.best_params_[param_name]))

0.9654408506867523
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [23]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [24]:
twenty_train.target_names[gs_clf.predict(['God is love', 'ksh or bash?'])[1]]

'comp.graphics'

In [25]:
#Predicting grid

doc_predict = ['God is love', 'ksh or bash?']

for index, doc in enumerate(doc_predict):
    #print("index", index)
    pred_doc = twenty_test.target_names[gs_clf.predict(doc_predict)[index]] 
    #todo: figure syntax, why index not in (), does it predict all? Why doesn't it work without [index]
    print(doc, " -> ", pred_doc)

God is love  ->  soc.religion.christian
ksh or bash?  ->  comp.graphics


In [26]:
#Best parameters from grid search

#todo: Mean cross-validated score of the best_estimator
print(gs_clf.best_score_)            

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.9
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [27]:
# Grid search predict on test data
gs_predict = gs_clf.predict(twenty_test.data)
print(gs_predict[:100])


gs_full_predict = gs_clf_full.predict(twenty_test.data)
print(gs_full_predict[:100])

[2 2 2 0 3 0 1 3 1 0 1 3 3 3 1 0 1 3 0 3 2 3 1 0 2 1 2 3 2 0 1 3 0 2 2 3 0
 0 2 1 2 0 2 3 2 2 3 1 3 1 1 3 3 1 0 3 0 2 1 2 1 2 3 1 2 1 2 0 3 2 0 3 1 3
 1 3 0 2 3 0 1 3 3 1 2 3 3 0 2 2 1 2 2 3 3 1 3 3 3 3]
[2 2 2 0 3 0 1 3 1 2 1 3 0 3 1 2 1 3 0 3 2 3 1 0 2 1 1 3 2 0 1 3 0 2 2 0 0
 0 2 1 2 0 2 3 2 2 3 1 3 1 1 2 3 2 0 3 0 1 1 2 1 2 3 1 2 1 2 0 2 2 0 3 1 3
 1 0 0 2 3 0 1 0 3 1 2 3 3 0 2 2 1 2 2 1 3 1 0 3 3 3]


In [28]:
#Error: figure it
#np.mean(gs_predict, twenty_test.data)

# performance - Grid search
print("400 rows of data\n", metrics.classification_report(twenty_test.target, gs_predict,
    target_names=twenty_test.target_names))

print("On complete categories\n", metrics.classification_report(twenty_test.target, gs_full_predict,
    target_names=twenty_test.target_names))

400 rows of data
                         precision    recall  f1-score   support

           alt.atheism       0.93      0.71      0.80       319
         comp.graphics       0.84      0.96      0.90       389
               sci.med       0.91      0.83      0.87       396
soc.religion.christian       0.80      0.91      0.85       398

           avg / total       0.87      0.86      0.86      1502

On complete categories
                         precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [29]:
#Confusion matrix - grid search  

metrics.confusion_matrix(twenty_test.target, svm_predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]], dtype=int64)