In [45]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Load the shuffled training split of the 20 Newsgroups dataset.
twenty_train = fetch_20newsgroups(shuffle=True, subset="train")

In [12]:
# Display the list of newsgroup category names in the training set.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [13]:
# Show the first three lines (the leading headers) of the first training document.
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu


In [19]:
# Extracting features from text files.
# Machine learning algorithms need numerical input, so the raw text
# documents are converted into numerical feature vectors using a
# bag-of-words model: each unique word in the vocabulary corresponds
# to one (descriptive) feature.
#
# count_vect.fit_transform(twenty_train.data) learns the vocabulary
# dictionary and returns the document-term matrix, whose shape
# [n_samples, n_features] is displayed as the cell output.

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape




(11314, 130107)

In [20]:
# TF: raw word counts have one issue: they give more weight to longer
# documents than to shorter ones. To avoid this, use term frequencies
# (TF) instead, i.e. count(word) / total words in each document.
#
# TF-IDF: in addition, reduce the weight of very common words
# (the, is, an, ...) that occur in nearly every document. This is
# called TF-IDF, i.e. term frequency times inverse document frequency.
#
# NOTE(review): the above is the conceptual description; the exact
# weighting/normalization is whatever TfidfTransformer applies with its
# default arguments — confirm against the scikit-learn documentation.

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [37]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features
# computed step by step in the previous cells.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

# The same vectorize -> TF-IDF -> classify chain can be expressed with
# less code as a single Pipeline that is fitted directly on the raw text.
text_clf = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", MultinomialNB()),
])

# Trailing ';' keeps the fitted estimator's repr out of the cell output.
text_clf.fit(twenty_train.data, twenty_train.target);



In [39]:
# Load the shuffled test split, predict its labels with the Naive Bayes
# pipeline, and display the fraction of correct predictions (accuracy).
twenty_test = fetch_20newsgroups(shuffle=True, subset="test")
predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.7738980350504514

In [None]:
# Support Vector Machines (SVM): try a different algorithm and see
# whether it performs better than Naive Bayes. SGDClassifier with hinge
# loss and an L2 penalty is used as the final pipeline step.
#
# `n_iter` was deprecated in scikit-learn 0.19 and later removed, so
# passing it raises a TypeError on current versions. The equivalent
# setting is `max_iter=5` with `tol=None`: exactly 5 passes over the
# training data with no tolerance-based early stopping.

text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                                   alpha=1e-3, max_iter=5, tol=None,
                                                   random_state=42)),
])

In [44]:
# Fit the SVM pipeline on the training split (result bound to `_` so the
# estimator repr is not shown), predict the test split, and display the
# fraction of correct predictions (accuracy).
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
(predicted_svm == twenty_test.target).mean()



0.8238183749336165

In [46]:
 parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),
}

In [47]:
# Search the hyper-parameter grid for the Naive Bayes pipeline on the
# training split, using all available CPU cores (n_jobs=-1).
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)

# Trailing ';' keeps the fitted estimator's repr out of the cell output.
gs_clf.fit(twenty_train.data, twenty_train.target);



In [49]:
# Best score found by the grid search. The search was fitted on the training
# split only (twenty_train), so this is not a test-set accuracy.
gs_clf.best_score_



0.9067526957751458

In [50]:
# Hyper-parameter combination that achieved the best grid-search score.
gs_clf.best_params_


{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}