In [1]:
# Load the training split of the 20 Newsgroups corpus.
# The first call downloads the archive; the result is bound to `news_train`.
from sklearn.datasets import fetch_20newsgroups
news_train = fetch_20newsgroups(shuffle=True, subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [2]:
# Show the newsgroup labels carried by the training split
news_train["target_names"]

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Bag-of-words features: learn the vocabulary from the training posts and
# turn each post into a row of raw token counts.
from sklearn.feature_extraction.text import CountVectorizer
cnt = CountVectorizer()
X_train_count = cnt.fit_transform(raw_documents=news_train.data)
# Echo the matrix dimensions as the cell result
X_train_count.shape

(11314, 130107)

In [5]:
# Re-weight the raw counts with TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
# Learn the IDF weights from the count matrix, then apply them to that same matrix
tfidf.fit(X_train_count)
X_train_tfidf = tfidf.transform(X_train_count)
# Echo the matrix dimensions as the cell result
X_train_tfidf.shape

(11314, 130107)

In [6]:
# Fit a multinomial Naive Bayes model directly on the TF-IDF matrix,
# using the integer newsgroup labels as targets.
from sklearn.naive_bayes import MultinomialNB
clsf_NB = MultinomialNB().fit(X=X_train_tfidf, y=news_train.target)

In [7]:
# Chain vectoriser -> TF-IDF -> Naive Bayes into one estimator that accepts raw text.
# Note: this rebinds `clsf_NB` from the bare classifier to the whole pipeline.
from sklearn.pipeline import Pipeline
clsf_NB = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
clsf_NB = clsf_NB.fit(news_train.data, news_train.target)

In [8]:
# Performance check for NB: accuracy on the held-out test split
import numpy as np
news_test = fetch_20newsgroups(subset='test', shuffle=True)
# Predict with the `clsf_NB` pipeline (the original referenced the undefined
# name `clsf_N`, which raises NameError). The pipeline takes raw text.
predicted_NB = clsf_NB.predict(news_test.data)
# Fraction of test posts whose predicted label matches the true label
np.mean(predicted_NB == news_test.target)

0.7738980350504514

In [9]:
# Training a linear SVM (hinge-loss SGD) pipeline and calculating its test accuracy

from sklearn.linear_model import SGDClassifier
# `n_iter` was removed from SGDClassifier; `max_iter=5` with `tol=None`
# (no early stopping) runs the same fixed five epochs the original asked for.
clsf_SVM = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                     ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                               max_iter=5, tol=None, random_state=42))])

clsf_SVM = clsf_SVM.fit(news_train.data, news_train.target)
predicted_SVM = clsf_SVM.predict(news_test.data)
# Fraction of test posts whose predicted label matches the true label
np.mean(predicted_SVM == news_test.target)



0.82381837493361654

In [13]:
# Performance tuning for NB: grid-search unigram vs. unigram+bigram features,
# IDF weighting on/off, and the smoothing parameter alpha.
from sklearn.model_selection import GridSearchCV
# Keys follow the `<pipeline step>__<parameter>` convention of the NB pipeline
param_NB = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

# Search over the `clsf_NB` pipeline with the `param_NB` grid (the original
# passed the undefined names `clf_NB` and `param`); n_jobs=-1 uses all cores.
gs_clf_NB = GridSearchCV(clsf_NB, param_NB, n_jobs=-1)
# The original call was missing its closing parenthesis (SyntaxError)
gs_clf_NB = gs_clf_NB.fit(news_train.data, news_train.target)

# Only the last expression of a cell is rendered, so best_params_ is what is shown
gs_clf_NB.best_score_
gs_clf_NB.best_params_


{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [14]:
# Performance tuning for SVM: grid-search unigram vs. unigram+bigram features,
# IDF weighting on/off, and the regularisation strength alpha.
from sklearn.model_selection import GridSearchCV
# Keys follow the `<pipeline step>__<parameter>` convention; the classifier step is named 'clf-svm'
param_SVM = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

# Search over the `clsf_SVM` pipeline (the original passed the undefined
# name `clf_svm`, which raises NameError); n_jobs=-1 uses all cores.
gs_clf_SVM = GridSearchCV(clsf_SVM, param_SVM, n_jobs=-1)
gs_clf_SVM = gs_clf_SVM.fit(news_train.data, news_train.target)

# Only the last expression of a cell is rendered, so best_params_ is what is shown
gs_clf_SVM.best_score_
gs_clf_SVM.best_params_



{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [15]:
# Naive Bayes pipeline variant that removes English stop words.
# stop_words='english' is an option of CountVectorizer itself; NLTK is not used here.
# Note: this rebinds `clsf_NB` to a new pipeline that is not fitted in this cell.
from sklearn.pipeline import Pipeline
clsf_NB = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [17]:
# Stemming Code

import nltk
# Fetch only the stop-word corpus that SnowballStemmer(ignore_stopwords=True)
# reads. A bare nltk.download() opens the interactive downloader, which blocks
# an unattended "Restart & Run All".
nltk.download('stopwords')

from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer; ignore_stopwords=True leaves stop words unstemmed
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens.

        The parent class's analyzer is built first; each token it yields is
        then passed through the module-level ``stemmer``.
        """
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# Stemming vectoriser with English stop-word removal enabled
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# Stemmed counts -> TF-IDF -> Naive Bayes (fit_prior=False)
text_mnb_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])

# Train on the raw training posts, then label the test posts
text_mnb_stemmed = text_mnb_stemmed.fit(news_train.data, news_train.target)
predicted_mnb_stemmed = text_mnb_stemmed.predict(news_test.data)

# Fraction of test posts whose predicted label matches the true label
np.mean(predicted_mnb_stemmed == news_test.target)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


0.81678173127987252