In [1]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 'train' and 'test' subsets of 20 Newsgroups, each shuffled with the
# same fixed seed so the document order is repeatable between runs.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

In [2]:
# Pull the documents (.data) and their targets (.target) out of each split.
X_train, y_train = twenty_train.data, twenty_train.target
X_test, y_test = twenty_test.data, twenty_test.target

In [3]:
import time
from sklearn.metrics import accuracy_score

def train(classifier,X_train,y_train,X_test,y_test):
    """Fit ``classifier`` on the training split and report its test accuracy.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Inputs and labels handed to ``classifier.fit``.
    X_test, y_test
        Inputs handed to ``classifier.predict`` and the labels the
        predictions are scored against with ``accuracy_score``.

    Returns
    -------
    The same ``classifier`` object, after fitting.

    Side effects
    ------------
    Prints the test accuracy and the elapsed fitting time in seconds.
    Only the ``fit`` call is timed; prediction is excluded.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted mid-fit.
    start = time.perf_counter()
    classifier.fit(X_train, y_train)
    elapsed = time.perf_counter() - start

    predicted = classifier.predict(X_test)

    print("Accuracy: ", accuracy_score(y_test, predicted))
    print("Time duration: " + str(elapsed))
    return classifier

In [4]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

**Model 1: Without stop-word removal**

In [5]:
# Baseline pipeline: TfidfVectorizer with its default arguments feeding MultinomialNB.
model_1 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])
train(model_1, X_train, y_train, X_test, y_test)

Accuracy:  0.7738980350504514
Time duration: 6.518404245376587


Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', MultinomialNB())])

**Model 2: Stop-word removal**

In [6]:
# Same pipeline as the baseline, but the vectorizer is given stop_words='english'.
model_2 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
])
train(model_2, X_train, y_train, X_test, y_test)

Accuracy:  0.8169144981412639
Time duration: 6.780815601348877


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('classifier', MultinomialNB())])

**Model 3: Stop-word removal + Stemming**

In [7]:
def stemming_tokenizer(text):
    """Tokenize ``text`` with ``word_tokenize`` and return the Porter stem of each token.

    A new ``PorterStemmer`` is created on every call. The result is a list
    with one stem per token, in token order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, word_tokenize(text)))
# Pipeline whose vectorizer tokenizes through stemming_tokenizer and also
# receives stop_words='english'.
# NOTE(review): the stop list is presumably matched against the *stemmed*
# tokens, so stop words whose stem differs from the listed form may not be
# filtered - confirm against scikit-learn's stop-word consistency warning.
model_3 = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer),
    ),
    ('classifier', MultinomialNB()),
])
train(model_3, X_train, y_train, X_test, y_test)



Accuracy:  0.7833244822092406
Time duration: 217.64855360984802


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words='english',
                                 tokenizer=<function stemming_tokenizer at 0x000000316FF5D1F0>)),
                ('classifier', MultinomialNB())])

**Model 4: Stop-word removal + Lemmatization**

In [8]:
def lemma_tokenizer(text):
    """Tokenize ``text`` with ``word_tokenize`` and lemmatize each token.

    A new ``WordNetLemmatizer`` is created on every call and ``lemmatize`` is
    invoked with the token as its only argument. The result is a list with
    one entry per token, in token order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, word_tokenize(text)))
# Pipeline whose vectorizer tokenizes through lemma_tokenizer and also
# receives stop_words='english'.
model_4 = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(stop_words='english', tokenizer=lemma_tokenizer),
    ),
    ('classifier', MultinomialNB()),
])
train(model_4, X_train, y_train, X_test, y_test)



Accuracy:  0.7943441317047265
Time duration: 124.31054306030273


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words='english',
                                 tokenizer=<function lemma_tokenizer at 0x000000316FF5D430>)),
                ('classifier', MultinomialNB())])

**Grid search on model_2, which achieves the highest accuracy so far**

In [9]:
from sklearn.model_selection import GridSearchCV
# Search space, keyed by '<pipeline step>__<parameter>': two n-gram ranges for
# the vectorizer and six alpha values for the classifier.
parameters = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__alpha': (0.1, 0.01, 0.001, 0.5, 0.05, 0.005),
}

# Build the search around model_2 and fit it on the training split;
# fit() hands back the search object itself, so the chained call keeps gs_clf
# bound to the fitted GridSearchCV.
gs_clf = GridSearchCV(
    estimator=model_2,
    param_grid=parameters,
    n_jobs=-1,
).fit(twenty_train.data, twenty_train.target)

In [10]:
# Show every parameter name the grid-search object exposes. Pipeline parameters
# appear here with an extra 'estimator__' prefix compared with the keys used in
# `parameters`.
gs_clf.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__vectorizer', 'estimator__classifier', 'estimator__vectorizer__analyzer', 'estimator__vectorizer__binary', 'estimator__vectorizer__decode_error', 'estimator__vectorizer__dtype', 'estimator__vectorizer__encoding', 'estimator__vectorizer__input', 'estimator__vectorizer__lowercase', 'estimator__vectorizer__max_df', 'estimator__vectorizer__max_features', 'estimator__vectorizer__min_df', 'estimator__vectorizer__ngram_range', 'estimator__vectorizer__norm', 'estimator__vectorizer__preprocessor', 'estimator__vectorizer__smooth_idf', 'estimator__vectorizer__stop_words', 'estimator__vectorizer__strip_accents', 'estimator__vectorizer__sublinear_tf', 'estimator__vectorizer__token_pattern', 'estimator__vectorizer__tokenizer', 'estimator__vectorizer__use_idf', 'estimator__vectorizer__vocabulary', 'estimator__classifier__alpha', 'estimator__classifier__class_prior', 'estimator__classifier__fit_pr

In [11]:
# Display the parameter combination the fitted grid search selected.
gs_clf.best_params_

{'classifier__alpha': 0.01, 'vectorizer__ngram_range': (1, 2)}

In [12]:
# Score the fitted grid-search object on the test split.
predicted = gs_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, predicted)

print("Accuracy: ", test_accuracy)

Accuracy:  0.8300584174190122
