In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
dataset = fetch_20newsgroups(data_home="./data", subset="all")

In [3]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
X, y = dataset.data, dataset.target

In [5]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
X_bow = vectorizer.fit_transform(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.3, random_state=42)

In [7]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [8]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=dataset.target_names))

Accuracy: 0.8438273788468341
                          precision    recall  f1-score   support

             alt.atheism       0.85      0.92      0.88       236
           comp.graphics       0.63      0.83      0.72       287
 comp.os.ms-windows.misc       0.89      0.11      0.19       290
comp.sys.ibm.pc.hardware       0.57      0.80      0.66       285
   comp.sys.mac.hardware       0.74      0.91      0.82       312
          comp.windows.x       0.76      0.83      0.79       308
            misc.forsale       0.79      0.76      0.77       276
               rec.autos       0.88      0.93      0.90       304
         rec.motorcycles       0.88      0.92      0.90       279
      rec.sport.baseball       0.95      0.95      0.95       308
        rec.sport.hockey       0.97      0.96      0.97       309
               sci.crypt       0.97      0.93      0.95       290
         sci.electronics       0.84      0.81      0.82       304
                 sci.med       0.97      0.88 

In [9]:
test = vectorizer.transform(['football'])
dataset.target_names[model.predict(test).item()]

'rec.sport.baseball'