In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('newsgroups_train.csv')
df.head()

Unnamed: 0,text,target,category
0,From: degroff@netcom.com (21012d)\nSubject: Re...,2,sci.space
1,From: ab@nova.cc.purdue.edu (Allen B)\nSubject...,1,comp.graphics
2,From: healta@saturn.wwc.edu (Tammy R Healy)\nS...,0,alt.atheism
3,From: capelli@vnet.IBM.COM (Ron Capelli)\nSubj...,1,comp.graphics
4,From: henry@zoo.toronto.edu (Henry Spencer)\nS...,2,sci.space


In [3]:
df.shape

(1657, 3)

In [4]:
target_names = df['target'].unique()
target_names

array([2, 1, 0])

In [5]:
dict =  {2 : 'sci.space',1: 'comp.graphics',0:'alt.atheism'}
dict

{2: 'sci.space', 1: 'comp.graphics', 0: 'alt.atheism'}

In [6]:
ls = [dict.get(val) for val in target_names]
ls

['sci.space', 'comp.graphics', 'alt.atheism']

In [7]:
labels = df['category'].unique()
labels

array(['sci.space', 'comp.graphics', 'alt.atheism'], dtype=object)

In [8]:
article_fifth = df['text'][4]
print(article_fifth)

From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: TRUE "GLOBE", Who makes it?
Organization: U of Toronto Zoology
Lines: 12

In article <bill.047m@xpresso.UUCP> bill@xpresso.UUCP (Bill Vance) writes:
>It has been known for quite a while that the earth is actually more pear
>shaped than globular/spherical.  Does anyone make a "globe" that is accurate
>as to actual shape, landmass configuration/Long/Lat lines etc.?

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.
-- 
SVR4 resembles a high-speed collision   | Henry Spencer @ U of Toronto Zoology
between SVR3 and SunOS.    - Dick Dunn  |  henry@zoo.toronto.edu  utzoo!henry



In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectoriser = CountVectorizer(max_features=1500)
vectors = vectoriser.fit_transform(df['text']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
X = vectors
y = df['target']
X.shape, y.shape

((1657, 1500), (1657,))

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1325, 1500), (332, 1500), (1325,), (332,))

In [12]:
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

array([0, 1, 1, 0, 2, 0, 1, 2, 1, 1, 1, 2, 0, 0, 0, 1, 0, 2, 0, 1, 1, 1,
       2, 2, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2, 1, 2, 0, 2, 2, 1, 2, 0, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 0, 1, 0, 2, 2, 2, 1, 2, 1, 2, 1,
       1, 1, 0, 0, 2, 0, 2, 1, 1, 1, 2, 1, 0, 0, 1, 2, 2, 1, 1, 1, 1, 1,
       2, 2, 1, 2, 2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       2, 0, 0, 1, 0, 0, 1, 1, 0, 2, 1, 1, 0, 0, 1, 0, 0, 2, 2, 0, 1, 1,
       0, 1, 0, 1, 0, 2, 1, 0, 1, 1, 1, 0, 0, 0, 2, 1, 1, 2, 1, 2, 2, 1,
       2, 1, 2, 1, 1, 1, 1, 1, 2, 0, 0, 1, 1, 1, 2, 1, 0, 2, 2, 1, 2, 1,
       0, 1, 1, 2, 0, 1, 2, 1, 1, 1, 0, 0, 2, 2, 2, 1, 1, 1, 0, 1, 1, 0,
       0, 2, 0, 1, 1, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 2, 2, 2, 0, 2, 1,
       1, 0, 1, 0, 2, 0, 2, 1, 1, 2, 0, 2, 0, 1, 1, 2, 1, 0, 1, 2, 0, 0,
       1, 0, 2, 2, 1, 1, 1, 0, 1, 1, 2, 1, 0, 2, 1, 1, 2, 1, 2, 1, 1, 1,
       1, 2, 0, 0, 0, 2, 1, 0, 0, 1, 0, 2, 0, 1, 2, 1, 2, 1, 2, 1, 2, 0,
       1, 2, 2, 0, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 0,

In [13]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

accuracy = accuracy_score(y_test, y_pred)
c_matric = confusion_matrix(y_test, y_pred)
c_report = classification_report(y_test, y_pred)


print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", c_matric)
print(c_report)

Accuracy:  0.8945783132530121
Confusion Matrix: 
 [[ 91   8   2]
 [  1 116   5]
 [  1  18  90]]
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       101
           1       0.82      0.95      0.88       122
           2       0.93      0.83      0.87       109

    accuracy                           0.89       332
   macro avg       0.91      0.89      0.90       332
weighted avg       0.90      0.89      0.90       332



In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectoriser = TfidfVectorizer(max_features=1500)
vectors = tfidf_vectoriser.fit_transform(df['text']).toarray()
vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred


array([0, 1, 1, 0, 2, 0, 1, 2, 1, 1, 1, 2, 0, 0, 0, 1, 0, 2, 0, 2, 1, 1,
       2, 2, 1, 2, 2, 0, 0, 2, 2, 0, 1, 2, 2, 1, 2, 0, 2, 2, 1, 2, 0, 1,
       1, 1, 1, 0, 2, 1, 2, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 1, 2, 1, 2, 1,
       1, 1, 1, 0, 1, 0, 2, 1, 0, 2, 2, 2, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 2, 1, 2, 2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2,
       2, 0, 0, 1, 0, 0, 1, 1, 0, 2, 1, 1, 0, 0, 1, 0, 0, 1, 2, 0, 2, 1,
       0, 1, 0, 1, 0, 2, 1, 0, 1, 1, 1, 0, 0, 0, 2, 1, 1, 2, 1, 1, 2, 1,
       1, 1, 2, 2, 1, 1, 1, 1, 2, 0, 0, 0, 2, 2, 2, 1, 0, 2, 2, 1, 2, 1,
       0, 1, 2, 2, 0, 2, 2, 2, 1, 1, 0, 0, 2, 2, 2, 1, 1, 0, 0, 0, 1, 0,
       0, 2, 0, 1, 1, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 0, 2, 1,
       1, 0, 1, 0, 2, 0, 2, 2, 1, 2, 0, 2, 0, 1, 1, 2, 1, 0, 1, 2, 0, 0,
       1, 0, 2, 2, 1, 1, 1, 0, 2, 1, 2, 1, 0, 2, 2, 1, 2, 1, 2, 1, 1, 0,
       1, 2, 0, 0, 0, 2, 1, 0, 0, 2, 0, 2, 0, 2, 2, 1, 2, 1, 2, 1, 2, 0,
       1, 2, 2, 0, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 0,

In [16]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

accuracy = accuracy_score(y_test, y_pred)
c_matric = confusion_matrix(y_test, y_pred)
c_report = classification_report(y_test, y_pred)


print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", c_matric)
print(c_report)

Accuracy:  0.9668674698795181
Confusion Matrix: 
 [[ 98   0   3]
 [  0 119   3]
 [  2   3 104]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       101
           1       0.98      0.98      0.98       122
           2       0.95      0.95      0.95       109

    accuracy                           0.97       332
   macro avg       0.97      0.97      0.97       332
weighted avg       0.97      0.97      0.97       332

