In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def twenty_newsgroup_to_csv():
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']

    targets = pd.DataFrame( newsgroups_train.target_names)
    targets.columns=['title']

    out = pd.merge(df, targets, left_on='target', right_index=True)
    out['date'] = pd.to_datetime('now')
    out.to_csv('20_newsgroup.csv')
    
twenty_newsgroup_to_csv()

In [3]:
data_set = pd.read_csv('20_newsgroup.csv')
data_set

Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2021-04-02 04:08:58.776291
1,17,I recently posted an article asking what kind ...,7,rec.autos,2021-04-02 04:08:58.776291
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2021-04-02 04:08:58.776291
3,56,an excellent automatic can be found in the sub...,7,rec.autos,2021-04-02 04:08:58.776291
4,64,: Ford and his automobile. I need information...,7,rec.autos,2021-04-02 04:08:58.776291
...,...,...,...,...,...
11309,11210,Secrecy in Clipper Chip\n\nThe serial number o...,11,sci.crypt,2021-04-02 04:08:58.776291
11310,11217,Hi !\n\nI am interested in the source of FEAL ...,11,sci.crypt,2021-04-02 04:08:58.776291
11311,11243,"The actual algorithm is classified, however, t...",11,sci.crypt,2021-04-02 04:08:58.776291
11312,11254,\n\tThis appears to be generic calling upon th...,11,sci.crypt,2021-04-02 04:08:58.776291


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(stop_words='english')

X = data_set.iloc[:,1]
y = data_set.iloc[:,2]

X = count_vect.fit_transform(X.values.astype('U'))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

In [5]:
X_train

<8485x101322 sparse matrix of type '<class 'numpy.int64'>'
	with 565755 stored elements in Compressed Sparse Row format>

In [6]:
y_train

4630      2
10943    11
9313     15
10283    18
8548      9
         ..
4859      8
3264     13
9845     17
10799    11
2732     16
Name: target, Length: 8485, dtype: int64

In [7]:
from sklearn.naive_bayes import MultinomialNB
mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train, y_train)

MultinomialNB()

In [8]:
y_predict = mnb_classifier.predict(X_test)

In [9]:
mnb_classifier.score(X_test, y_test)

0.656062212796041

In [11]:
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

0.656062212796041
              precision    recall  f1-score   support

           0       0.81      0.40      0.54       124
           1       0.52      0.61      0.56       139
           2       1.00      0.04      0.08       148
           3       0.60      0.67      0.64       156
           4       0.73      0.68      0.70       120
           5       0.50      0.89      0.64       148
           6       0.87      0.67      0.76       152
           7       0.60      0.66      0.63       157
           8       0.71      0.60      0.65       136
           9       0.94      0.77      0.85       153
          10       0.92      0.83      0.87       155
          11       0.50      0.81      0.62       142
          12       0.89      0.44      0.59       154
          13       0.84      0.86      0.85       145
          14       0.71      0.74      0.72       140
          15       0.57      0.88      0.69       167
          16       0.80      0.66      0.72       145
         

In [13]:
from sklearn.naive_bayes import ComplementNB

cnb_classifier = ComplementNB()
cnb_classifier.fit(X_train, y_train)
y_predict = cnb_classifier.predict(X_test)
cnb_classifier.score(X_test, y_test)

0.7182750088370449

In [14]:
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

0.7182750088370449
              precision    recall  f1-score   support

           0       0.62      0.56      0.59       124
           1       0.67      0.68      0.67       139
           2       0.81      0.20      0.32       148
           3       0.61      0.68      0.64       156
           4       0.75      0.77      0.76       120
           5       0.54      0.91      0.68       148
           6       0.80      0.62      0.70       152
           7       0.60      0.73      0.66       157
           8       0.87      0.76      0.81       136
           9       0.94      0.84      0.89       153
          10       0.86      0.90      0.88       155
          11       0.74      0.85      0.79       142
          12       0.79      0.56      0.65       154
          13       0.83      0.88      0.86       145
          14       0.78      0.84      0.81       140
          15       0.63      0.87      0.73       167
          16       0.78      0.72      0.75       145
        

In [17]:
from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train.toarray(), y_train)
y_predict = gnb_classifier.predict(X_test.toarray())
gnb_classifier.score(X_test.toarray(), y_test)

0.6189466242488512

In [18]:
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

0.6189466242488512
              precision    recall  f1-score   support

           0       0.59      0.46      0.52       124
           1       0.50      0.58      0.54       139
           2       0.80      0.24      0.36       148
           3       0.53      0.50      0.51       156
           4       0.33      0.78      0.47       120
           5       0.67      0.75      0.71       148
           6       0.66      0.47      0.55       152
           7       0.64      0.56      0.60       157
           8       0.66      0.67      0.67       136
           9       0.83      0.80      0.82       153
          10       0.90      0.81      0.85       155
          11       0.66      0.73      0.69       142
          12       0.62      0.52      0.56       154
          13       0.80      0.79      0.79       145
          14       0.62      0.62      0.62       140
          15       0.65      0.70      0.68       167
          16       0.68      0.62      0.65       145
        

In [12]:
from sklearn.naive_bayes import BernoulliNB

bnb_classifier = ComplementNB()
bnb_classifier.fit(X_train, y_train)
y_predict = bnb_classifier.predict(X_test)
bnb_classifier.score(X_test, y_test)

NameError: name 'ComplementNB' is not defined

In [13]:
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

NameError: name 'accuracy_score' is not defined

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knc_classifier = KNeighborsClassifier(n_neighbors=5)
knc_classifier.fit(X_train, y_train)
y_predict = knc_classifier.predict(X_test)
knc_classifier.score(X_test, y_test)

0.20855425945563805

In [11]:
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

NameError: name 'accuracy_score' is not defined