In [10]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def twenty_newsgroup_to_csv():
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']

    targets = pd.DataFrame( newsgroups_train.target_names)
    targets.columns=['title']

    out = pd.merge(df, targets, left_on='target', right_index=True)
    out['date'] = pd.to_datetime('now')
    out.to_csv('20_newsgroup.csv')
    
twenty_newsgroup_to_csv()

In [11]:
data_set = pd.read_csv('20_newsgroup.csv')
data_set

Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2021-04-05 21:34:14.690721
1,17,I recently posted an article asking what kind ...,7,rec.autos,2021-04-05 21:34:14.690721
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2021-04-05 21:34:14.690721
3,56,an excellent automatic can be found in the sub...,7,rec.autos,2021-04-05 21:34:14.690721
4,64,: Ford and his automobile. I need information...,7,rec.autos,2021-04-05 21:34:14.690721
...,...,...,...,...,...
11309,11210,Secrecy in Clipper Chip\n\nThe serial number o...,11,sci.crypt,2021-04-05 21:34:14.690721
11310,11217,Hi !\n\nI am interested in the source of FEAL ...,11,sci.crypt,2021-04-05 21:34:14.690721
11311,11243,"The actual algorithm is classified, however, t...",11,sci.crypt,2021-04-05 21:34:14.690721
11312,11254,\n\tThis appears to be generic calling upon th...,11,sci.crypt,2021-04-05 21:34:14.690721


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(stop_words='english')

X = data_set.iloc[:,1]
y = data_set.iloc[:,3]

X = count_vect.fit_transform(X.values.astype('U'))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

In [33]:
X_train

<8485x101322 sparse matrix of type '<class 'numpy.int64'>'
	with 565755 stored elements in Compressed Sparse Row format>

In [34]:
y_train

4630     comp.os.ms-windows.misc
10943                  sci.crypt
9313      soc.religion.christian
10283         talk.politics.misc
8548          rec.sport.baseball
                  ...           
4859             rec.motorcycles
3264                     sci.med
9845       talk.politics.mideast
10799                  sci.crypt
2732          talk.politics.guns
Name: title, Length: 8485, dtype: object

In [60]:
from sklearn.naive_bayes import MultinomialNB
mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train, y_train)

MultinomialNB()

In [61]:
y_predict = mnb_classifier.predict(X_test)

In [62]:
mnb_classifier.score(X_test, y_test)

0.656062212796041

In [63]:
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

0.656062212796041
                          precision    recall  f1-score   support

             alt.atheism       0.81      0.40      0.54       124
           comp.graphics       0.52      0.61      0.56       139
 comp.os.ms-windows.misc       1.00      0.04      0.08       148
comp.sys.ibm.pc.hardware       0.60      0.67      0.64       156
   comp.sys.mac.hardware       0.73      0.68      0.70       120
          comp.windows.x       0.50      0.89      0.64       148
            misc.forsale       0.87      0.67      0.76       152
               rec.autos       0.60      0.66      0.63       157
         rec.motorcycles       0.71      0.60      0.65       136
      rec.sport.baseball       0.94      0.77      0.85       153
        rec.sport.hockey       0.92      0.83      0.87       155
               sci.crypt       0.50      0.81      0.62       142
         sci.electronics       0.89      0.44      0.59       154
                 sci.med       0.84      0.86      0.85  

In [49]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(mnb_classifier, X, y, cv=10)
scores.mean()

0.6810166743211704

In [50]:
scores.std()

0.011660238501585268

In [19]:
from sklearn.naive_bayes import ComplementNB

cnb_classifier = ComplementNB()
cnb_classifier.fit(X_train, y_train)
y_predict = cnb_classifier.predict(X_test)
cnb_classifier.score(X_test, y_test)

0.7182750088370449

In [20]:
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

0.7182750088370449
              precision    recall  f1-score   support

           0       0.62      0.56      0.59       124
           1       0.67      0.68      0.67       139
           2       0.81      0.20      0.32       148
           3       0.61      0.68      0.64       156
           4       0.75      0.77      0.76       120
           5       0.54      0.91      0.68       148
           6       0.80      0.62      0.70       152
           7       0.60      0.73      0.66       157
           8       0.87      0.76      0.81       136
           9       0.94      0.84      0.89       153
          10       0.86      0.90      0.88       155
          11       0.74      0.85      0.79       142
          12       0.79      0.56      0.65       154
          13       0.83      0.88      0.86       145
          14       0.78      0.84      0.81       140
          15       0.63      0.87      0.73       167
          16       0.78      0.72      0.75       145
        

In [21]:
from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train.toarray(), y_train)
y_predict = gnb_classifier.predict(X_test.toarray())
gnb_classifier.score(X_test.toarray(), y_test)

0.6189466242488512

In [55]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(gnb_classifier, X.toarray(), y, cv=10)
scores.mean()

0.6319615368993949

In [56]:
scores.std()

0.015140335878844467

In [22]:
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

0.6189466242488512
              precision    recall  f1-score   support

           0       0.59      0.46      0.52       124
           1       0.50      0.58      0.54       139
           2       0.80      0.24      0.36       148
           3       0.53      0.50      0.51       156
           4       0.33      0.78      0.47       120
           5       0.67      0.75      0.71       148
           6       0.66      0.47      0.55       152
           7       0.64      0.56      0.60       157
           8       0.66      0.67      0.67       136
           9       0.83      0.80      0.82       153
          10       0.90      0.81      0.85       155
          11       0.66      0.73      0.69       142
          12       0.62      0.52      0.56       154
          13       0.80      0.79      0.79       145
          14       0.62      0.62      0.62       140
          15       0.65      0.70      0.68       167
          16       0.68      0.62      0.65       145
        

In [65]:
from sklearn.naive_bayes import BernoulliNB

bnb_classifier = BernoulliNB()
bnb_classifier.fit(X_train, y_train)
y_predict = bnb_classifier.predict(X_test)
bnb_classifier.score(X_test, y_test)

0.35489572287027216

In [24]:
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

0.7182750088370449
              precision    recall  f1-score   support

           0       0.62      0.56      0.59       124
           1       0.67      0.68      0.67       139
           2       0.81      0.20      0.32       148
           3       0.61      0.68      0.64       156
           4       0.75      0.77      0.76       120
           5       0.54      0.91      0.68       148
           6       0.80      0.62      0.70       152
           7       0.60      0.73      0.66       157
           8       0.87      0.76      0.81       136
           9       0.94      0.84      0.89       153
          10       0.86      0.90      0.88       155
          11       0.74      0.85      0.79       142
          12       0.79      0.56      0.65       154
          13       0.83      0.88      0.86       145
          14       0.78      0.84      0.81       140
          15       0.63      0.87      0.73       167
          16       0.78      0.72      0.75       145
        

In [25]:
from sklearn.neighbors import KNeighborsClassifier

knc_classifier = KNeighborsClassifier(n_neighbors=5)
knc_classifier.fit(X_train, y_train)
y_predict = knc_classifier.predict(X_test)
knc_classifier.score(X_test, y_test)

0.20855425945563805

In [26]:
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

0.20855425945563805
              precision    recall  f1-score   support

           0       0.09      0.44      0.15       124
           1       0.13      0.25      0.17       139
           2       0.17      0.40      0.24       148
           3       0.16      0.17      0.17       156
           4       0.09      0.23      0.13       120
           5       0.33      0.22      0.27       148
           6       0.19      0.13      0.16       152
           7       0.13      0.18      0.15       157
           8       0.18      0.13      0.15       136
           9       0.29      0.29      0.29       153
          10       0.51      0.21      0.30       155
          11       0.76      0.27      0.40       142
          12       0.55      0.12      0.19       154
          13       0.55      0.24      0.33       145
          14       0.61      0.08      0.14       140
          15       0.58      0.17      0.26       167
          16       0.49      0.15      0.23       145
       