In [12]:
import pandas as pd
import numpy as np
import os
import nltk
import matplotlib.pyplot as plt
# from nltk.tokenize import word_tokenize
from sklearn.datasets import load_files
import re
import seaborn as sns

# Import data

In [13]:
train_path = r"20news-bydate-train"
test_path = r"20news-bydate-test"

In [14]:
train = load_files(train_path, encoding='latin1')

In [15]:
test = load_files(test_path, encoding='latin1')

# Preprocessing

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from string import punctuation

In [17]:
# nltk.download('stopwords')

## Stopwords

In [18]:
stopwords = set(line.strip() for line in open(r"nltk_data/corpora/stopwords/english",encoding="utf8"))
new_stopwords = {'subject:','from:', 'date:', 'newsgroups:', 'message-id:', 'lines:', 'path:', 'organization:', 
            'would', 'writes:', 'references:', 'article', 'sender:', 'nntp-posting-host:', 'people', 
            'university', 'think', 'xref:', 'cantaloupe.srv.cs.cmu.edu', 'could', 'distribution:', 'first', 
            'anyone','world', 'really', 'since', 'right', 'believe', 'still'}

In [19]:
stopwords = ENGLISH_STOP_WORDS.union(stopwords, new_stopwords, punctuation)
print(stopwords)

frozenset({'_', 'along', 'one', 'this', "couldn't", 'already', 'us', 'thereafter', 'll', 'herein', 'behind', 'meanwhile', "hasn't", 'amongst', 'hence', 'down', '!', 'cant', 'move', 'who', 'toward', '~', 'isn', "needn't", 'sixty', 'see', "doesn't", 'seems', 'either', 'below', 'would', 'amount', 'from:', "it's", "you're", 'hundred', 'doesn', 'detail', 'under', 'somewhere', 'most', 'once', 'world', 'anyone', 'mill', 'empty', 'around', 'were', 'seem', "should've", 'they', 'might', 'again', 'every', 'by', 'article', "that'll", 'date:', 'himself', 'yourselves', 'against', 'such', 'back', '%', 'message-id:', 'side', 'please', 'should', '"', '(', '*', '^', 'during', 'everything', 'could', 'find', 'whole', 'elsewhere', "hadn't", 'anyhow', 'moreover', '}', 'un', "won't", ')', 'three', 'hereafter', 'does', 'our', 'was', 'fire', 'few', 'otherwise', 'an', '-', 'next', 'became', 'whom', 'front', 'done', 'no', 'them', 'noone', 'why', 'first', 'six', 'weren', 'thereupon', 'fill', 'so', ';', 'latterly'

## TF-IDF

In [140]:
vectorizer = TfidfVectorizer(lowercase=True, sublinear_tf=True, min_df=5, stop_words=stopwords, token_pattern=r'\b[^\d\W]+\b', ngram_range=(1,3))

In [141]:
vectors = vectorizer.fit_transform(train.data)

In [142]:
vectors_test = vectorizer.transform(test.data)

# Classification Models

In [143]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix

## Multinomial NB Model

In [144]:
MNB = MultinomialNB(alpha=0.01)
MNB.fit(vectors, train.target)
pred_MNB = MNB.predict(vectors_test)

In [145]:
print(classification_report(pred_MNB,test.target, target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.82      0.84      0.83       311
           comp.graphics       0.71      0.61      0.66       451
 comp.os.ms-windows.misc       0.58      0.75      0.65       303
comp.sys.ibm.pc.hardware       0.68      0.57      0.62       470
   comp.sys.mac.hardware       0.76      0.76      0.76       384
          comp.windows.x       0.76      0.76      0.76       397
            misc.forsale       0.80      0.68      0.73       460
               rec.autos       0.81      0.80      0.80       405
         rec.motorcycles       0.88      0.92      0.90       382
      rec.sport.baseball       0.89      0.90      0.90       394
        rec.sport.hockey       0.93      0.92      0.93       406
               sci.crypt       0.89      0.90      0.89       391
         sci.electronics       0.68      0.74      0.71       359
                 sci.med       0.70      0.86      0.77       326
         

## SGD Classifier Model

In [146]:
SGD = SGDClassifier(max_iter = 10000)
SGD.fit(vectors, train.target)
pred_SGD = SGD.predict(vectors_test)

In [147]:
print(classification_report(pred_SGD,test.target, target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.77      0.87      0.82       284
           comp.graphics       0.79      0.74      0.76       415
 comp.os.ms-windows.misc       0.76      0.80      0.78       376
comp.sys.ibm.pc.hardware       0.73      0.75      0.74       383
   comp.sys.mac.hardware       0.88      0.83      0.85       405
          comp.windows.x       0.77      0.87      0.82       350
            misc.forsale       0.92      0.84      0.88       426
               rec.autos       0.90      0.93      0.91       383
         rec.motorcycles       0.96      0.95      0.96       406
      rec.sport.baseball       0.96      0.91      0.93       418
        rec.sport.hockey       0.98      0.95      0.97       415
               sci.crypt       0.94      0.93      0.94       401
         sci.electronics       0.80      0.82      0.81       386
                 sci.med       0.88      0.91      0.89       381
         

## LinearSVC

In [148]:
SVC = LinearSVC(C=10)
SVC.fit(vectors, train.target)
pred_SVC = SVC.predict(vectors_test)

In [149]:
print(classification_report(pred_SVC,test.target, target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.79      0.87      0.83       291
           comp.graphics       0.80      0.74      0.77       421
 comp.os.ms-windows.misc       0.74      0.80      0.77       368
comp.sys.ibm.pc.hardware       0.72      0.70      0.71       403
   comp.sys.mac.hardware       0.85      0.80      0.82       408
          comp.windows.x       0.77      0.86      0.81       354
            misc.forsale       0.92      0.81      0.86       440
               rec.autos       0.89      0.91      0.90       386
         rec.motorcycles       0.94      0.95      0.95       395
      rec.sport.baseball       0.94      0.92      0.93       407
        rec.sport.hockey       0.97      0.96      0.96       406
               sci.crypt       0.93      0.93      0.93       396
         sci.electronics       0.79      0.78      0.79       399
                 sci.med       0.85      0.88      0.87       383
         

## K Neighbors Classifier Model

In [150]:
KNN = KNeighborsClassifier(n_neighbors=200)
KNN.fit(vectors, train.target)
pred_KNN = KNN.predict(vectors_test)

In [151]:
print(classification_report(pred_KNN,test.target, target_names=test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.71      0.57      0.63       400
           comp.graphics       0.70      0.60      0.65       456
 comp.os.ms-windows.misc       0.69      0.73      0.71       369
comp.sys.ibm.pc.hardware       0.66      0.59      0.63       438
   comp.sys.mac.hardware       0.68      0.76      0.72       347
          comp.windows.x       0.75      0.73      0.74       404
            misc.forsale       0.80      0.78      0.79       398
               rec.autos       0.76      0.77      0.76       390
         rec.motorcycles       0.79      0.88      0.83       358
      rec.sport.baseball       0.83      0.92      0.87       357
        rec.sport.hockey       0.94      0.74      0.83       506
               sci.crypt       0.86      0.83      0.84       408
         sci.electronics       0.43      0.73      0.54       233
                 sci.med       0.60      0.86      0.70       277
         