In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.datasets import load_files
import re
import nltk
from string import punctuation 
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [2]:
train_path = r"C:\Users\htc\Desktop\NLP\BTL-NLP\20news-bydate-train"
test_path = r"C:\Users\htc\Desktop\NLP\BTL-NLP\20news-bydate-test"

# Import Data


In [3]:
folders = sorted(os.listdir(train_path))
folders

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
#Import train data
train_data = {}
for folder in folders: 
    train_data[folder] = []
    for doc in os.listdir(os.path.join(train_path, folder)):
        with open(os.path.join(train_path, folder, doc), encoding = 'latin-1') as opened_file:
            train_data[folder].append(opened_file.read())

In [5]:
#Import test data
test_data = {}
for folder in folders: 
    test_data[folder] = []
    for doc in os.listdir(os.path.join(test_path, folder)):
        with open(os.path.join(test_path, folder, doc), encoding = 'latin-1') as opened_file:
            test_data[folder].append(opened_file.read())


# Data Preprocessing


In [6]:

punctuations=list(punctuation)
stopwords = list(line.strip() for line in open(r"C:\Users\htc\Desktop\NLP\BTL-NLP\nltk_data\corpora\stopwords\english",encoding="utf8"))

for item in punctuations:
    stopwords.append(item)

In [7]:
new_stopwords = ['subject:','from:', 'date:', 'newsgroups:', 'message-id:', 'lines:', 'path:', 'organization:', 
            'would', 'writes:', 'references:', 'article', 'sender:', 'nntp-posting-host:', 'people', 
            'university', 'think', 'xref:', 'cantaloupe.srv.cs.cmu.edu', 'could', 'distribution:', 'first', 
            'anyone','world', 'really', 'since', 'right', 'believe', 'still']
stopwords.append(new_stopwords)

In [8]:
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
Y_train = []
for i in range(len(train_data)):
    for doc in train_data[folders[i]]:
        Y_train.append(folders[i])

Y_train = np.array(Y_train)

In [10]:
Y_test = []
for i in range(len(test_data)):
    for doc in test_data[folders[i]]:
        Y_test.append(folders[i])

Y_test = np.array(Y_test)

In [11]:
X_train = []
for folder in train_data:
    for doc in train_data[folder]:
        tokenized_doc = word_tokenize(doc)
        newdoc = []
        for word in tokenized_doc:
             if len(word) < 21 and len(word) > 1:
                if any(c.isdigit() for c in word): #or c == "_"
                    newdoc.append('0')
                    continue
                if word.lower() not in stopwords:
                    newword = re.sub(r'[^A-Za-z]+', '', word.lower())
                    newdoc.append(newword.lower())
        X_train.append(newdoc)

In [12]:
X_test = []
for folder in test_data:
    for doc in test_data[folder]:
        tokenized_doc = word_tokenize(doc)
        newdoc = []
        for word in tokenized_doc:
             if len(word) < 21 and len(word) > 1:
                if any(c.isdigit() for c in word): #or c == "_" 
                    newdoc.append('0')
                    continue
                if word.lower() not in stopwords:
                    newword = re.sub(r'[^A-Za-z]+', '', word.lower())
                    newdoc.append(newword.lower())
        X_test.append(newdoc)

In [13]:
X_train_combine = []
for doc in X_train:
    X_train_combine.append(' '.join(doc))

In [14]:
X_test_combine = []
for doc in X_test:
    X_test_combine.append(' '.join(doc))

In [15]:
X_train = X_train_combine
X_test = X_test_combine

# Classification Models

In [16]:
import scikitplot as skplt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

In [17]:
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1, 3))
# Caculate accuracy of model
def acc_score(model):
    return np.mean(cross_val_score(model,X_train,Y_train,scoring="accuracy"))

In [18]:
MNBparams = {
    'nbclf__alpha': (1.0,2.0,3.0,4.0,10.0,20.0)
}
KNNparams = {
       'clf__n_neighbors': (5, 10, 100),
       'clf__weights': ('uniform', 'distance')
}

## Multinomial NB Mode

In [44]:
# Too long
#MNB_Pipeline = Pipeline([
#    ('cvec',vectorizer),
#    ('nbclf', MultinomialNB())])
#MNB=GridSearchCV(MNB_Pipeline, MNBparams)
#MNB.fit(X_train, Y_train)

MNB = Pipeline([
   ('cvec',vectorizer),
    ('nbclf', MultinomialNB(alpha = 0.01))
])
MNB.fit(X_train, Y_train)

print("Accuracy: " + str(acc_score(MNB)))

Accuracy: 0.8745805451399186


In [45]:
Y_pred = MNB.predict(X_test)
print(classification_report(Y_pred,Y_test))

                          precision    recall  f1-score   support

             alt.atheism       0.86      0.84      0.85       326
           comp.graphics       0.73      0.64      0.68       447
 comp.os.ms-windows.misc       0.33      0.90      0.48       144
comp.sys.ibm.pc.hardware       0.76      0.56      0.64       528
   comp.sys.mac.hardware       0.81      0.71      0.76       439
          comp.windows.x       0.76      0.84      0.80       358
            misc.forsale       0.86      0.76      0.81       440
               rec.autos       0.88      0.87      0.88       403
         rec.motorcycles       0.95      0.92      0.94       414
      rec.sport.baseball       0.92      0.93      0.93       396
        rec.sport.hockey       0.95      0.95      0.95       402
               sci.crypt       0.91      0.89      0.90       407
         sci.electronics       0.79      0.77      0.78       401
                 sci.med       0.79      0.88      0.83       357
         

## SGD Classifier Model

In [47]:
SGDC = Pipeline([
    ('cvec',vectorizer),
    ('SGD', SGDClassifier(max_iter = 10000,alpha= 0.005))
])
SGDC.fit(X_train, Y_train)

print("Accuracy: " + str(acc_score(SGDC)))

Accuracy: 0.882356933297857


In [48]:
Y_pred = SGDC.predict(X_test)
print(classification_report(Y_pred,Y_test))

                          precision    recall  f1-score   support

             alt.atheism       0.75      0.77      0.76       308
           comp.graphics       0.77      0.69      0.72       436
 comp.os.ms-windows.misc       0.74      0.75      0.74       386
comp.sys.ibm.pc.hardware       0.74      0.70      0.72       417
   comp.sys.mac.hardware       0.82      0.81      0.82       388
          comp.windows.x       0.73      0.84      0.78       342
            misc.forsale       0.91      0.79      0.84       453
               rec.autos       0.87      0.92      0.90       374
         rec.motorcycles       0.96      0.95      0.95       401
      rec.sport.baseball       0.94      0.89      0.91       421
        rec.sport.hockey       0.95      0.96      0.96       397
               sci.crypt       0.92      0.90      0.91       402
         sci.electronics       0.74      0.79      0.76       366
                 sci.med       0.85      0.87      0.86       383
         

## LinearSVC


In [None]:
SVC = Pipeline([
    ('vect', vectorizer),
    ('SGDclf', LinearSVC(C=10))
])
SVC.fit(X_train, Y_train)
print("Accuracy: " + str(acc_score(SVC)))

In [None]:
Y_pred = SVC.predict(X_test)
print(classification_report(Y_pred,Y_test))

## K Neighbors Classifier Model

In [36]:
KNN_Pipeline = Pipeline([
        ('vect', vectorizer), 
        ('clf', KNeighborsClassifier(n_neighbors=100))
])
KNN=GridSearchCV(KNN_Pipeline, KNNparams)
KNN.fit(X_train, Y_train)

print("Accuracy: " + str(acc_score(KNN)))

KeyboardInterrupt: 

In [None]:
Y_pred = KNN.predict(X_test)
print(classification_report(Y_pred,Y_test))