In [267]:
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from gensim.models import Doc2Vec, Word2Vec, KeyedVectors
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [268]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [269]:
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
import numpy as np
import pandas as pd

In [270]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [271]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [272]:
# All Categories:

# alt.atheism
# talk.religion.misc
# comp.graphics
# comp.os.ms-windows.misc
# comp.sys.ibm.pc.hardware
# comp.sys.mac.hardware
# comp.windows.x
# rec.autos
# rec.motorcycles
# rec.sport.baseball
# rec.sport.hockey
# sci.crypt
# sci.electronics
# sci.med
# sci.space
# misc.forsale
# talk.politics.misc
# talk.politics.guns
# talk.politics.mideast
# talk.politics.guns

categories = [
    'talk.politics.misc',
    'comp.graphics',
]

In [273]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['talk.politics.misc', 'comp.graphics']


In [274]:
train_data = fetch_20newsgroups(subset='train', categories=categories)
print("Training Data: \n")
print(f"{len(train_data.filenames)} documents")
print(f"{len(train_data.target_names)} categories")

test_data = fetch_20newsgroups(subset='test', categories=categories)
print("\nTesting Data: \n")
print(f"{len(test_data.filenames)} documents")
print(f"{len(test_data.target_names)} categories")

Training Data: 

1049 documents
2 categories

Testing Data: 

699 documents
2 categories


Data Cleaning

In [275]:
stop_words = set(stopwords.words('english'))

In [276]:
import re

In [277]:
for i in range(len(train_data.data)):
    # Converting all words in lower case.
    train_data.data[i] = train_data.data[i].lower()

    # \W indicates non-word characters. This line replaces these characters with blank.
    train_data.data[i] = re.sub(r'\W',' ',train_data.data[i]) 
    
    # \s+ indicates more than 1 white space. This line replaces those spaces with single space.
    train_data.data[i] = re.sub(r'\s+',' ',train_data.data[i])
    

for i in range(len(test_data.data)):
    test_data.data[i] = test_data.data[i].lower()
    test_data.data[i] = re.sub(r'\W',' ',test_data.data[i]) 
    # \W indicates non-word characters. This line replaces these characters with blank.
    test_data.data[i] = re.sub(r'\s+',' ',test_data.data[i])
    # \s+ indicates more than 1 white space. This line replaces those spaces with single space.
# train_data.data

In [278]:
# Removing the stop words

new_corpus_train = []
new_corpus_test = []

for sentences in train_data.data:
    # Splitting the sentences in words
    words = word_tokenize(sentences)

    filtered_sentence = []

    for w in words:
        if w not in stop_words:
            filtered_sentence.append(w)

        # Rejoining the words to form the sentences
        s = ' '.join(filtered_sentence)

    new_corpus_train.append(s)

for sentences in test_data.data:
    words = word_tokenize(sentences)

    filtered_sentence = []

    for w in words:
        if w not in stop_words:
            filtered_sentence.append(w)

        s = ' '.join(filtered_sentence)
        
    new_corpus_test.append(s)


In [279]:
train_data.data = new_corpus_train
test_data.data = new_corpus_test

> CountVectorizer (Feature Extraction)

In [280]:
vect = CountVectorizer()
X_train_cove = vect.fit_transform(train_data.data)
X_test_cove = vect.transform(test_data.data)

In [281]:
y_train_cove = train_data.target
y_test_cove = test_data.target

In [282]:
# Individual Model Training
# nb = MultinomialNB()
# model = nb.fit(X_train_cove,y_train_cove)
# y_predict_cove = model.predict(X_test_cove)
# print("Confusion Matrix: ", confusion_matrix(y_test_cove, y_predict_cove))
# print("Accuracy Score: ", accuracy_score(y_test_cove, y_predict_cove))

> Benchmarking The Accuracies

In [283]:
# Algorithms
# MultinomialNB, LogisticRegression, LinearSVC, DecisionTreeClassifier
model_names = [MultinomialNB(), LogisticRegression(), LinearSVC(), DecisionTreeClassifier()]

models = [MultinomialNB(), LogisticRegression(max_iter=1000), LinearSVC(dual=True), DecisionTreeClassifier()]
accuracies_cove = []

# Fit the models
for model in models:
    model.fit(X_train_cove,y_train_cove)
    y_predict_cove = model.predict(X_test_cove)
    acc = accuracy_score(y_test_cove, y_predict_cove)
    accuracies_cove.append(acc)
    print("Accuracy for", model_names[models.index(model)], " is ", acc)

Accuracy for MultinomialNB()  is  0.9814020028612304
Accuracy for LogisticRegression()  is  0.9513590844062947
Accuracy for LinearSVC()  is  0.949928469241774
Accuracy for DecisionTreeClassifier()  is  0.8841201716738197


> Doc2Vec (Feature Extraction)

In [284]:
# Converting training data into the format required for Doc2Vec

d2vtrain = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data.data)]
model_d2v = Doc2Vec(vector_size=50, alpha=0.025, min_count = 10, dm = 1, epochs = 100)
model_d2v.build_vocab(d2vtrain)
model_d2v.train(d2vtrain, total_examples=model_d2v.corpus_count, epochs = model_d2v.epochs)

# Converting testing data into the format required for Doc2Vec

d2vtest = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(test_data.data)]
model_d2v = Doc2Vec(vector_size=50, alpha=0.025, min_count = 10, dm = 1, epochs = 100)
model_d2v.build_vocab(d2vtest)

In [285]:
# Parameters for inter_vector should be a list of string not a single string.
# So, using .split() method

# Preparing vectors for training data
doc_vecs_train = [model_d2v.infer_vector(doc.words.split()) for doc in d2vtrain]

doc_vecs_test = [model_d2v.infer_vector(doc.words.split()) for doc in d2vtest]


In [286]:
# Check the shape of doc_vecs and train_data.target (Both should be same)
# len(doc_vecs)
# len(train_data.target)

In [287]:
X_train_d2v = np.array(doc_vecs_train)
y_train_d2v = train_data.target

X_test_d2v = np.array(doc_vecs_test)
y_test_d2v = test_data.target

In [288]:
# Individual Model
# model_doc2vec = MultinomialNB()
# model_doc2vec.fit(X_train_d2v, y_train_d2v)

> Benchmarking The Accuracies

In [289]:
# models = [MultinomialNB(), LogisticRegression(max_iter=1000), LinearSVC(dual=True), DecisionTreeClassifier()]

In [290]:
accuracies_d2v = []

# Doc2Vec contains negative values of input data which is not acceptible by MultinomialNB().

# Skipping MultinomialNB() for Doc2Vec
accuracies_d2v.append("NA")

# For other algorithms
model_names = [MultinomialNB(), LogisticRegression(), LinearSVC(), DecisionTreeClassifier()]

models = [LogisticRegression(max_iter=1000), LinearSVC(dual=True), DecisionTreeClassifier()]


# Fit the models
for model in models:
    model.fit(X_train_d2v,y_train_d2v)
    y_predict_d2v = model.predict(X_test_d2v)
    acc = accuracy_score(y_test_d2v, y_predict_d2v)
    accuracies_d2v.append(acc)
    print("Accuracy for", model_names[models.index(model)], " is ", acc)

Accuracy for MultinomialNB()  is  0.5565092989985694
Accuracy for LogisticRegression()  is  0.5565092989985694
Accuracy for LinearSVC()  is  0.4892703862660944


> Word2Vec (Feature Extraction)

In [291]:
# If I have the downloaded model in the bin, I can use the following line for loading the data
# w2v_model = KeyedVectors.load_word2vec_format(path_to_bin_file, binary=True)

# But, we are using fetch_20newsgroup so we can directly load that in Word2Vec
sentences = [doc.split() for doc in train_data.data]

model_w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0)


In [292]:
# Making the vector space
vector_dimension = model_w2v.vector_size
print("Dimension of word vectors:", vector_dimension)

Dimension of word vectors: 100


In [293]:
def embbeding_feats(list_of_lists):
    DIMENSION = 100
    zero_vector = np.zeros(DIMENSION)
    feats = []
    corpus = list_of_lists
    for doc in corpus:
        feats_for_this = np.zeros(DIMENSION)
        count_for_this = 0
        for token in doc:
            if token in model_w2v.wv:
                feats_for_this += model_w2v.wv[token]
                count_for_this += 1
        feats.append(feats_for_this / count_for_this)
    return feats

X_train_w2v = embbeding_feats(train_data.data)
X_test_w2v = embbeding_feats(test_data.data)

In [294]:
y_train_w2v = train_data.target
y_test_w2v = test_data.target

In [295]:
# Individual Algorithm
# ml_model_w2v = LogisticRegression()
# ml_model_w2v.fit(X_train_w2v, y_train_w2v)

> Benchmarking The Accuracies

In [296]:
# Algorithms

# Word2Vec contains negative values of input data which is not acceptible by MultinomialNB().

# Skipping MultinomialNB() for Word2Vec

# MultinomialNB, LogisticRegression, LinearSVC, DecisionTreeClassifier
model_names = [MultinomialNB(), LogisticRegression(), LinearSVC(), DecisionTreeClassifier()]

models = [LogisticRegression(max_iter=1000), LinearSVC(max_iter=10000, dual=True), DecisionTreeClassifier()]
accuracies_w2v = []

# For MultinomialNB()
accuracies_w2v.append("NA")

# Fit the models
for model in models:
    model.fit(X_train_w2v,y_train_w2v)
    y_predict_w2v = model.predict(X_test_w2v)
    acc = accuracy_score(y_test_w2v, y_predict_w2v)
    accuracies_w2v.append(acc)
    print("Accuracy for", model_names[models.index(model)], " is ", acc)

Accuracy for MultinomialNB()  is  0.6437768240343348
Accuracy for LogisticRegression()  is  0.6452074391988555
Accuracy for LinearSVC()  is  0.6151645207439199


> Final Data

In [297]:
model_names

[MultinomialNB(), LogisticRegression(), LinearSVC(), DecisionTreeClassifier()]

In [298]:
final_data = {'CountVectorizer': accuracies_cove, 'Doc2Vec': accuracies_d2v, 'Word2Vec': accuracies_w2v}
df = pd.DataFrame(final_data, index=model_names)
df

Unnamed: 0,CountVectorizer,Doc2Vec,Word2Vec
MultinomialNB(),0.981402,,
LogisticRegression(),0.951359,0.556509,0.643777
LinearSVC(),0.949928,0.556509,0.645207
DecisionTreeClassifier(),0.88412,0.48927,0.615165
