In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from wordcloud import WordCloud
import numpy as np
import re

In [None]:
# Newsgroup topics of interest; later cells pick entries from this list by index.
categories = [
    "comp.graphics",
    "rec.sport.baseball",
    "sci.electronics",
    "talk.politics.guns",
    "talk.religion.misc",
]

In [None]:
# Restrict the download to a single topic. scikit-learn documents `categories`
# as an array-like of names, so pass a list rather than a set literal.
selected_categories = [categories[1]]  # "rec.sport.baseball"

newsgroups = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,  # fixed seed so the shuffled order is reproducible
    categories=selected_categories,
    remove=('headers', 'footers', 'quotes'),  # keep only the post bodies
)

# Store the required data for later use:
#   X - the post texts, y - the target labels, z - the target (category) names
X, y, z = newsgroups.data, newsgroups.target, newsgroups.target_names

In [None]:
# Creating the corpus
#
# Every post in X is cleaned with regular expressions, tokenized once, then
# each token is stemmed, lemmatized and dropped if it is too short.

# Cleaning patterns, compiled once and applied in this order by _clean_text.
_NON_WORD = re.compile(r'\W')
_DIGITS = re.compile(r'[0-9]+')
# Whole-word matches only, so longer words are never cut apart:
#   br      - leftover HTML line-break tag
#   wouldn? - "would", plus the "wouldn" that remains of "wouldn't" once the
#             apostrophe has been replaced by a space
#   [a-z]   - stray single letters (e.g. the "t" of "don't")
_NOISE_WORD = re.compile(r'\b(?:br|wouldn?|[a-z])\b')
_WHITESPACE = re.compile(r'\s+')

# Tokens shorter than this are discarded (i.e. words of three letters or fewer).
MIN_TOKEN_LENGTH = 4

# Initiating the stemmer and lemmatizer classes
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def _clean_text(raw):
    """Return ``raw`` lower-cased with punctuation, digits and noise words removed.

    Digits are stripped before the noise-word pass so that single letters
    exposed by digit removal (e.g. "a1" -> "a") are removed as well.
    """
    text = _NON_WORD.sub(' ', str(raw)).lower()
    text = _DIGITS.sub('', text)
    text = _NOISE_WORD.sub(' ', text)
    return _WHITESPACE.sub(' ', text).strip()


def _normalise_tokens(text):
    """Tokenize ``text`` and return the stemmed, lemmatized, length-filtered
    tokens joined by single spaces."""
    tokens = nltk.word_tokenize(text)
    # NOTE(review): stemming runs before lemmatization, as in the original
    # pipeline. The lemmatizer therefore mostly sees stems rather than
    # dictionary words - confirm that this order is intended.
    tokens = (lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens)
    return ' '.join(token for token in tokens if len(token) >= MIN_TOKEN_LENGTH)


# One pre-processed string per post, in the same order as X.
corpus = [_normalise_tokens(_clean_text(document)) for document in X]


In [None]:
# Creating WordClouds

# Merge all documents into one text. Joining with a space keeps the last word
# of one document from being fused with the first word of the next one.
text_doc = ' '.join(corpus)

wc = WordCloud(
    background_color="Black",
    max_words=2000,
    width=1024,
    height=720,
    # WordCloud documents `stopwords` as a set of strings.
    stopwords=set(stopwords.words("english")),
)

wc.generate_from_text(text_doc)

# Save the cloud to a file
wc.to_file("word_sports.png")

In [None]:

# Visualisation plots: grouped bar chart comparing the accuracy of the filter
# and hybrid models for each classifier.

import numpy as np
import matplotlib.pyplot as plt

# data to plot (hard-coded results, one value per classifier)
classifier_names = ('Naive Bayes', 'MLP')
accuracy_filter_model = (82, 87)
accuracy_hybrid_model = (81, 83)
n_groups = len(classifier_names)

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.2
opacity = 0.7

rects1 = ax.bar(index, accuracy_filter_model, bar_width,
                alpha=opacity,
                color='b',
                label='Filter Model (Tf-idf)')

rects2 = ax.bar(index + bar_width, accuracy_hybrid_model, bar_width,
                alpha=opacity,
                color='g',
                label='Hybrid Model (BPSO)')

# The bars are vertical: classifiers run along x, accuracy along y.
ax.set_xlabel('Classifiers')
ax.set_ylabel('Accuracy')
#ax.set_title('Performance of filter and hybrid models for topic classification')

# Bars are centred on `index` and `index + bar_width`, so the middle of each
# pair of bars is half a bar width to the right of `index`.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(classifier_names)

# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

fig.tight_layout()
plt.show()