### Import Libraries
Loading all libraries to be used

In [None]:
import copy
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import KernelPCA 
from time import time
from numpy.linalg import eigh
sns.set()

### Data preparation
#### Load data
Let's load the 20 Newsgroups data with scikit-learn's `fetch_20newsgroups` (all subsets, with headers and quoted replies removed). We will store the message texts in _titles_ and the category labels in _categories_.

In [None]:
# Fetch every 20 Newsgroups post (train + test subsets) with headers and quoted
# replies removed, keeping the dataset's original order.
df = fetch_20newsgroups(subset='all', shuffle=False, remove=('headers', 'quotes'))

# Show the first message and its class name as a quick sanity check.
print("Message-\n", df.data[0])
# "\L" is not a valid escape sequence (it printed a literal backslash); a newline was intended.
print("\nLabel-\n", df.target_names[df.target[0]])

We can print and check the data loaded in _titles_ and _categories_

In [3]:
# Alias the message texts and their integer class labels (indices into
# df.target_names) under the names used by the rest of the notebook.
titles, categories = df.data, df.target

### Split data
Split the data into two parts - training and test. We will use the training data to train our models and, finally, use the test data to see how they perform.

In [None]:
# Hold out part of the data for testing (train_test_split's default test size).
# A fixed random_state keeps the split - and therefore every score reported
# below - reproducible across kernel restarts.
title_tr, title_te, category_tr, category_te = train_test_split(
    titles, categories, random_state=42
)
print("Training: ",len(title_tr))
print("Testing: ",len(title_te))

Using the `wordcloud` package we can visualize our data.

In [None]:
!pip install wordcloud

In [None]:
# Show the class names; a label's integer value is its index into this list.
df.target_names

In [None]:
# Imported here because the package is installed by an earlier cell of this notebook.
from wordcloud import WordCloud


def show_word_cloud(messages, heading):
    """Print `heading` and draw a word cloud built from `messages`.

    Parameters
    ----------
    messages : list of str
        Documents whose words are rendered; they are joined with single spaces.
    heading : str
        Text printed above the figure.
    """
    print(heading)
    if not messages:
        # WordCloud cannot be generated from empty text; report it instead of crashing.
        print("(no messages available for this class)")
        return
    text = " ".join(messages)
    cloud = WordCloud(
        background_color="white",
        # len(text) counts characters, so this is effectively "no word limit".
        max_words=len(text),
        max_font_size=40,
        relative_scaling=.5).generate(text)
    # A single figure per cloud (the extra plt.figure() only produced an empty figure).
    fig, ax = plt.subplots(figsize=(20, 12))
    ax.imshow(cloud)
    ax.axis("off")
    plt.show()


# Look the labels up by name instead of hard-coding their integer codes (13 and 16).
sci_med_label = df.target_names.index("sci.med")
politics_label = df.target_names.index("talk.politics.guns")

sci_med = [msg for msg, lab in zip(title_tr, category_tr) if lab == sci_med_label]
politics = [msg for msg, lab in zip(title_tr, category_tr) if lab == politics_label]

show_word_cloud(sci_med, "sci.med class word cloud\n")
show_word_cloud(politics, "\n\n\n\ntalk.politics.guns class word cloud\n")

### Data Preprocessing
#### Word lemmatization
Lemmatize the messages using the WordNet Lemmatizer

In [5]:
word_lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words("english"))


def _lemmatize_message(message):
    """Tokenize `message`, drop exact-match English stop words, lemmatize the rest.

    Returns the lemmas concatenated, each followed by a single space (so a
    non-empty result ends with a trailing space).
    """
    kept_lemmas = (
        word_lemmatizer.lemmatize(token) + " "
        for token in word_tokenize(message)
        if token not in stopwords_set
    )
    return "".join(kept_lemmas)


# Overwrite every message of both splits, in place, with its lemmatized form.
for split in (title_tr, title_te):
    for position, message in enumerate(split):
        split[position] = _lemmatize_message(message)

### Vectorization of data
Vectorize the data using a TF-IDF weighted bag of words (BOW)

In [6]:
# English stop-word list handed to the vectorizer.
stop_words = stopwords.words("english")

# Terms appearing in fewer than 0.1% of the training documents are discarded.
vectorizer = TfidfVectorizer(min_df=0.001, stop_words=stop_words)

# Learn vocabulary and IDF weights on the training split only, then reuse them for the test split.
Xtr = vectorizer.fit_transform(title_tr)
Xte = vectorizer.transform(title_te)

Let's look at what exactly this vectorizer is doing. We first create a reverse dictionary (feature index → word) from the vectorizer's vocabulary. Then we vectorize a sample sentence and iterate over its non-zero entries: only the words that are in the vocabulary and are not stop words are kept, and their order changes because a bag-of-words representation does not preserve word order.

In [7]:
# Invert the fitted vocabulary (term -> column index) into column index -> term.
vocabulary = vectorizer.vocabulary_
reverse_vocabulary = {column: term for term, column in vocabulary.items()}

# Vectorize one sample sentence and print the vocabulary terms it activates.
sample_sentence = "This season so far, Morgan and Guzman helped to lead the Cubs at top in ERA, even better than THE rotation at Atlanta."
vector = vectorizer.transform(iter([sample_sentence]))
indexes = vector.indices
for column in indexes:
    print(reverse_vocabulary[column], end=" ")

top season rotation morgan lead helped far even era cubs better atlanta 

### Frequency Threshold
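We report the number of features and scale each one to unit variance (without centering, so the matrix stays sparse). Low-variance features could additionally be dropped based on a threshold, but this notebook keeps all of them.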

In [8]:
print("Number of features before reduction : ", Xtr.shape[1])

# Scale to unit variance without centering; statistics come from the training split only.
scaler = StandardScaler(with_mean=False).fit(Xtr)
scaled_data_Xtr = scaler.transform(Xtr)
scaled_data_Xte = scaler.transform(Xte)

Number of features before reduction :  9307


In [9]:
from sklearn import metrics


def purity_score(y, yhat):
    """Return the clustering purity of `yhat` against `y` and plot their contingency matrix.

    Purity is the fraction of samples that belong to the majority true class of
    their assigned cluster: sum over clusters of the largest per-class count,
    divided by the total number of samples.

    Parameters
    ----------
    y : array-like
        True class labels.
    yhat : array-like
        Predicted cluster ids, same length as `y`.

    Returns
    -------
    float
        Purity score.

    Side effects
    ------------
    Sets the seaborn figure size to 20x20 and draws an annotated heatmap of the
    contingency matrix (rows: true classes, columns: clusters).
    """
    c_mat = metrics.cluster.contingency_matrix(y, yhat)
    sns.set(rc={'figure.figsize':(20,20)})
    # Draw the heatmap directly: printing the returned Axes only emitted its repr.
    # fmt="d" shows the integer counts in full instead of scientific notation.
    sns.heatmap(c_mat, annot=True, fmt="d")
    # Column-wise maximum = size of the dominant true class inside each cluster.
    return np.sum(np.amax(c_mat, axis=0)) / np.sum(c_mat)

### Without Reduction
Results calculated without any feature reduction on the dataset

###### Random Forest

In [None]:
# Random forest on the full scaled TF-IDF features.
# The sparse matrices are passed as-is: RandomForestClassifier accepts sparse
# input, so densifying the full feature matrix with .toarray() is unnecessary.
rf = RandomForestClassifier()
start_time = time()
rf.fit(scaled_data_Xtr, category_tr)
print("Time taken to fit Random Forest classifier : ", time()-start_time)
pred = rf.predict(scaled_data_Xte)
print(classification_report(category_te, pred, target_names=df.target_names, digits=5))

###### SVC

In [None]:
# Support vector classifier on the full scaled TF-IDF features.
# SVC accepts sparse input, so the matrices are not densified with .toarray().
svc = SVC()
start_time = time()
svc.fit(scaled_data_Xtr, category_tr)
print("Time taken to fit SVC classifier : ", time()-start_time)
pred = svc.predict(scaled_data_Xte)
print(classification_report(category_te, pred, target_names=df.target_names, digits=5))

###### KMeans

In [None]:
# Cluster the full scaled TF-IDF features into 20 groups (one per newsgroup).
kmeans = KMeans(n_clusters=20, init='k-means++', max_iter=250)
start_time = time()
# KMeans is unsupervised: the class labels are not passed to fit, and the
# sparse matrix is used directly instead of a dense copy.
kmeans.fit(scaled_data_Xtr)
print("Time taken to fit KMeans classifier : ", time()-start_time)
pred = kmeans.predict(scaled_data_Xte)
# The true labels are used only to evaluate the clusters found on the test split.
print("Purity Score: ", purity_score(category_te, pred))
print("Homoegneity Score: ",metrics.homogeneity_score(category_te, pred))
print("Completeness Score: ",metrics.completeness_score(category_te, pred))

### PCA (Principal Component Analysis)

In [None]:
# Project the scaled features onto their first two principal components.
start_time = time()
# Densify the training matrix once and reuse it for fit and transform
# (it was previously converted twice).
dense_Xtr = scaled_data_Xtr.toarray()
pca = PCA(n_components=2)
pca.fit(dense_Xtr)
reduced_pca_Xtr = pca.transform(dense_Xtr)
del dense_Xtr  # release the large dense copy as soon as it is no longer needed
reduced_pca_Xte = pca.transform(scaled_data_Xte.toarray())
print("Time taken to perform dimensionality reduction using PCA" ,time() - start_time)
print("Number of features after reduction : ", reduced_pca_Xtr.shape[1])
print("Explained variance ratio: ", sum(pca.explained_variance_ratio_))

In [None]:
# Scree plot: variance captured by each retained principal component.
scree_fig = plt.figure(figsize=(5, 5))
plt.style.use("ggplot")
scree_ax = scree_fig.gca()
scree_ax.plot(pca.explained_variance_, marker='.')
scree_ax.set_xlabel("Eigenvalue number")
scree_ax.set_ylabel("Eigenvalue size")
scree_ax.set_title("Scree Plot")

###### Random Forest

In [None]:
# Train a random forest on the PCA-reduced features and report the wall-clock fit time.
rf = RandomForestClassifier()
start_time = time()
rf.fit(X=reduced_pca_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken to fit Random Forest classifier : ", elapsed)

In [None]:
# Score the PCA-trained random forest on the held-out split.
pred = rf.predict(X=reduced_pca_Xte)
report = classification_report(
    y_true=category_te,
    y_pred=pred,
    target_names=df.target_names,
    digits=5,
)
print(report)

###### SVC

In [None]:
# Train a support vector classifier on the PCA-reduced features and report the fit time.
svc = SVC()
start_time = time()
svc.fit(X=reduced_pca_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken to fit SVC classifier : ", elapsed)

In [None]:
# Score the PCA-trained SVC on the held-out split.
pred = svc.predict(X=reduced_pca_Xte)
report = classification_report(
    y_true=category_te,
    y_pred=pred,
    target_names=df.target_names,
)
print(report)

###### KMeans

In [None]:
# Cluster the PCA-reduced features into 20 groups and evaluate against the true labels.
kmeans = KMeans(n_clusters=20, init='k-means++', max_iter=250)
start_time = time()
# The labels are still passed to fit; scikit-learn documents `y` as ignored by KMeans.
kmeans.fit(X=reduced_pca_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken to fit KMeans classifier : ", elapsed)

pred = kmeans.predict(reduced_pca_Xte)
purity = purity_score(category_te, pred)
print("Purity Score: ", purity)
homogeneity = metrics.homogeneity_score(category_te, pred)
print("Homoegneity Score: ",homogeneity)
completeness = metrics.completeness_score(category_te, pred)
print("Completeness Score: ",completeness)

### LDA (Linear Discriminant Analysis)

In [16]:
# Supervised reduction to 15 linear discriminants, fitted on the training labels.
lda = LDA(n_components=15)
# Densify the training matrix once and reuse it for fit and transform
# (it was previously converted twice).
dense_Xtr = scaled_data_Xtr.toarray()
lda.fit(dense_Xtr, category_tr)
reduced_lda_Xtr = lda.transform(dense_Xtr)
del dense_Xtr  # release the large dense copy as soon as it is no longer needed
reduced_lda_Xte = lda.transform(scaled_data_Xte.toarray())

###### Random Forest

In [None]:
# Random forest on the LDA-reduced features: time the fit, then report test metrics.
rf = RandomForestClassifier()
start_time = time()
rf.fit(X=reduced_lda_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken for fitting using Random Forest",elapsed)

pred = rf.predict(X=reduced_lda_Xte)
report = classification_report(
    y_true=category_te,
    y_pred=pred,
    target_names=df.target_names,
)
print(report)

###### SVC

In [None]:
# SVC on the LDA-reduced features: time the fit, then report test metrics.
svc = SVC()
start_time = time()
svc.fit(X=reduced_lda_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken for fitting using SVC",elapsed)

pred = svc.predict(X=reduced_lda_Xte)
report = classification_report(
    y_true=category_te,
    y_pred=pred,
    target_names=df.target_names,
)
print(report)

###### KMeans

In [None]:
# Cluster the LDA-reduced features into 20 groups and evaluate against the true labels.
kmeans = KMeans(n_clusters=20, init='k-means++', max_iter=250)
start_time = time()
# The labels are still passed to fit; scikit-learn documents `y` as ignored by KMeans.
kmeans.fit(X=reduced_lda_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken for fitting using KMeans",elapsed)

pred = kmeans.predict(reduced_lda_Xte)
purity = purity_score(category_te, pred)
print("Purity Score: ", purity)
homogeneity = metrics.homogeneity_score(category_te, pred)
print("Homoegneity Score: ",homogeneity)
completeness = metrics.completeness_score(category_te, pred)
print("Completeness Score: ",completeness)

### Kernel PCA

In [None]:
# Non-linear reduction to two components with a sigmoid-kernel PCA.
start_time = time()
kpca = KernelPCA(kernel="sigmoid", n_components=2, gamma=.01) # Sigmoid Kernel
# Both splits are passed in the same (sparse) form; previously only the test
# split was densified.
reduced_kpca_Xtr = kpca.fit_transform(scaled_data_Xtr)
reduced_kpca_Xte = kpca.transform(scaled_data_Xte)
# start_time was recorded but never reported; print it as the PCA cell does.
print("Time taken to perform dimensionality reduction using Kernel PCA", time() - start_time)
print("Number of features after reduction : ", reduced_kpca_Xtr.shape[1])

Random Forest

In [None]:
# Random forest on the Kernel-PCA features: time the fit, then report test metrics.
rf = RandomForestClassifier()
start_time = time()
rf.fit(X=reduced_kpca_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken to fit Random Forest classifier : ", elapsed)

pred = rf.predict(X=reduced_kpca_Xte)
report = classification_report(
    y_true=category_te,
    y_pred=pred,
    target_names=df.target_names,
    digits=5,
)
print(report)

SVC

In [None]:
# SVC on the Kernel-PCA features: time the fit, then report test metrics.
svc = SVC()
start_time = time()
svc.fit(X=reduced_kpca_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken for fitting using SVC",elapsed)

pred = svc.predict(X=reduced_kpca_Xte)
report = classification_report(
    y_true=category_te,
    y_pred=pred,
    target_names=df.target_names,
)
print(report)

KMeans

In [None]:
# Cluster the Kernel-PCA features into 20 groups and evaluate against the true labels.
kmeans = KMeans(n_clusters=20, init='k-means++', max_iter=250)
start_time = time()
# The labels are still passed to fit; scikit-learn documents `y` as ignored by KMeans.
kmeans.fit(X=reduced_kpca_Xtr, y=category_tr)
elapsed = time() - start_time
print("Time taken for fitting using KMeans",elapsed)

pred = kmeans.predict(reduced_kpca_Xte)
purity = purity_score(category_te, pred)
print("Purity Score: ", purity)
homogeneity = metrics.homogeneity_score(category_te, pred)
print("Homoegneity Score: ",homogeneity)
completeness = metrics.completeness_score(category_te, pred)
print("Completeness Score: ",completeness)

### UMAP (Uniform Manifold Approximation & Projection)

In [None]:
!pip install umap-learn

In [None]:
# Imported here rather than with the other imports because umap-learn is
# installed by the preceding cell.
import umap
# 2-component UMAP embedding using the Hellinger metric.
umap_embedding = umap.UMAP(metric='hellinger', n_components=2)
# Learn the embedding on the training split, then project the test split into it.
reduced_tr = umap_embedding.fit_transform(scaled_data_Xtr)
reduced_te = umap_embedding.transform(scaled_data_Xte)

In [None]:
# Keep only the training points UMAP could connect to its graph.
# The mask is computed once and reused for the embedding and its labels.
connected_mask = ~np.asarray(umap.utils.disconnected_vertices(umap_embedding), dtype=bool)
refined_tr = reduced_tr[connected_mask]
refined_cat = category_tr[connected_mask]

# `reduced_te` was already computed together with `reduced_tr`; repeating the
# expensive transform here is unnecessary. Drop test points whose first
# embedding coordinate is NaN, together with their labels.
valid_mask = ~np.isnan(reduced_te[:, 0])
refined_te = reduced_te[valid_mask]
refined_cat_te = np.asarray(category_te)[valid_mask]

In [None]:
# Inspect the scaled feature matrices. Only the last bare expression of a cell
# is displayed, so this shows scaled_data_Xte.
scaled_data_Xtr
scaled_data_Xte

Random Forest

In [None]:
# Random forest on the filtered UMAP embedding.
rf = RandomForestClassifier()
start_time = time()
rf.fit(refined_tr, refined_cat)
# Report the fit time before predicting, so the figure covers fitting only,
# like the other classifier cells.
print("Time taken to fit Random Forest classifier : ", time()-start_time)
pred = rf.predict(refined_te)
print(classification_report(refined_cat_te, pred, target_names=df.target_names, digits=5))

SVC

In [None]:
# SVC on the filtered UMAP embedding.
svc = SVC()
start_time = time()
svc.fit(refined_tr, refined_cat)
print("Time taken for fit SVC classifier",time() - start_time)
pred = svc.predict(refined_te)
# Predictions are made on the filtered test points, so they must be compared with
# the matching filtered labels (refined_cat_te), not the full category_te.
print(classification_report(refined_cat_te, pred, target_names=df.target_names))

KMeans

In [None]:
# Cluster the filtered UMAP embedding into 20 groups.
kmeans = KMeans(n_clusters=20, init='k-means++', max_iter=250)
start_time = time()
kmeans.fit(refined_tr, refined_cat)
print("Time taken for fitting using KMeans",time() - start_time)
pred = kmeans.predict(refined_te)
# Predictions are made on the filtered test points, so they must be scored against
# the matching filtered labels (refined_cat_te), not the full category_te.
print("Purity Score: ", purity_score(refined_cat_te, pred))
print("Homoegneity Score: ",metrics.homogeneity_score(refined_cat_te, pred))
print("Completeness Score: ",metrics.completeness_score(refined_cat_te, pred))

### Reduced dataset visualization (n_components = 2)

In [None]:
# Scatter plot of the training data in PCA space, coloured by class name.
# The PCA-reduced features are plotted here; this cell previously plotted
# `reduced_tr`, which is the UMAP embedding.
pca_df = pd.DataFrame()
pca_df["y"] = [df.target_names[lab] for lab in category_tr]
pca_df["comp-1"] = reduced_pca_Xtr[:,0]
pca_df["comp-2"] = reduced_pca_Xtr[:,1]
sns.set(rc={'figure.figsize':(20,10)})
sns.scatterplot(x="comp-1", y="comp-2", hue=pca_df.y.tolist(), data=pca_df).set(title="PCA projection with features = 2")

In [None]:
# Scatter plot of the training data in LDA space, coloured by class name.
# The LDA-reduced features are plotted here; this cell previously plotted
# `reduced_tr`, which is the UMAP embedding. Only the first two of the
# discriminant components are shown.
lda_df = pd.DataFrame()
lda_df["y"] = [df.target_names[lab] for lab in category_tr]
lda_df["comp-1"] = reduced_lda_Xtr[:,0]
lda_df["comp-2"] = reduced_lda_Xtr[:,1]
sns.set(rc={'figure.figsize':(20,10)})
sns.scatterplot(x="comp-1", y="comp-2", hue=lda_df.y.tolist(), data=lda_df).set(title="LDA projection with features = 2")

In [None]:
# Scatter plot of the training data in Kernel-PCA space, coloured by class name.
# The Kernel-PCA-reduced features are plotted here; this cell previously plotted
# `reduced_tr`, which is the UMAP embedding.
kpca_df = pd.DataFrame()
kpca_df["y"] = [df.target_names[lab] for lab in category_tr]
kpca_df["comp-1"] = reduced_kpca_Xtr[:,0]
kpca_df["comp-2"] = reduced_kpca_Xtr[:,1]
sns.set(rc={'figure.figsize':(20,10)})
sns.scatterplot(x="comp-1", y="comp-2", hue=kpca_df.y.tolist(), data=kpca_df).set(title="Kernel PCA projection with features = 2")

In [None]:
# Scatter plot of the training data in UMAP space, coloured by class name.
umapdf = pd.DataFrame({
    "y": [df.target_names[lab] for lab in category_tr],
    "comp-1": reduced_tr[:, 0],
    "comp-2": reduced_tr[:, 1],
})
sns.set(rc={'figure.figsize': (20, 10)})
umap_ax = sns.scatterplot(data=umapdf, x="comp-1", y="comp-2", hue=umapdf.y.tolist())
umap_ax.set(title="UMAP projection with features = 2")