<a href="https://colab.research.google.com/github/hackveda-canada/Data-Science-Essentials/blob/master/Data_Science_Essentials_Day_17%2618_Text_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Inbuilt function to calculate cosine similarity using Sklearn library
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
dict1 = {'doc1.txt':[10], 'doc2.txt':[3]}
df = pd.DataFrame(dict1)
df

cosine_similarity(df[['doc1.txt']], df[['doc2.txt']])

array([[1.]])

In [0]:
# Text Clustering
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset="train")
newsgroups_train

In [0]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [0]:
categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'rec.motorcycles']
dataset = fetch_20newsgroups(subset="all", categories=categories, shuffle=True, random_state=5)
dataset

In [0]:
# Calculate size of data and target
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))

3753 documents
4 categories


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")

print("Non Vectorized Data \n")
dataset.data


In [0]:
X = vectorizer.fit_transform(dataset.data)
X.shape

(3753, 83000)

In [0]:
print("%d documents, %d features" % X.shape)

3753 documents, 83000 features


In [0]:
pd.DataFrame(X.toarray().transpose(), index=vectorizer.get_feature_names())

In [0]:
# SVD of a matrix ( m rows * n columns ) Term Document Matrix
# Product of 
# r = rank of the matrix
# a.) Column orthonormal = m * r 
# b.) Diagonal = r * r
# c.) Column orthonormal = r * n

In [0]:
# (text clustering) Latent semantic analysis using SVD
from sklearn.decomposition import TruncatedSVD

# Shape of original matrix
X.shape

# print the documents and dimensions
print("%d documents, %d dimensions" % X.shape)

# To perform Latent semantic analysis through SVD we have to decompose original matrix
svd = TruncatedSVD(2000)

# Create LSA using SVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

lsa = make_pipeline(svd, Normalizer(copy=False))

# Transform X matrix into lsa (svd) matrix 
X = lsa.fit_transform(X)

3753 documents, 83000 dimensions


In [0]:
# Check Shape of the matrix
print("%d documents, %d dimensions" % X.shape)

# Check the explained variance after decomposition
evr = svd.explained_variance_ratio_.sum()
print("Dimensions reduced to ", evr * 100, " %")

3753 documents, 2000 dimensions
Dimensions reduced to  87.90560728992392  %


In [0]:
# Text Clustering using Kmeans and SVD Data
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
import numpy as np

kmeans = KMeans(n_clusters=4, max_iter=100, n_init=1)

In [0]:
# Run Kmeans in batch mode. Useful for large datasets
kmeans = MiniBatchKMeans(n_clusters=4, n_init=1, batch_size=1000)
kmeans.fit(X)

MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=4,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)