# Part 1: Load Data

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import nltk
import random
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.decomposition import LatentDirichletAllocation

random.seed(20202200)

In [2]:
# Load data into dataframe
df = pd.read_json("newsgroups.json")

In [4]:
df.head(5)

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [5]:
# Cheching if there is any missing value
df.isnull().sum()

content         0
target          0
target_names    0
dtype: int64

In [7]:
df.target_names.value_counts()

rec.sport.hockey            600
soc.religion.christian      599
rec.motorcycles             598
rec.sport.baseball          597
sci.crypt                   595
sci.med                     594
rec.autos                   594
comp.windows.x              593
sci.space                   593
sci.electronics             591
comp.os.ms-windows.misc     591
comp.sys.ibm.pc.hardware    590
misc.forsale                585
comp.graphics               584
comp.sys.mac.hardware       578
talk.politics.mideast       564
talk.politics.guns          546
alt.atheism                 480
talk.politics.misc          465
talk.religion.misc          377
Name: target_names, dtype: int64

In [8]:
# Take only the review_body column for unsupervised learning task

data = df.loc[:, 'content'].tolist()
print(type(data))
print(len(data))

<class 'list'>
11314


In [9]:
# Take a look at some of the reviews
for _ in range(5):
    print(data[_],"\n")

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----




 

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.

# Part 2: Tokenizing and Stemming


In [10]:
# Use nltk's English stopwords.
stopwords = stopwords.words('english')

print("We use " + str(len(stopwords)) + " stop-words from nltk library.")

We use 179 stop-words from nltk library.


In [11]:
def tokenization_and_stemming(text):
    '''
    INPUT
    text - string
    OUTPUT
    clean_tokens - a list of words
    This function processes the input using the following steps :
    1. Remove punctuation characters
    2. Tokenize text into list
    3. Stem, Normalize and Strip each word
    4. Remove stop words
    '''
    # Remove punctuation characters and numbers
    text = re.sub(r"[^a-zA-Z]", " ", text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Create a instance of stem class
    stemmer = SnowballStemmer("english")
    
    clean_tokens = []
    for word in tokens:
        clean_tok = stemmer.stem(word).lower().strip()
        if clean_tok not in stopwords:
            clean_tokens.append(clean_tok)

    return clean_tokens

In [14]:
tokenization_and_stemming(data[42])

['ab',
 'cleveland',
 'freenet',
 'edu',
 'sam',
 'latonia',
 'subject',
 'need',
 'phone',
 'number',
 'western',
 'digit',
 'esdi',
 'problem',
 'organ',
 'case',
 'western',
 'reserv',
 'univers',
 'cleveland',
 'ohio',
 'usa',
 'line',
 'nntp',
 'post',
 'host',
 'slc',
 'cwru',
 'edu',
 'western',
 'digit',
 'sam',
 'gosh',
 'think',
 'instal',
 'virus',
 'call',
 'ms',
 'dos',
 'copi',
 'floppi',
 'burn',
 'love',
 'window',
 'crash']

# Part 3: c-TF-IDF

In [41]:
import numpy as np
import scipy.sparse as sp

from sklearn.utils import check_array
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted


class CTFIDFVectorizer(TfidfTransformer):
    def __init__(self, *args, **kwargs):
        super(CTFIDFVectorizer, self).__init__(*args, **kwargs)
        self._idf_diag = None

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix of shape n_samples, n_features)
            A matrix of term/token counts.

        """

        # Prepare input
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

        # Calculate IDF scores
        _, n_features = X.shape
        df = np.squeeze(np.asarray(X.sum(axis=0)))
        avg_nr_samples = int(X.sum(axis=1).mean())
        idf = np.log(avg_nr_samples / df)
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=dtype)
        return self

    def transform(self, X: sp.csr_matrix, copy=True) -> sp.csr_matrix:
        """Transform a count-based matrix to c-TF-IDF

        Parameters
        ----------
        X : sparse matrix of (n_samples, n_features)
            a matrix of term/token counts

        Returns
        -------
        vectors : sparse matrix of shape (n_samples, n_features)

        """

        # Prepare input
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
        if not sp.issparse(X):
            X = sp.csr_matrix(X, dtype=np.float64)

        n_samples, n_features = X.shape

        # idf_ being a property, the automatic attributes detection
        # does not work as usual and we need to specify the attribute
        # name:
        check_is_fitted(self, attributes=["idf_"],
                        msg='idf vector is not fitted')

        # Check if expected nr features is found
        expected_n_features = self._idf_diag.shape[0]
        if n_features != expected_n_features:
            raise ValueError("Input has n_features=%d while the model"
                             " has been trained with n_features=%d" % (
                                 n_features, expected_n_features))

        X = X * self._idf_diag

        if self.norm:
            X = normalize(X, axis=1, norm='l1', copy=False)

        return X

In [61]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


# Create c-TF-IDF
cv_matric = CountVectorizer().fit_transform(data)
ctfidf_matrix = CTFIDFVectorizer().fit_transform(cv_matric, n_samples=len(df))

In [62]:
ctfidf_matrix

<11314x130107 sparse matrix of type '<class 'numpy.float64'>'
	with 1301018 stored elements in Compressed Sparse Row format>

In [63]:
print(ctfidf_matrix)

  (0, 4605)	-0.0034965034965034965
  (0, 16574)	0.006993006993006993
  (0, 18299)	0.006993006993006993
  (0, 27436)	-0.01048951048951049
  (0, 28615)	-0.013986013986013986
  (0, 32311)	-0.013986013986013986
  (0, 34995)	0.013986013986013986
  (0, 35612)	0.006993006993006993
  (0, 35983)	-0.01048951048951049
  (0, 37433)	-0.0034965034965034965
  (0, 37565)	-0.01048951048951049
  (0, 37780)	-0.017482517482517484
  (0, 42876)	-0.006993006993006993
  (0, 45295)	-0.0034965034965034965
  (0, 48620)	0.0034965034965034965
  (0, 50527)	-0.027972027972027972
  (0, 51793)	0.006993006993006993
  (0, 56979)	-0.04195804195804196
  (0, 57308)	0.01048951048951049
  (0, 62221)	-0.013986013986013986
  (0, 64095)	-0.006993006993006993
  (0, 65798)	-0.02097902097902098
  (0, 66608)	-0.017482517482517484
  (0, 67156)	-0.0034965034965034965
  (0, 68532)	-0.04195804195804196
  :	:
  (11313, 76032)	-0.02127659574468085
  (11313, 76377)	-0.0070921985815602835
  (11313, 80638)	-0.02127659574468085
  (11313, 823

In [64]:
np.abs(ctfidf_matrix)

<11314x130107 sparse matrix of type '<class 'numpy.float64'>'
	with 1301018 stored elements in Compressed Sparse Row format>

In [65]:
print(ctfidf_matrix)

  (0, 4605)	-0.0034965034965034965
  (0, 16574)	0.006993006993006993
  (0, 18299)	0.006993006993006993
  (0, 27436)	-0.01048951048951049
  (0, 28615)	-0.013986013986013986
  (0, 32311)	-0.013986013986013986
  (0, 34995)	0.013986013986013986
  (0, 35612)	0.006993006993006993
  (0, 35983)	-0.01048951048951049
  (0, 37433)	-0.0034965034965034965
  (0, 37565)	-0.01048951048951049
  (0, 37780)	-0.017482517482517484
  (0, 42876)	-0.006993006993006993
  (0, 45295)	-0.0034965034965034965
  (0, 48620)	0.0034965034965034965
  (0, 50527)	-0.027972027972027972
  (0, 51793)	0.006993006993006993
  (0, 56979)	-0.04195804195804196
  (0, 57308)	0.01048951048951049
  (0, 62221)	-0.013986013986013986
  (0, 64095)	-0.006993006993006993
  (0, 65798)	-0.02097902097902098
  (0, 66608)	-0.017482517482517484
  (0, 67156)	-0.0034965034965034965
  (0, 68532)	-0.04195804195804196
  :	:
  (11313, 76032)	-0.02127659574468085
  (11313, 76377)	-0.0070921985815602835
  (11313, 80638)	-0.02127659574468085
  (11313, 823

# Part 4: K-means clustering

In [48]:
kmeans_model = KMeans(n_clusters=3)

kmeans_model.fit(ctfidf_matrix) # Fit the data

KMeans(n_clusters=3)

#  Part 5: Topic Modeling - Latent Dirichlet Allocation

In [49]:
# Use LDA for clustering
LDA = LatentDirichletAllocation(n_components=3)