In [12]:
import sys
import re
import bz2

regex = re.compile("[^a-zA-Z]")

import numpy as np
import pandas as pd

from pprint import pprint

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import RegexpTokenizer

# 20 Newsgroups Dataset 

You should download and look at the 20-news-same-line.txt.bz2
http://qwone.com/~jason/20Newsgroups/

In [13]:
# Download the file from here 
# https://github.com/kiat/Elements-of-Data-Analytics/blob/main/datasets/Court-Wiki-Dataset.txt.bz2
file = bz2.open("./datasets/20-news-same-line.txt.bz2", "r")

mlist = []

count = 0 
for line in file:
    count +=1
    line = (lambda x : (x[x.index('id="') + 4 : x.index('" url=')], x[x.index('">') + 2:][:-6]))(str(line))
    text = np.array(regex.sub(" ", line[1]).lower().split())
    
    text = [x for x in text if len(x) > 3] # drop all word with less than 3 

    mlist.append({'text' : text, 'label' : (lambda x: 0 if 'AU' in x else 1 )(line[0])})


    if(count%1000==0):
        print("Line number: " , count)
    # 20 news group dataset has 19997 documents 
    # We read only 5000 documents of them.


# text_list
print("Line number: " , count)

data = pd.DataFrame.from_dict(mlist)
data 

Line number:  1000
Line number:  2000
Line number:  3000
Line number:  4000
Line number:  5000
Line number:  6000
Line number:  7000
Line number:  8000
Line number:  9000
Line number:  10000
Line number:  11000
Line number:  12000
Line number:  13000
Line number:  14000
Line number:  15000
Line number:  16000
Line number:  17000
Line number:  18000
Line number:  19000
Line number:  19997


Unnamed: 0,text,label
0,"[from, lipman, oasys, navy, robert, lipman, su...",1
1,"[from, weston, ucssun, sdsu, weston, subject, ...",1
2,"[from, coconut, ryan, porter, subject, dmorph,...",1
3,"[from, onyx, virginia, kenneth, hinckley, subj...",1
4,"[from, joth, ersys, edmonton, tham, subject, w...",1
...,...,...
19992,"[from, gmills, chemical, watstar, uwaterloo, p...",1
19993,"[from, imager, dave, knapp, subject, branch, a...",1
19994,"[from, pharvey, quack, paul, harvey, subject, ...",1
19995,"[date, tuesday, from, subject, info, about, li...",1


In [14]:
data['label'] = data['label'].apply(lambda x: (str(x).replace( "20_newsgroups/" , "")).split("/", 1)[0] )
data

Unnamed: 0,text,label
0,"[from, lipman, oasys, navy, robert, lipman, su...",1
1,"[from, weston, ucssun, sdsu, weston, subject, ...",1
2,"[from, coconut, ryan, porter, subject, dmorph,...",1
3,"[from, onyx, virginia, kenneth, hinckley, subj...",1
4,"[from, joth, ersys, edmonton, tham, subject, w...",1
...,...,...
19992,"[from, gmills, chemical, watstar, uwaterloo, p...",1
19993,"[from, imager, dave, knapp, subject, branch, a...",1
19994,"[from, pharvey, quack, paul, harvey, subject, ...",1
19995,"[date, tuesday, from, subject, info, about, li...",1


In [15]:
# Create a Dictionary with all of the words in all documents 
# Count them up 
# Get the top 600 most common words. 

from collections import Counter
my_dict=Counter()

# iterate through list-string
for item in data['text']:
    my_dict += Counter(list(item))

# Print the top 10 just for checking 
print(my_dict.most_common(10))

no_features = 600 
# We use the top no_features as our dictionary
top_words= my_dict.most_common(no_features)

[('that', 70751), ('from', 39653), ('this', 34682), ('have', 31994), ('with', 30218), ('they', 23182), ('subject', 21589), ('lines', 20891), ('date', 20771), ('what', 17693)]


In [16]:
# Sorted List of words based on their frequencies
my_dict_words = sorted(top_words, key=lambda k: top_words[1])

# This is the order of words based on their frequencies
# We use this order to create feature vectors for our text corpus. 
dict_words= list(zip(*my_dict_words))[0]

dict_words

('that',
 'from',
 'this',
 'have',
 'with',
 'they',
 'subject',
 'lines',
 'date',
 'what',
 'there',
 'would',
 'will',
 'writes',
 'about',
 'your',
 'article',
 'some',
 'which',
 'like',
 'people',
 'more',
 'when',
 'just',
 'were',
 'their',
 'know',
 'other',
 'only',
 'them',
 'than',
 'been',
 'think',
 'also',
 'does',
 'time',
 'then',
 'these',
 'should',
 'good',
 'could',
 'well',
 'because',
 'even',
 'very',
 'into',
 'those',
 'make',
 'many',
 'much',
 'first',
 'right',
 'most',
 'such',
 'world',
 'distribution',
 'here',
 'system',
 'where',
 'after',
 'want',
 'anyone',
 'said',
 'being',
 'over',
 'used',
 'same',
 'need',
 'work',
 'really',
 'something',
 'please',
 'problem',
 'believe',
 'since',
 'still',
 'back',
 'windows',
 'mail',
 'years',
 'going',
 'before',
 'find',
 'point',
 'government',
 'help',
 'take',
 'information',
 'file',
 'might',
 'year',
 'better',
 'using',
 'question',
 'never',
 'things',
 'both',
 'last',
 'read',
 'thanks',
 'whi

In [17]:
data['text']=data['text'].apply(lambda x:Counter(list(x)) )
data

Unnamed: 0,text,label
0,"{'from': 1, 'lipman': 7, 'oasys': 3, 'navy': 1...",1
1,"{'from': 1, 'weston': 4, 'ucssun': 2, 'sdsu': ...",1
2,"{'from': 1, 'coconut': 1, 'ryan': 2, 'porter':...",1
3,"{'from': 2, 'onyx': 1, 'virginia': 3, 'kenneth...",1
4,"{'from': 1, 'joth': 2, 'ersys': 2, 'edmonton':...",1
...,...,...
19992,"{'from': 1, 'gmills': 1, 'chemical': 1, 'watst...",1
19993,"{'from': 3, 'imager': 2, 'dave': 2, 'knapp': 2...",1
19994,"{'from': 1, 'pharvey': 1, 'quack': 1, 'paul': ...",1
19995,"{'date': 1, 'tuesday': 1, 'from': 1, 'subject'...",1


In [18]:
topics = list(set(data['label']))
topics 

['1']

In [19]:
len(topics)

1

In [20]:
def convert_words_to_numbers(input_dict_text):
    # get a numpy of no_features zeros 
    a = np.zeros(no_features)
    for i in range(no_features):
        if (dict_words[i] in input_dict_text):
            a[i] = input_dict_text.get(dict_words[i])
    return a
    
convert_words_to_numbers(data['text'][0])        

array([0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 4., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       2., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 2., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [21]:
data['features']=data['text'].apply(lambda x:convert_words_to_numbers(x))
data = data.drop(['text'], axis=1)

data

Unnamed: 0,label,features
0,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, ..."
1,1,"[0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
2,1,"[2.0, 1.0, 0.0, 3.0, 4.0, 2.0, 1.0, 1.0, 1.0, ..."
3,1,"[1.0, 2.0, 0.0, 2.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
4,1,"[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, ..."
...,...,...
19992,1,"[4.0, 1.0, 2.0, 2.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
19993,1,"[6.0, 3.0, 2.0, 0.0, 2.0, 1.0, 1.0, 1.0, 1.0, ..."
19994,1,"[3.0, 1.0, 0.0, 0.0, 1.0, 3.0, 1.0, 1.0, 1.0, ..."
19995,1,"[6.0, 1.0, 0.0, 2.0, 3.0, 7.0, 1.0, 1.0, 1.0, ..."


In [22]:

X = data['features'].to_numpy()
y = data['label']

X = np.array(list(map(lambda x: list(x), X)))
X.shape

(19997, 600)

In [23]:
# Define the number of topics or components
num_components=20

# Create SVD object
lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

# Fit SVD model on data
lsa.fit_transform(X)

# Get Singular values and Components 
Sigma = lsa.singular_values_ 
V_transpose = lsa.components_.T

print(Sigma.shape)
print(V_transpose.shape)

(20,)
(600, 20)


In [24]:
# Print the topics with their terms
terms = dict_words

for index, component in enumerate(lsa.components_):
    zipped = zip(terms, component)
    top_terms_key = sorted(zipped, key = lambda t: t[1], reverse=True)[:10]
    top_terms_list = list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['that', 'this', 'they', 'with', 'have', 'from', 'there', 'what', 'will', 'were']
Topic 1:  ['jpeg', 'file', 'this', 'from', 'image', 'with', 'available', 'version', 'files', 'will']
Topic 2:  ['they', 'were', 'jpeg', 'file', 'from', 'there', 'them', 'image', 'didn', 'with']
Topic 3:  ['jpeg', 'that', 'image', 'color', 'images', 'format', 'file', 'president', 'free', 'than']
Topic 4:  ['file', 'output', 'program', 'build', 'line', 'open', 'name', 'check', 'that', 'your']
Topic 5:  ['that', 'windows', 'graphics', 'data', 'available', 'system', 'software', 'server', 'president', 'there']
Topic 6:  ['from', 'that', 'image', 'were', 'their', 'data', 'been', 'states', 'turkish', 'president']
Topic 7:  ['will', 'this', 'with', 'president', 'were', 'their', 'space', 'been', 'jesus', 'these']
Topic 8:  ['windows', 'with', 'president', 'have', 'file', 'think', 'drive', 'card', 'date', 'lines']
Topic 9:  ['windows', 'that', 'with', 'disk', 'jesus', 'drive', 'which', 'card', 'their', 's

# We could get the data from sklearn

In [25]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint

newsgroups_data = fetch_20newsgroups()
pprint(list(newsgroups_data.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


# TF-IDF -  term frequency–inverse document frequency

We can use TF-IDF

https://en.wikipedia.org/wiki/Tf%E2%80%93idf 

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1,1),
                             tokenizer = tokenizer.tokenize)
vectors = vectorizer.fit_transform(newsgroups_data.data)
vectors.shape



(11314, 129839)

In [27]:
# Define the number of topics or components
num_components=20

# Create SVD object
lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

# Fit SVD model on data
lsa.fit_transform(vectors)

# Get Singular values and Components 
Sigma = lsa.singular_values_ 
V_transpose = lsa.components_.T

In [28]:
# Print the topics with their terms
terms  = vectorizer.get_feature_names_out()

for index, component in enumerate(lsa.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['edu', 's', 't', 'com', '1', 'writes', 'article', 'people', 'don', 'subject']
Topic 1:  ['1', '0', '_', '2', '3', 'windows', 'x', '4', '5', '6']
Topic 2:  ['_', '___', '__', 'god', 'jesus', 'o', 'people', 'bnr', '____', 'bible']
Topic 3:  ['key', 'clipper', 'encryption', 'com', 'chip', 'government', 'escrow', 'keys', 'access', '_']
Topic 4:  ['edu', 'cs', 'pitt', 'geb', 'university', 'gordon', 'banks', 'nntp', 'host', 'article']
Topic 5:  ['0', 'team', 'game', 'year', '1', 'hockey', '2', 'players', 's', 'games']
Topic 6:  ['pitt', 'geb', 'banks', 'gordon', 'cs', 'key', 'god', '0', 'encryption', 'clipper']
Topic 7:  ['israel', 'edu', 'israeli', 'jews', 'state', 'ohio', 'cleveland', 'university', 'cwru', 'turkish']
Topic 8:  ['com', 'israel', 'geb', 'gordon', 'pitt', 'banks', 'armenian', 'turkish', 'israeli', 'armenians']
Topic 9:  ['scsi', 'drive', 'ide', 'ca', 's', 'controller', 'card', 'bus', 'hard', 'drives']
Topic 10:  ['scsi', 'drive', 'keith', 'ide', 'caltech', 'livesey