### The purpose of this exercise is to use LSA in order to run unsupervised topic extraction on texts and compare the results to the target variable!

In [6]:
import pandas as pd
import numpy as np
import en_core_web_sm
import spacy

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans




In [3]:
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups()
print(news.DESCR)


.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

### Storing the object news.data in a DataFrame and call the column text. Then extracting a sample of 5000 rows to begin with after that I added the target variable to this dataframe in order to run analysis later

In [7]:
data=pd.DataFrame(news.data,columns=['text'])
data=data.sample(5000)
data['target']=news.target[data.index]
data.head()

Unnamed: 0,text,target
3276,From: byab314@chpc.utexas.edu (Srinivas Bettad...,14
6162,From: julie@eddie.jpl.nasa.gov (Julie Kangas)\...,13
5020,From: PA146008@utkvm1.utk.edu (David Veal)\nSu...,16
7194,From: kbanaian@bernard.pitzer.claremont.edu (K...,18
8983,From: smb@research.att.com (Steven Bellovin)\n...,11


In [8]:
'''
This line of code applies a lambda function to each 
value in the 'text' column. The lambda function splits the text at the string "Subject:" 
and retains the portion after "Subject:". This effectively removes any content that appears before "Subject:" in each text.
'''
data['text_clean'] = data['text'].apply(lambda x: x.split("Subject:")[1])
'''
data['text_clean'] = data['text_clean'].apply(lambda x: ''.join(ch for ch in x if ch.isalnum() or ch==" "))
This line of code applies another lambda function to each value in the 'text_clean' column. The lambda function 
iterates through each character in the text and keeps only alphanumeric characters 
(letters and numbers) and spaces. It removes any other characters, effectively cleaning the text.
'''
data['text_clean'] = data['text_clean'].apply(lambda x: ''.join(ch for ch in x if ch.isalnum() or ch==" "))
'''
This line of code applies a third lambda function to the 
'text_clean' column. It converts all the text to lowercase using the .lower() method and fills any missing values 
(NaNs) with an empty string ('') to avoid potential errors during the lowercase conversion.
'''
data['text_clean'] = data['text_clean'].fillna('').apply(lambda x: x.lower())
data.head()

Unnamed: 0,text,target,text_clean
3276,From: byab314@chpc.utexas.edu (Srinivas Bettad...,14,re vandalizing the skyorganization center for...
6162,From: julie@eddie.jpl.nasa.gov (Julie Kangas)\...,13,re is msg sensitivity superstitionnntpposting...
5020,From: PA146008@utkvm1.utk.edu (David Veal)\nSu...,16,re ban all firearms lines 89organization univ...
7194,From: kbanaian@bernard.pitzer.claremont.edu (K...,18,re national sales tax the movielines 43organi...
8983,From: smb@research.att.com (Steven Bellovin)\n...,11,clipper chip technical detailsorganization a...


In [9]:
nlp = en_core_web_sm.load()

### Tokenizing the cleaned sentences and removing english stopwords

In [10]:
from spacy.lang.en.stop_words import STOP_WORDS

data["text_tokenized"] = data["text_clean"].apply(lambda x: [token.lemma_ for token in nlp(x) if token.text not in STOP_WORDS])
data.head()

Unnamed: 0,text,target,text_clean,text_tokenized
3276,From: byab314@chpc.utexas.edu (Srinivas Bettad...,14,re vandalizing the skyorganization center for...,"[ , vandalize, skyorganization, center, space,..."
6162,From: julie@eddie.jpl.nasa.gov (Julie Kangas)\...,13,re is msg sensitivity superstitionnntpposting...,"[ , msg, sensitivity, superstitionnntppostingh..."
5020,From: PA146008@utkvm1.utk.edu (David Veal)\nSu...,16,re ban all firearms lines 89organization univ...,"[ , ban, firearm, line, 89organization, univer..."
7194,From: kbanaian@bernard.pitzer.claremont.edu (K...,18,re national sales tax the movielines 43organi...,"[ , national, sale, tax, movieline, 43organiza..."
8983,From: smb@research.att.com (Steven Bellovin)\n...,11,clipper chip technical detailsorganization a...,"[ , clipper, chip, , technical, detailsorgani..."


### I detokenized the text because  detokenization reconstructs the original text, preserving the contextual information between words. TF-IDF relies on the relationships between words within a document, and detokenization helps maintain these relationships.

In [11]:
detokenized_doc = []
for sentence in data["text_tokenized"]:
    t = ' '.join(sentence)
    detokenized_doc.append(t)

data['nlp_ready'] = detokenized_doc
data.head()

Unnamed: 0,text,target,text_clean,text_tokenized,nlp_ready
3276,From: byab314@chpc.utexas.edu (Srinivas Bettad...,14,re vandalizing the skyorganization center for...,"[ , vandalize, skyorganization, center, space,...",vandalize skyorganization center space resea...
6162,From: julie@eddie.jpl.nasa.gov (Julie Kangas)\...,13,re is msg sensitivity superstitionnntpposting...,"[ , msg, sensitivity, superstitionnntppostingh...",msg sensitivity superstitionnntppostinghost ...
5020,From: PA146008@utkvm1.utk.edu (David Veal)\nSu...,16,re ban all firearms lines 89organization univ...,"[ , ban, firearm, line, 89organization, univer...",ban firearm line 89organization university t...
7194,From: kbanaian@bernard.pitzer.claremont.edu (K...,18,re national sales tax the movielines 43organi...,"[ , national, sale, tax, movieline, 43organiza...",national sale tax movieline 43organization p...
8983,From: smb@research.att.com (Steven Bellovin)\n...,11,clipper chip technical detailsorganization a...,"[ , clipper, chip, , technical, detailsorgani...",clipper chip technical detailsorganization...


In [12]:
# TF-IDF vector
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
X = vectorizer.fit_transform(data['nlp_ready'])
X

<5000x120528 sparse matrix of type '<class 'numpy.float64'>'
	with 465259 stored elements in Compressed Sparse Row format>

### Using the truncatedSVD model in order to create a topic model with 20 different topics

In [15]:
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100)
lsa = svd_model.fit_transform(X)

topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_{}".format(i+1) for i in range(20)], index = data.index)
topic_encoded_df["text"] = data['nlp_ready'].values
topic_encoded_df

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,text
3276,0.074965,-0.022421,-0.004137,-0.008018,0.001517,-0.011081,-0.034196,-0.059725,0.055858,-0.008884,...,-0.017380,-0.004989,-0.013664,0.073894,0.028447,0.024238,0.028998,-0.028950,-0.010301,vandalize skyorganization center space resea...
6162,0.092245,0.023024,-0.010487,-0.006485,0.005923,0.006427,-0.076698,-0.077743,-0.021321,-0.160918,...,0.300075,-0.228565,-0.170249,-0.031156,-0.155546,0.020133,0.012763,-0.057353,0.017282,msg sensitivity superstitionnntppostinghost ...
5020,0.150243,0.012916,-0.060755,0.020706,0.005235,0.033383,-0.078434,0.023562,-0.037431,-0.026994,...,-0.058019,0.004025,0.009290,0.022828,-0.036074,-0.023726,-0.032649,0.029115,0.004773,ban firearm line 89organization university t...
7194,0.105857,0.001634,-0.028366,0.006724,0.002557,0.026506,-0.046810,0.000637,0.011300,-0.009283,...,-0.029962,0.000841,0.024208,0.000455,-0.008509,-0.001378,0.057719,0.064402,0.002184,national sale tax movieline 43organization p...
8983,0.062543,-0.030219,-0.017358,0.091237,-0.067130,0.015193,0.061859,-0.036516,0.011107,0.026656,...,0.004396,0.003004,0.001508,0.019219,0.001643,-0.014366,-0.002965,-0.009798,0.022638,clipper chip technical detailsorganization...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3151,0.116433,-0.058788,0.008606,0.046273,-0.019753,0.005657,0.032538,-0.011300,-0.029359,0.005873,...,0.049022,-0.008878,0.035777,-0.000636,-0.000401,-0.018710,-0.036709,-0.084170,0.016645,desktop rebuild datadesk keyboardorganizatio...
9659,0.173040,0.039295,-0.036561,0.020575,0.011127,-0.002130,-0.038132,0.001673,0.007711,-0.018560,...,0.002722,-0.012818,0.034048,-0.026104,0.051384,-0.020551,0.076115,0.016873,-0.034375,clinton press briefing george stephanopoulos...
1177,0.058832,0.010135,0.009480,0.009608,-0.000166,-0.004280,-0.009957,0.003347,0.000532,-0.013922,...,-0.010211,-0.001274,-0.008591,0.001183,0.014641,-0.002215,-0.005448,0.021428,0.015635,political atheistsorganization california in...
5410,0.082544,0.017567,-0.020974,0.017759,-0.007697,0.018213,-0.055889,0.015982,-0.003849,0.021684,...,-0.007576,0.028126,-0.001709,-0.005173,-0.008753,-0.004893,0.026299,0.033752,0.082808,rgv posingnntppostinghost 13320625121replyto...


In [16]:
## Using np.argmax to have the index of the maximal element
topic_encoded_df["class_pred"] = [np.argmax(topic) for topic in lsa]
topic_encoded_df["class_pred"].value_counts()

0     3810
9      138
3      134
5      112
1      103
8      100
19      75
16      74
12      72
4       62
14      57
6       48
7       39
15      32
13      31
10      30
18      28
2       21
11      17
17      17
Name: class_pred, dtype: int64

### Add the target variable to the topic model dataframe and print the confusion matrix for the topic against the target variable :

In [18]:
topic_encoded_df["target"] = news.target[data.index]
topic_encoded_df.head()
topic_encoded_df["target"].value_counts()

13    284
14    279
15    278
6     265
2     265
4     265
3     262
11    260
17    259
10    259
1     256
9     254
5     251
8     250
7     246
16    240
12    236
18    223
0     204
19    164
Name: target, dtype: int64

### We conclude that LSA is based on the hypothesis that a given document can be related to several topics. This makes the interpretation of the model's output more complicated, but allows to create topic models that are more realistic (because in real life, a document is often related to different topics !)
### Note that LSA is very convenient to find some structure among a text corpus, but it usually creates topics that are quite different from the categories that would have been determined by a human,  this is why the topics found by LSA are very different from the target !