### Preprocessing

In [1]:
# https://www.kaggle.com/datasets/dipankarsrirag/topic-modelling-on-emails

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import spacy
from pprint import pprint
import pandas as pd
import os

from tqdm import tqdm

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use'])



In [3]:
def find_txt_files(directory):
    txt_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.txt'):
                txt_files.append(os.path.join(root, file))
    return txt_files

In [4]:
def read_txt_file(file_path):
    try:
        with open(file_path, 'r', encoding="windows-1252") as file:
            file_contents = file.read()
        return file_contents
    except FileNotFoundError:
        return "File not found."

In [5]:
def sent_to_words(sentences):
    for sentence in tqdm(sentences):
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [7]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in tqdm(texts):
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

In [8]:
# category 1: Crime
directory_path = "C:\\Users\\Daniel Atzberger\\Documents\\IEEE_Vis24\\data\\Emails\\Crime"
file_list = find_txt_files(directory_path)

file_contents_Crime = []
labels_Crime = []
for file in file_list:
    file_contents_Crime.append(read_txt_file(file))
    labels_Crime.append("Crime")

In [9]:
# category 2: Entertainment
directory_path = "C:\\Users\\Daniel Atzberger\\Documents\\IEEE_Vis24\\data\\Emails\\Entertainment"
file_list = find_txt_files(directory_path)

file_contents_Entertainment = []
labels_Entertainment = []
for file in file_list:
    file_contents_Entertainment.append(read_txt_file(file))
    labels_Entertainment.append("Entertainment")

In [10]:
# category 3: Politics
directory_path = "C:\\Users\\Daniel Atzberger\\Documents\\IEEE_Vis24\\data\\Emails\\Politics"
file_list = find_txt_files(directory_path)

file_contents_Politics = []
labels_Politics = []
for file in file_list:
    file_contents_Politics.append(read_txt_file(file))
    labels_Politics.append("Politics")

In [11]:
# category 4: Science
directory_path = "C:\\Users\\Daniel Atzberger\\Documents\\IEEE_Vis24\\data\\Emails\\Science"
file_list = find_txt_files(directory_path)

file_contents_Science = []
labels_Science = []
for file in file_list:
    file_contents_Science.append(read_txt_file(file))
    labels_Science.append("Science")

In [12]:
data = file_contents_Crime + file_contents_Entertainment + file_contents_Politics + file_contents_Science
labels = labels_Crime + labels_Entertainment + labels_Politics + labels_Science
data_words = list(sent_to_words(data))

# Remove Stop Words
print("Start removing stop words")
data_words_nostops = remove_stopwords(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# conda install -c conda-forge spacy-model-en_core_web_sm
print("Installing spacy")
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
print("Start lemmatizing words")
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

100%|████████████████████████████████████████████████████████████████████████████| 9156/9156 [00:08<00:00, 1071.12it/s]


Start removing stop words
Installing spacy


  0%|                                                                                 | 1/9156 [00:00<19:46,  7.72it/s]

Start lemmatizing words


100%|██████████████████████████████████████████████████████████████████████████████| 9156/9156 [01:52<00:00, 81.71it/s]


In [13]:
#print(data[0])
print(data_lemmatized[9000])

['distribution', 'na', 'message', 'i', 'd', 'rs', 'au', 'access', 'digex', 'net', 'reference', 'apr', 'kelvin', 'jpl', 'nasa', 'gov', 'apr', 'stsci', 'rrhlo', 'ajb', 'access', 'digex', 'net', 'apr', 'stsci', 'nntp', 'post', 'host', 'access', 'digex', 'net', 'article', 'apr', 'stsci', 'hathaway', 'stsci', 'write', 'space', 'walk', 'go', 'boost', 'hst', 'orbit', 'think', 'right', 'sit', 'mile', 'would', 'like', 'know', 'exact', 'orbit', 'number', 'ben', 'say', 'boost', 'idea', 'news', 'we', 'know', 'something', 'please', 'supply', 'source', 'would', 'nice', 'scheduler', 'observation', 'know', 'thing', 'go', 'altitude', 'number', 'also', 'way', 'good', 'source', 'minimum', 'st', 'altitude', 'pmdb', 'kilometer', 'maximum', 'st', 'altitude', 'pmdb', 'kilometer', 'delta', 'st', 'altitude', 'pmdb', 'kilometer', 'pmdb', 'proposal', 'management', 'datum', 'base', 'use', 'schedule', 'observation', 'sure', 'number', 'far', 'well', 'mine', 'say', 'exact', 'number', 'order', 'perform', 'boost', 'hs

In [14]:
data_lemmatized_min_length = []

for sublist in tqdm(data_lemmatized):
    # Use a list comprehension to filter out strings with less than two characters
    sublist = [word for word in sublist if len(word) > 2]
    data_lemmatized_min_length.append(sublist)

100%|███████████████████████████████████████████████████████████████████████████| 9156/9156 [00:00<00:00, 41936.72it/s]


In [15]:
#print(data[0])
#print(data_lemmatized[0])
print(data_lemmatized_min_length[0])

['archive', 'name', 'ripem', 'faq', 'last', 'update', 'sun', 'mar', 'post', 'still', 'rather', 'rough', 'list', 'likely', 'question', 'information', 'ripem', 'program', 'public', 'key', 'mail', 'encryption', 'faq', 'ripem', 'write', 'maintain', 'marc', 'vanheyningen', 'mvanheyn', 'whale', 'indiana', 'post', 'variety', 'newsgroup', 'monthly', 'basis', 'follow', 'discussion', 'specific', 'ripem', 'redirect', 'group', 'alt', 'security', 'ripem', 'month', 'reformatte', 'post', 'attempt', 'comply', 'standard', 'hypertext', 'faq', 'formatting', 'allow', 'easy', 'manipulation', 'document', 'world', 'wide', 'web', 'let', 'know', 'think', 'disclaimer', 'nothing', 'faq', 'consider', 'legal', 'advice', 'anything', 'one', 'person', 'opinion', 'want', 'real', 'legal', 'advice', 'talk', 'real', 'lawyer', 'question', 'answer', 'ripem', 'ripem', 'program', 'perform', 'privacy', 'enhance', 'mail', 'pem', 'use', 'cryptographic', 'technique', 'rsa', 'des', 'allow', 'electronic', 'mail', 'property', 'auth

In [16]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized_min_length)

# Create Corpus
texts = data_lemmatized_min_length

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View 
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 2), (9, 2), (10, 1), (11, 2), (12, 7), (13, 2), (14, 2), (15, 2), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 2), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 3), (34, 2), (35, 3), (36, 4), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 2), (47, 2), (48, 1), (49, 1), (50, 2), (51, 2), (52, 5), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 3), (61, 3), (62, 3), (63, 1), (64, 1), (65, 1), (66, 3), (67, 2), (68, 3), (69, 1), (70, 1), (71, 4), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 3), (80, 1), (81, 1), (82, 4), (83, 3), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 3), (96, 2), (97, 4), (98, 5), (99, 4), (100, 1), (101, 1), (102, 1), (103, 1), (104, 12), (105, 5), (106, 1), (107, 2), (108, 2), (109, 1), (110, 1

In [17]:
# Number of documents
print(len(corpus))

# Size of the vocabulary
print(len(id2word))

9156
48318


In [18]:
import statistics

def sum_of_second_components(tuple_list):
    total_sum = 0
    for tup in tuple_list:
        total_sum += tup[1]  # Accessing the second component of each tuple
    return total_sum

lengths = []
for doc in corpus:
    lengths.append(sum_of_second_components(doc))
print(statistics.median(lengths))

101.0


### Topic Models

#### Vector Space Model (VSM)

In [None]:
from scipy.sparse import dok_matrix

# Define function to convert Gensim corpus to a sparse pandas DataFrame
def corpus_to_sparse_dataframe(corpus):
    word_freq = dok_matrix((len(corpus), len(id2word)), dtype=int)

    for i, doc in enumerate(corpus):
        for word_id, freq in doc:
            word_freq[i, word_id] = freq

    dataframe = pd.DataFrame.sparse.from_spmatrix(word_freq)
    dataframe.columns = [id2word[word_id] for word_id in range(len(id2word))]
    return dataframe

In [None]:
VSM = corpus_to_sparse_dataframe(corpus)

#### Vector Space Model and tf-idf (VSM & tf-idf)

In [None]:
from gensim.models import TfidfModel

model = TfidfModel(corpus)  # fit model
tfidf_corpus = model[corpus]

#### Latent Semantic Indexing (LSI)

In [None]:
from gensim.models import LsiModel

K = 8
lsi_model = LsiModel(corpus, id2word=id2word, num_topics=K)
lsi_model.print_topics(num_topics=K, num_words=10)

#### Latent Semantic Indexing and tf-idf (LSI & tf-idf)

In [None]:
from gensim.models import LsiModel

K = 8
tfidf_lsi_model = LsiModel(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_lsi_model.print_topics(num_topics=K, num_words=10)

#### Non-Negative Matrix Factorization (NMF)

In [None]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 8
nmf_model = Nmf(corpus, id2word=id2word, num_topics=K)
nmf_model.show_topics(num_topics=K, num_words=10)

#### Non-Negative Matrix Factorization and tf-idf (NMF & tf-idf)

In [None]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 8
tfidf_nmf_model = Nmf(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_nmf_model.show_topics(num_topics=K, num_words=10)

#### Latent Dirichlet Allocation (LDA)

In [None]:
# Build LDA model
K = 8
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=K,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=400,
                                           passes=30,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the Keywords in the 3 topics
pprint(lda_model.print_topics())