### Preprocessing

In [1]:
from sklearn.datasets import fetch_20newsgroups

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import spacy
from pprint import pprint
import pandas as pd
import numpy as np

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use'])



In [2]:
newsgroups_train = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [3]:
print(newsgroups_train.data[379])

The subject line says it all. I'm working on a project
that will use a car battery. I need to pull off 3V and possibly
48V at 3A.

I have several ideas, but I'd prefer to benefit from all you
brilliant people :-)


In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [5]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [6]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

In [7]:
data = newsgroups_train.data
data_words = list(sent_to_words(data))

# Remove Stop Words
print("Start removing stop words")
data_words_nostops = remove_stopwords(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# conda install -c conda-forge spacy-model-en_core_web_sm
print("Installing spacy")
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
print("Start lemmatizing words")
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

Start removing stop words
Installing spacy
Start lemmatizing words


In [8]:
print(data_lemmatized[1028])

['poppy', 'cock', 'story', 'obviously', 'complete', 'fabrication', 'show', 'establishe', 'story', 'take', 'place', 'iowa', 'iowa', 'come', 'dan', 'dumb', 'think', 'could', 'least', 'throw', 'llama', 'tennis', 'ball', 'reference', 'hell', 'even', 'get', 'speed', 'right', 'dean', 'ps']


In [9]:
data_lemmatized_min_length = []

for sublist in data_lemmatized:
    # Use a list comprehension to filter out strings with less than two characters
    sublist = [word for word in sublist if len(word) > 3]
    data_lemmatized_min_length.append(sublist)

In [10]:
Y = newsgroups_train.target.tolist()

In [11]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized_min_length)

# Create Corpus
texts = data_lemmatized_min_length

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]]


In [12]:
# Number of documents
print(len(corpus))

# Size of the vocabulary
print(len(id2word))

18846
72370


In [27]:
import statistics

def sum_of_second_components(tuple_list):
    total_sum = 0
    for tup in tuple_list:
        total_sum += tup[1]  # Accessing the second component of each tuple
    return total_sum

lengths = []
for doc in corpus:
    lengths.append(sum_of_second_components(doc))
print(statistics.median(lengths))

35.0


### Topic Models

#### Vector Space Model (VSM)

In [None]:
from scipy.sparse import dok_matrix

# Define function to convert Gensim corpus to a sparse pandas DataFrame
def corpus_to_sparse_dataframe(corpus):
    word_freq = dok_matrix((len(corpus), len(id2word)), dtype=int)

    for i, doc in enumerate(corpus):
        for word_id, freq in doc:
            word_freq[i, word_id] = freq

    dataframe = pd.DataFrame.sparse.from_spmatrix(word_freq)
    dataframe.columns = [id2word[word_id] for word_id in range(len(id2word))]
    return dataframe

In [None]:
VSM = corpus_to_sparse_dataframe(corpus)

#### Vector Space Model and tf-idf (VSM & tf-idf)

In [None]:
from gensim.models import TfidfModel

model = TfidfModel(corpus)  # fit model
tfidf_corpus = model[corpus]

#### Latent Semantic Indexing (LSI)

In [None]:
from gensim.models import LsiModel

K = 20
lsi_model = LsiModel(corpus, id2word=id2word, num_topics=K)
lsi_model.print_topics(num_topics=K, num_words=10)

#### Latent Semantic Indexing and tf-idf (LSI & tf-idf)

In [None]:
from gensim.models import LsiModel

K = 20
tfidf_lsi_model = LsiModel(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_lsi_model.print_topics(num_topics=K, num_words=10)

#### Non-Negative Matrix Factorization (NMF)

In [None]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 20
nmf_model = Nmf(corpus, id2word=id2word, num_topics=K)
nmf_model.show_topics(num_topics=K, num_words=10)

#### Non-Negative Matrix Factorization and tf-idf (NMF & tf-idf)

In [None]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 20
tfidf_nmf_model = Nmf(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_nmf_model.show_topics(num_topics=K, num_words=10)

#### Latent Dirichlet Allocation (LDA)

In [None]:
# Build LDA model
K = 20
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=K,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=400,
                                           passes=30,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the Keywords in the 3 topics
pprint(lda_model.print_topics())

#### BERT A

#### BERT B

#### BERT C

### Guidelines of Atzberger and Cech et al.