### Preprocessing

In [1]:
import numpy as np
import pandas as pd
import operator

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import LsiModel

import spacy
from keras.datasets import reuters 

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use'])

from collections import Counter
from tqdm import tqdm
from pprint import pprint



In [2]:
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
K = 10

def top_K_frequent_integers(arr):
    # Count occurrences of each integer in the list
    counter = Counter(arr)
    # Get the K most common integers and their frequencies
    most_common = counter.most_common(K)  
    return most_common

def extract_first_elements(tuple_list):
    return [t[0] for t in tuple_list]

def find_indices(main_list, check_list):
    indices = [index for index, element in enumerate(main_list) if element in check_list]
    return indices

In [4]:
def get_elements_with_indices(main_list, indices):
    return [main_list[index] for index in indices if index < len(main_list)]

In [5]:
word_index = reuters.get_word_index()
reverse_word_index = dict([(value,key) for (key, value) in word_index.items()])
decoded_train_data = []
for j in range(len(train_data)):
    decoded_doc = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[j]])
    decoded_train_data.append(decoded_doc)

decoded_train_data[0]

'? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

In [6]:
topics_dict = top_K_frequent_integers(list(train_labels))
topics = extract_first_elements(topics_dict)
indices_positions = find_indices(list(train_labels), topics)

In [7]:
decoded_train_data_selected = get_elements_with_indices(decoded_train_data, indices_positions)
train_labels_selected = get_elements_with_indices(list(train_labels), indices_positions)

In [8]:
decoded_train_data_selected[0]

'? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

In [9]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

In [10]:
data_words = list(sent_to_words(decoded_train_data_selected))
print(data_words[0])

['said', 'as', 'result', 'of', 'its', 'december', 'acquisition', 'of', 'space', 'co', 'it', 'expects', 'earnings', 'per', 'share', 'in', 'of', 'to', 'dlrs', 'per', 'share', 'up', 'from', 'cts', 'in', 'the', 'company', 'said', 'pretax', 'net', 'should', 'rise', 'to', 'nine', 'to', 'mln', 'dlrs', 'from', 'six', 'mln', 'dlrs', 'in', 'and', 'rental', 'operation', 'revenues', 'to', 'to', 'mln', 'dlrs', 'from', 'mln', 'dlrs', 'it', 'said', 'cash', 'flow', 'per', 'share', 'this', 'year', 'should', 'be', 'to', 'three', 'dlrs', 'reuter']


In [11]:
# Remove Stop Words
print("Start removing stop words")
data_words_nostops = remove_stopwords(data_words)
print(data_words_nostops[0])

Start removing stop words
['said', 'result', 'december', 'acquisition', 'space', 'co', 'expects', 'earnings', 'per', 'share', 'dlrs', 'per', 'share', 'cts', 'company', 'said', 'pretax', 'net', 'rise', 'nine', 'mln', 'dlrs', 'six', 'mln', 'dlrs', 'rental', 'operation', 'revenues', 'mln', 'dlrs', 'mln', 'dlrs', 'said', 'cash', 'flow', 'per', 'share', 'year', 'three', 'dlrs', 'reuter']


In [12]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# conda install -c conda-forge spacy-model-en_core_web_sm
print("Installing spacy")
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
print("Start lemmatizing words")
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[0])

Installing spacy
Start lemmatizing words
['say', 'result', 'december', 'acquisition', 'space', 'co', 'expect', 'earning', 'per', 'share', 'dlrs', 'per', 'share', 'ct', 'company', 'say', 'pretax', 'net', 'rise', 'nine', 'mln', 'dlrs', 'six', 'mln', 'dlrs', 'rental', 'operation', 'revenue', 'mln', 'dlrs', 'mln', 'dlrs', 'say', 'cash', 'flow', 'per', 'share', 'year', 'three', 'dlrs', 'reuter']


In [13]:
data_lemmatized_min_length = []

for sublist in data_lemmatized:
    # Use a list comprehension to filter out strings with less than two characters
    sublist = [word for word in sublist if len(word) > 3]
    data_lemmatized_min_length.append(sublist)
    
print(data_lemmatized_min_length[0])

['result', 'december', 'acquisition', 'space', 'expect', 'earning', 'share', 'dlrs', 'share', 'company', 'pretax', 'rise', 'nine', 'dlrs', 'dlrs', 'rental', 'operation', 'revenue', 'dlrs', 'dlrs', 'cash', 'flow', 'share', 'year', 'three', 'dlrs', 'reuter']


In [14]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized_min_length)

# Create Corpus
texts = data_lemmatized_min_length

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View 
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 6), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 3), (17, 1), (18, 1), (19, 1)]


In [15]:
# Number of documents
print(len(corpus))

# Size of the vocabulary
print(len(id2word))

7627
6191


In [16]:
import statistics

def sum_of_second_components(tuple_list):
    total_sum = 0
    for tup in tuple_list:
        total_sum += tup[1]  # Accessing the second component of each tuple
    return total_sum

lengths = []
for doc in corpus:
    lengths.append(sum_of_second_components(doc))
print(statistics.median(lengths))

37


### Topic Models

#### Vector Space Model (VSM)

In [None]:
from scipy.sparse import dok_matrix

# Define function to convert Gensim corpus to a sparse pandas DataFrame
def corpus_to_sparse_dataframe(corpus):
    word_freq = dok_matrix((len(corpus), len(id2word)), dtype=int)

    for i, doc in enumerate(corpus):
        for word_id, freq in doc:
            word_freq[i, word_id] = freq

    dataframe = pd.DataFrame.sparse.from_spmatrix(word_freq)
    dataframe.columns = [id2word[word_id] for word_id in range(len(id2word))]
    return dataframe

In [None]:
VSM = corpus_to_sparse_dataframe(corpus)

#### VSM and tf-idf (VSM & tf-idf)

In [None]:
from gensim.models import TfidfModel

model = TfidfModel(corpus)  # fit model
tfidf_corpus = model[corpus]

#### Latent Semantic Indexing

In [None]:
from gensim.models import LsiModel

K = 10
lsi_model = LsiModel(corpus, id2word=id2word, num_topics=K)
lsi_model.print_topics(num_topics=K, num_words=10)

#### Latent Semantic Indexing and tf-idf (LSI & tf-idf)

In [None]:
from gensim.models import LsiModel

K = 10
tfidf_lsi_model = LsiModel(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_lsi_model.print_topics(num_topics=K, num_words=10)

#### Non-Negative Matrix Factorization (NMF)

In [None]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 10
nmf_model = Nmf(corpus, id2word=id2word, num_topics=K)
nmf_model.show_topics(num_topics=K, num_words=10)

#### Non-Negative Matrix Factorization and tf-idf (NMF & tf-idf)

In [None]:
# https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb
from gensim.models.nmf import Nmf

K = 10
tfidf_nmf_model = Nmf(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_nmf_model.show_topics(num_topics=K, num_words=10)

#### Latent Dirichlet Allocation (LDA)

In [None]:
# Build LDA model
K = 10
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=K,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=400,
                                           passes=30,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the Keywords in the 3 topics
pprint(lda_model.print_topics())

#### Miscallenous

In [None]:
type(model)

In [None]:
model.print_topics(num_topics=20, num_words=10)

In [None]:
model[corpus[1]]

In [None]:
doc_top = []
for i in tqdm(range(len(corpus))):
    doc_top.append(model[corpus[i]])

In [None]:
rows = []
for i in range(len(corpus)):
    doc_top = []
    doc_topics = model[corpus[i]]
    for j in range(K):
        doc_top.append(doc_topics[j][1])
    rows.append(doc_top)

In [None]:
DTM = pd.DataFrame(rows)
#document_topic_matrix_sourcecode["identifier"] = df_sourcecode.iloc[:,0].tolist()
DTM

In [None]:
Y = train_labels_selected

In [None]:
# Dimensionality reduction: UMAP
import umap
import time

time_start = time.time()
umap = umap.UMAP(n_components = 2, n_neighbors = 10, min_dist = 0.1)
umap_results = umap.fit_transform(DTM)

print ('UMAP done! Time elapsed: {} seconds'.format(time.time()-time_start))


import matplotlib.pyplot as plt
%matplotlib inline
# Create the figure
fig = plt.figure( figsize=(8,8) )
ax = fig.add_subplot(1, 1, 1, title='UMAP' )
# Create the scatter
ax.scatter(
    x=umap_results[:,0], 
    y=umap_results[:,1], 
    c=Y, 
    cmap=plt.cm.get_cmap('Paired'), 
    alpha=1.0)
plt.show()