#### Topic Modeling on Article data

In [None]:
import pandas as pd
import numpy as np

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import shared_data

In [4]:
content = shared_data.SharedData().get_content()

In [None]:
content.head()

Remove all docs that are not in English

In [None]:
content.language.value_counts()

In [None]:
content.shape

In [None]:
content = content[content['language'] == 'en']

In [None]:
content.shape

In [None]:
no_dups = content.sort_values('event_timestamp').drop_duplicates(subset=['title', 'text_description'], keep='last')

In [None]:
no_dups.head()

In [None]:
no_dups.reset_index(inplace=True)

In [None]:
no_dups.interaction_type.value_counts()

In [None]:
no_dups[no_dups['title'] == "Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's"]

In [None]:
content[content['title'] == "Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's"]

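This makes sense: `no_dups` keeps only the most recent row (by `event_timestamp`) for each title/description pair, while `content` can still hold several rows for the same article.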

In [None]:
content = no_dups

Keep only the title, text description, and interaction type columns.

In [None]:
# Take an explicit copy: later cells add columns to `trimmed`, and assigning
# into a column subset of `content` would otherwise trigger pandas'
# SettingWithCopy behaviour (hidden here by the blanket warnings filter).
trimmed = content[['title', 'text_description', 'interaction_type']].copy()

In [None]:
trimmed.head()

Preprocessing

In [None]:
# Strip every run of characters that is neither a word character nor whitespace.
# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, which would leave the text as is.
trimmed['text_no_punc'] = trimmed.text_description.str.replace(r'[^\w\s]+', '', regex=True)

In [None]:
trimmed.head()

In [None]:
# Convert trimmed.text_no_punc to lowercase and assign to trimmed.text_no_punc_lower
trimmed['text_no_punc_lower'] = trimmed.text_no_punc.str.lower()

In [None]:
trimmed.head()

#### Build a WordCloud

In [None]:
%pip install wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
# Create a wordcloud of trimmed.text_no_punc_lower
# A missing description is NaN (a float), which str.join cannot concatenate,
# so drop missing values before building the single input string.
long_string = ','.join(trimmed.text_no_punc_lower.dropna())
wc = WordCloud(background_color="white", max_words=5000, contour_width=5, contour_color='steelblue')
wc.generate(long_string)
wc.to_image()

#### Prepare the data for LDA

In [None]:
%pip install gensim

In [None]:
import gensim
from gensim.utils import simple_preprocess

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
stopwords_en = stopwords.words('english')

In [None]:
# Tokenize documents lazily with gensim's simple_preprocess
def sentences_to_words(sentences):
    """Yield one token list per item of ``sentences``.

    Each item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``. This is a generator, so
    nothing is tokenized until the result is iterated.
    """
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)

# define a function to remove stopwords
# def remove_stopwords(words):
    # return [w for w in words if w not in stopwords_en]

In [None]:
# Define a function to remove stopwords using gensim's simple_preprocess
def remove_stopwords(docs):
    """Tokenize each document and drop the words listed in ``stopwords_en``.

    Args:
        docs: Iterable of documents. Every item is passed through ``str()``
            and gensim's ``simple_preprocess`` before filtering.

    Returns:
        A list with one list of remaining tokens per document.
    """
    # Build the lookup once: `stopwords_en` is a list, so testing membership
    # against it directly rescans the whole list for every single token.
    stop_set = set(stopwords_en)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in docs
    ]

In [None]:
data = trimmed.text_description.values.tolist()
data_words = list(sentences_to_words(data))

In [None]:
# Create bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Apply the trained bigram phraser to a collection of tokenized documents
def make_bigrams(texts):
    """Return a list holding ``bigram_mod[doc]`` for every ``doc`` in ``texts``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

In [None]:
%pip install spacy

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Define a function to lemmatize given texts
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: Iterable of token lists; each list is joined with single
            spaces and parsed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: Coarse part-of-speech tags (``token.pos_``) to keep.
            The default is an immutable tuple so it cannot be altered by
            accident between calls; any iterable of tag names is accepted.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    keep = set(allowed_postags)
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents in batches, which avoids the per-call
    # overhead of invoking nlp() once per document; results keep input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep]
        for doc in nlp.pipe(joined)
    ]

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
# data_words_bigrams = make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops)

In [None]:
# sentences = trimmed.text_description.values.tolist()

In [None]:
# words_array = list(sentences_to_words(sentences))

In [None]:
# words_array[0]

In [None]:
# no_stopwords = [remove_stopwords(words) for words in words_array]

In [None]:
# no_stopwords[0]

Build a Dictionary

In [None]:
from gensim.corpora import Dictionary

data_dictionary = Dictionary(data_lemmatized)
corpus = [data_dictionary.doc2bow(word_arr) for word_arr in data_lemmatized]

In [None]:
# data_dictionary.doc2bow(words[0])
# len(words[0])
# words[0][13]

In [None]:
# corpus[0]

In [None]:
# num_topics = 20

In [None]:
update_every = [1, 2, 5, 10]
chunksize = [10, 20, 25, 50, 75, 100]
passes = [1, 2, 5, 10]
num_topics = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

In [None]:
import itertools

In [None]:
# Create a DataFrame to store the results
results = pd.DataFrame(columns=['update_every', 'chunksize', 'passes', 'num_topics', 'perplexity', 'coherence'])

In [None]:
# Grid search over the LDA hyper-parameters defined above.
# One record is collected per combination and the DataFrame is built once at
# the end: DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row is quadratic. Rebuilding `results` here also makes the cell safe
# to re-run without duplicating rows.
grid_records = []
for ue, cs, p, nt in itertools.product(update_every, chunksize, passes, num_topics):
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=data_dictionary,
        random_state=42,
        alpha='auto',
        eta='auto',
        num_topics=nt,
        update_every=ue,
        chunksize=cs,
        passes=p,
        per_word_topics=True
    )

    # Compute Perplexity (gensim reports this as a per-word likelihood bound)
    perplexity = lda_model.log_perplexity(corpus)

    # Compute Coherence Score
    coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=data_dictionary, coherence='c_v')
    coherence = coherence_model_lda.get_coherence()

    grid_records.append({'update_every': ue, 'chunksize': cs, 'passes': p, 'num_topics': nt, 'perplexity': perplexity, 'coherence': coherence})

results = pd.DataFrame(
    grid_records,
    columns=['update_every', 'chunksize', 'passes', 'num_topics', 'perplexity', 'coherence'],
)

In [None]:
# Train the final model.
# `num_topics` (the name) is the list of grid-search candidates at this point,
# which LdaModel cannot convert to an int, so the topic count is given
# explicitly. 10 matches the hand-written labels for topics 0-9 further down.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=data_dictionary,
    num_topics=10,
    random_state=42,
    update_every=1,
    chunksize=50,
    passes=10,
    alpha='auto',
    eta='auto',
    per_word_topics=True
)

In [None]:
# from pprint import pprint
# pprint(lda_model.print_topics())
lda_model.print_topics()

In [None]:
doc_lda = lda_model[corpus]

Evaluation

In [None]:
lda_model.log_perplexity(corpus)

In [None]:
doc_lda

In [None]:
doc_lda[0]

In [None]:
lda_model.get_document_topics(corpus[0], minimum_probability=0.01)

In [None]:
lda_model.show_topic(0)

In [None]:
all_topics = lda_model.get_document_topics(corpus, minimum_probability=0.01)

In [None]:
all_topics_matrix = gensim.matutils.corpus2csc(all_topics)

In [None]:
all_topics_matrix

Matrix is 10 X 3026, meaning it has one row for each topic, and one column for each item.

So take the transpose, giving one row per item and one column per topic.

In [None]:
all_topics_arr = all_topics_matrix.T.toarray()

In [None]:
all_topics_df = pd.DataFrame(all_topics_arr)

In [None]:
all_topics_df.head()

In [None]:
all_topics_df[1].value_counts()

Squashing the topics into a single list

In [None]:
for idx, row in enumerate(lda_model[corpus]):
    print(idx, row)
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    print(row)

    # Extract the first item from each tuple in row
    topic_indices = [x[0] for x in row]

    # print(lda_model.show_topic(row[0][0]))
    print(topic_indices)
    break

In [None]:
def get_topic_indices(row):
    """Return the topic ids of one document, most probable topic first.

    Args:
        row: Either a sequence of ``(topic_id, probability)`` pairs, or the
            3-tuple ``(topic_pairs, word_topics, word_phis)`` that an LDA
            model yields per document when ``per_word_topics`` is enabled.

    Returns:
        List of topic ids sorted by descending probability (ties keep their
        original relative order).
    """
    # With per_word_topics enabled the document-level distribution is only the
    # first element of the tuple; sorting the tuple itself would be meaningless.
    if isinstance(row, tuple) and row and isinstance(row[0], list):
        row = row[0]
    ranked = sorted(row, key=lambda pair: pair[1], reverse=True)
    return [pair[0] for pair in ranked]

In [None]:
# Map the get_topic_indices function to lda_model[corpus]
topic_indices = list(map(get_topic_indices, lda_model[corpus]))

In [None]:
topic_indices[0]

In [None]:
# Convert topic_indices to a Series, and convert the Series into a DataFrame
topic_indices_df = pd.DataFrame(pd.Series(topic_indices))

In [None]:
topic_indices_df.head()

In [None]:
# Concatenate trimmed and topic_indices_df
trimmed_topics = pd.concat([trimmed, topic_indices_df], axis=1)

In [None]:
trimmed_topics.head()

In [None]:
# Rename the column in trimmed_topics
trimmed_topics = trimmed_topics.rename(columns={0: 'topic_indices'})

In [None]:
trimmed_topics.head()

In [None]:
lda_model.print_topics()

In [None]:
# Hand-assigned, human-readable labels for each LDA topic id.
_TOPIC_NAMES_BY_INDEX = {
    0: ('Customer Service', 'Technology', 'Business'),
    1: ('Encryption', 'Technology', 'Digital Security', 'Computer Science'),
    2: ('Programming', 'API', 'Java', 'Technology'),
    3: ('Cloud', 'AWS', 'Amazon Web Services', 'Internet', 'Technology'),
    4: ('Teams', 'People', 'Office'),
    5: ('Mobiles', 'Technology', 'Android', 'IOS', 'Apple', 'Google'),
    6: ('Cloud', 'AWS', 'Amazon Web Services', 'Internet'),
    7: ('Cloud', 'AWS', 'Amazon Web Services', 'Internet'),
    8: ('Cloud', 'AWS', 'Amazon Web Services', 'Internet'),
    9: ('Cloud', 'AWS', 'Amazon Web Services', 'Internet'),
}


def get_topic_names_from_index(idx):
    """Return the labels assigned to topic ``idx``.

    Returns a fresh list on every call, and an empty list for an index that
    has no labels.
    """
    return list(_TOPIC_NAMES_BY_INDEX.get(idx, ()))


def get_topic_names(row):
    """Return the distinct labels of all topic ids in ``row``.

    Labels are de-duplicated in first-seen order, so the result is the same
    on every run. (Going through ``set`` gives an order that depends on
    string hashing and can change between kernel sessions.)
    """
    return list(dict.fromkeys(
        name
        for idx in row
        for name in get_topic_names_from_index(idx)
    ))

In [None]:
# Drop text_no_punc and text_no_punc_lower from trimmed_topics
trimmed_topics = trimmed_topics.drop(['text_no_punc', 'text_no_punc_lower'], axis=1)

In [None]:
trimmed_topics.head()

In [None]:
# Map each row's list of topic ids to its de-duplicated list of topic labels
trimmed_topics['topic_names'] = trimmed_topics['topic_indices'].apply(get_topic_names)

In [None]:
trimmed_topics.head()

In [None]:
# Drop topic_indices from trimmed_topics
trimmed_topics = trimmed_topics.drop('topic_indices', axis=1)

In [None]:
trimmed_topics.head()

In [None]:
trimmed_topics['topic_names']

In [None]:
# Add the topic_names column to content.
# assign() aligns on the index like concat(axis=1) does, but overwrites the
# column if it already exists, so re-running this cell does not append a
# second `topic_names` column.
content = content.assign(topic_names=trimmed_topics['topic_names'])

In [None]:
content.head()

In [None]:
trimmed.shape

In [None]:
# FIXME: `y` is never defined anywhere in this notebook, so evaluating it
# raises NameError on a fresh kernel (Restart & Run All). Disabled until the
# intended variable is identified.
# y.shape

In [None]:
# x.columns

In [None]:
trimmed.shape

In [None]:
dups = trimmed[trimmed.duplicated(subset=['title', 'text_description'])]

In [None]:
dups.head(20)

Based on the user topics, the recommended articles change.

* At the start of the day, recommend 10 articles based on user interests
* Adjust user interests based on
    * Direct feedback
    * Clicks
    * Time spent on the article page (some threshold)
* When a new user is created, user will choose some topics
* New articles should be recommended based on collaborative and content-based filtering

### Topic modeling on articles