In [None]:
# imports
import pandas as pd, numpy as np

# None removes the column-width limit, so long review texts are displayed in full
pd.set_option('display.max_colwidth', None)

In [None]:
# load the review dataset (the original note lists investing.csv as the alternative input)
data = pd.read_csv('food.csv').drop(columns=['Unnamed: 0'])
data['date'] = pd.to_datetime(data['date'])

# rows flagged for analysis that still have preprocessed content
analysis_rows = data.loc[data['for_analysis'] & data['content_processed'].notnull()]

# tokenised preprocessed reviews and their original texts, aligned by position
# (used for the analysis and for the intrinsic evaluation)
fin_texts = [processed.split() for processed in analysis_rows['content_processed'].values.tolist()]
original_texts = analysis_rows['content'].values.tolist()
# total number of tokens across the selected reviews
corpus_size = sum(map(len, fin_texts))

In [None]:
# Wikipedia reference corpus used for the extrinsic evaluation
from scipy import sparse

wiki_corpus_size = 2531105734  # precomputed
wiki_matrix = sparse.load_npz("D:\\wiki\\wiki_dump_binary_matrix.npz")
# one feature per line; surrounding whitespace (including the newline) is stripped
with open("D:\\wiki\\features.txt", 'r', encoding='utf-8') as f:
    wiki_features = [line.strip() for line in f]

In [None]:
# train a hybrid tfidf model with a given threshold
# higher thresholds give greater weight to longer texts
THRESH = 6

import hybridtfidf

# the same model instance `h` is used for both the vectors and the weights below
h = hybridtfidf.HybridTfidf(threshold=THRESH)
# fit on the tokenised reviews and get their vector representation
document_vectors = h.fit_transform(fin_texts)
# presumably one salience weight per document — confirm against the hybridtfidf docs
document_weights = h.transform_to_weights(fin_texts)

In [None]:
# generate top-k summaries
# lower similarity thresh. will produce more distinct seeds
K = 50
SIMILARITY = 0.2

from hybridtfidf import utils

most_significant = utils.select_salient_posts(
    document_vectors,
    document_weights,
    k=K,
    similarity_threshold=SIMILARITY,
)

# show every selected review next to its preprocessed tokens, followed by a blank line
for i in most_significant:
    print(original_texts[i], fin_texts[i], "", sep="\n")

# the model and its outputs are not used past this point (presumably dropped to free memory);
# re-running this cell therefore requires re-running the training cell first
del h, document_vectors, document_weights

# the token lists of the selected posts become the seed keyword sets
keywords = [fin_texts[post_idx] for post_idx in most_significant]
keywords

In [None]:
# main cell
# we use R's implementation of keyATM so conversion to Python's objects is necessary
# converting data
from rpy2 import robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import r, StrVector, ListVector, pandas2ri

#pandas2ri.deactivate()
# switch on rpy2's pandas conversion before any of the R calls below are made
pandas2ri.activate()

import re
import math

# R packages exposed as Python objects; both must be installed in the R library that rpy2 uses
quanteda = importr('quanteda')
# keyATM package has to be installed first
# https://cran.r-project.org/web/packages/keyATM/index.html
keyATM = importr('keyATM')

# next two functions below perform optimized NPMI calculation over intrinsic (corpus)
# and extrinsic (wikipedia) datasets
def calculate_npmi_corpus_keywords(keywords):
    """Score every unordered keyword pair with NPMI against the review corpus.

    Documents are read from the module-level ``fin_texts``. Counts are
    token-weighted: a document that contains a word (or both words of a pair)
    contributes its full length, and the totals are divided by the
    module-level ``corpus_size`` to obtain probabilities.

    Args:
        keywords: iterable of words; all 2-combinations are scored, in the
            order ``itertools.combinations`` yields them.

    Returns:
        list of ``(w1, w2, npmi)`` tuples. ``npmi`` is ``-1.0`` when the two
        words never share a document, and ``1.0`` when their joint
        probability is exactly 1 (the formula would otherwise divide by zero).
    """
    # imported locally: the notebook never brings `combinations` into scope
    from itertools import combinations

    # one pass over the corpus: membership set and length of every document
    docs = [(set(doc), len(doc)) for doc in fin_texts]

    def weighted_count(*words):
        # summed length of the documents that contain all of `words`
        return sum(size for vocab, size in docs if all(w in vocab for w in words))

    keywords_counts = {}
    ret = []
    for w1, w2 in combinations(keywords, 2):
        # single-word counts are computed once per call and reused across pairs
        for word in (w1, w2):
            if word not in keywords_counts:
                keywords_counts[word] = weighted_count(word)

        joint = weighted_count(w1, w2)
        if joint == 0:  # they never occur together
            res = -1.0
        else:
            p_w1 = keywords_counts[w1] / corpus_size
            p_w2 = keywords_counts[w2] / corpus_size
            p_w1_w2 = joint / corpus_size
            log_joint = math.log(p_w1_w2)
            if log_joint == 0.0:
                # joint probability of 1: the normaliser is 0, report perfect association
                res = 1.0
            else:
                res = math.log(p_w1_w2 / (p_w1 * p_w2)) / -log_joint
        ret.append((w1, w2, res))
    return ret

# memoing: dense wiki column and its sum for every keyword that was already looked up
keywords_columns = {}
keywords_sums = {}
def calculate_npmi_wiki_keywords(keywords):
    """Score every unordered keyword pair with NPMI against the Wikipedia matrix.

    Each keyword is mapped to its column of the module-level ``wiki_matrix``
    through its position in ``wiki_features``. The dense column and its sum
    are cached in ``keywords_columns`` / ``keywords_sums`` across calls.
    Keywords that are not wiki features are left out of the pairs.
    Probabilities are counts divided by the module-level ``wiki_corpus_size``.

    Args:
        keywords: iterable of words.

    Returns:
        list of ``(w1, w2, npmi)`` tuples over the keywords found in the wiki
        features. ``npmi`` is ``-1.0`` when the two columns are never 1 in the
        same row, and ``1.0`` when the joint probability is exactly 1 (the
        formula would otherwise divide by zero).
    """
    # imported locally: the notebook never brings `combinations` into scope
    from itertools import combinations

    ret = []
    keywords_updated = []
    for keyword in keywords:
        if keyword not in keywords_columns:
            try:
                w_id = wiki_features.index(keyword)
            except ValueError:
                # not a wiki feature: skip it instead of failing the whole topic
                continue
            # .toarray() rather than the `.A` shorthand, which recent SciPy releases do not provide
            w_arr = wiki_matrix[:, w_id].toarray().flatten()
            keywords_columns[keyword] = w_arr
            keywords_sums[keyword] = w_arr.sum()
        keywords_updated.append(keyword)

    for w1, w2 in combinations(keywords_updated, 2):
        w1_arr = keywords_columns[w1]
        w2_arr = keywords_columns[w2]
        # rows in which both columns are 1
        joint = ((w1_arr == 1) & (w1_arr == w2_arr)).sum()

        if joint == 0:  # they never occur together
            res = -1.0
        else:
            p_w1 = keywords_sums[w1] / wiki_corpus_size
            p_w2 = keywords_sums[w2] / wiki_corpus_size
            p_w1_w2 = joint / wiki_corpus_size
            log_joint = math.log(p_w1_w2)
            if log_joint == 0.0:
                # joint probability of 1: the normaliser is 0, report perfect association
                res = 1.0
            else:
                res = math.log(p_w1_w2 / (p_w1 * p_w2)) / -log_joint
        ret.append((w1, w2, res))
    return ret

# evaluates the topic (top-10 words) using intrinsic and extrinsic evaluation
def evaluate_topic_npmi(keywords):
    """Average pairwise NPMI of a topic's words on both reference corpora.

    Args:
        keywords: the words of one topic.

    Returns:
        ``(wiki_mean, corpus_mean)``: the mean of the pair scores returned by
        ``calculate_npmi_wiki_keywords`` and by
        ``calculate_npmi_corpus_keywords`` respectively.
    """
    def mean_score(triples):
        # each entry is (w1, w2, npmi); only the score is averaged
        return np.mean([score for _w1, _w2, score in triples])

    wiki_pairs = calculate_npmi_wiki_keywords(keywords)
    corpus_pairs = calculate_npmi_corpus_keywords(keywords)
    return mean_score(wiki_pairs), mean_score(corpus_pairs)

# main function
def main_pipeline(data, keywords):
    """Fit a base keyATM model seeded with ``keywords`` and score its topics.

    Args:
        data: documents handed to ``quanteda.tokens``.
        keywords: sequence of keyword collections; entry ``i`` becomes the
            de-duplicated seed vector named ``"T<i>"``.

    Returns:
        ``(out, scores)``: the object returned by ``keyATM.keyATM`` and a list
        holding one ``(wiki_npmi, corpus_npmi)`` tuple per topic, computed on
        the ten highest-valued words of that topic's ``phi`` row.
    """
    # documents: tokens -> document-feature matrix -> keyATM input
    doc_feature_matrix = quanteda.dfm(quanteda.tokens(data))
    keyATM_docs = keyATM.keyATM_read(texts=doc_feature_matrix)

    # one named, de-duplicated keyword vector per seed set
    seed_entries = []
    for topic_no, keyword_set in enumerate(keywords):
        seed_entries.append(("T" + str(topic_no), StrVector(list(set(keyword_set)))))
    keyATM_keywords = ListVector(seed_entries)

    # train
    out = keyATM.keyATM(docs=keyATM_docs, no_keyword_topics=0, keywords=keyATM_keywords, model='base')

    # one row per topic, one column per vocabulary term
    topic_word = pd.DataFrame(out.rx('phi')[0], columns=out.rx('vocab')[0])
    topic_names = r.colnames(keyATM.top_docs(out))

    scores = []
    for row, _topic_name in zip(topic_word.index, topic_names):
        top_words = topic_word.iloc[row, :].sort_values(ascending=False).index[:10]
        score_wiki, score_corpus = evaluate_topic_npmi(top_words)
        scores.append((score_wiki, score_corpus))

    return out, scores

In [None]:
import pickle

# one entry per run; each entry lists the per-topic scores of that run
all_scores_npmi_wiki = []
all_scores_npmi_corpus = []
# train keyATM for K={10,20,...,100}
# NOTE(review): keywords[:i] cannot grow past len(keywords) — confirm enough seeds exist for the larger K
for i in range(10, 110, 10):
    out, scores = main_pipeline(fin_texts, keywords[:i])
    all_scores_npmi_wiki.append([pair[0] for pair in scores])
    all_scores_npmi_corpus.append([pair[1] for pair in scores])
    # save on each K: both files are rewritten with everything accumulated so far
    for path, payload in (
        ("scores\\npmi_wiki_0.txt", all_scores_npmi_wiki),
        ("scores\\npmi_corpus_0.txt", all_scores_npmi_corpus),
    ):
        with open(path, 'wb') as f:
            pickle.dump(payload, f)

In [None]:
# for keyATM configurations (keyATM-5, -15, -25)
# thresh defines the lower percentile
# returns new keywords
# keyATM can be re-trained using the main_pipeline with new keywords
def remove_low_score(keywords, thresh=5):
    """Drop the lowest-scoring words from every keyword set.

    A word's score is the mean corpus NPMI (``calculate_npmi_corpus_keywords``)
    over all pairs of its set that it takes part in. Words scoring below the
    ``thresh``-th percentile of their set's scores are removed; every score is
    printed, removed words with a ``REMOVED:`` line in addition.

    Args:
        keywords: iterable of keyword sets.
        thresh: lower percentile (0-100) used as the cut-off inside each set.

    Returns:
        list with one entry per input set: the surviving words ordered by
        descending score, or the input set itself, unchanged, when it yields
        no pairs to score.
    """
    ret = []
    for keyw_set in keywords:
        # per word, the NPMI of every pair it appears in
        pair_scores = {}
        for w1, w2, score in calculate_npmi_corpus_keywords(keyw_set):
            pair_scores.setdefault(w1, []).append(score)
            pair_scores.setdefault(w2, []).append(score)

        if not pair_scores:
            # no pairs, hence no scores to rank: keep the set as it is
            ret.append(keyw_set)
            continue

        scores = {w: np.mean(values) for w, values in pair_scores.items()}
        t = np.percentile(list(scores.values()), thresh)

        new_keyw = []
        for k, v in sorted(scores.items(), key=lambda x: x[1], reverse=True):
            if v < t:
                print(f"REMOVED: {k} {v}")
            else:
                new_keyw.append(k)
            print(k, v)
        ret.append(new_keyw)
    return ret

In [None]:
# LDA baseline
import gensim.corpora as corpora
import gensim
# Create Dictionary
id2word = corpora.Dictionary(fin_texts)
# Create Corpus
texts = fin_texts
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# one entry per run (num_topics = 10, 20, ..., 100); each entry lists the per-topic scores
lda_scores_npmi_wiki = []
lda_scores_npmi_corpus = []
# NOTE(review): no random_state is passed, so repeated runs will not reproduce the same topics
for i in range(10, 110, 10):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=i, 
                                       chunksize=100,
                                       per_word_topics=True,
                                       iterations=1500)
    # one column per topic of this model: the column count follows num_topics,
    # so no topic is dropped and no empty column is added
    LDA_doc_topic = pd.DataFrame([{pair[0]:pair[1] for pair in elem} for elem in lda_model.get_document_topics(corpus)], columns=range(i))
    # one row per topic, one column per vocabulary term
    LDA_topic_word = pd.DataFrame(lda_model.get_topics(), columns=[it[1] for it in id2word.items()])
    
    temp_wiki = []
    temp_corpus = []
    for j in LDA_topic_word.index:
        # ten highest-valued words of topic j
        keyw = list(LDA_topic_word.iloc[j,:].sort_values(ascending=False).index[:10])
        score_wiki, score_corpus = evaluate_topic_npmi(keyw)
        temp_wiki.append(score_wiki)
        temp_corpus.append(score_corpus)
    lda_scores_npmi_wiki.append(temp_wiki)
    lda_scores_npmi_corpus.append(temp_corpus)