In [None]:
# Necessary for importing modules from a sub-directory
import os
import sys
# Put the parent directory on sys.path so the project modules imported below
# (preprocess, evaluation) resolve when the notebook is started from a
# sub-directory.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
# Compare the last path component via os.path.basename instead of splitting on
# '/': the split never matches on platforms whose separator is not '/'.
if os.path.basename(os.getcwd()) == 'notebooks':
    print("CHANGE DIR TO ROOT")
    os.chdir(r"../")
    
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import time
import pickle
import tqdm
import nltk
from preprocess import build_ne_gold_phraser, get_gold_ngrams, apply_ngrams
import gensim
from evaluation import get_median ,get_bot, get_top
from gensim import matutils, models, corpora
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# Export the top words of every topic to a CSV file
def print_topics_words(model, num_topics, topn=10, out_path=None):
    """Write the ``topn`` top words of each topic to a CSV file.

    Parameters
    ----------
    model : object
        Must expose ``show_topic(topic_id, topn=...)`` returning a sequence
        whose items hold the word at index 0.
    num_topics : int
        Topics ``0 .. num_topics - 1`` are exported, one column per topic,
        named ``T#00``, ``T#01``, ...
    topn : int, optional
        Number of words requested per topic (default 10).
    out_path : str, optional
        Destination file. Defaults to ``DN_mallet_<num_topics>.csv`` in the
        current working directory.

    Returns
    -------
    None
        Nothing is written when no topic was collected (``num_topics <= 0``).
    """
    word_dict = {}
    for topic_id in range(num_topics):
        words = model.show_topic(topic_id, topn=topn)
        word_dict['T#' + '{:02d}'.format(topic_id)] = [entry[0] for entry in words]

    if not word_dict:
        return

    if out_path is None:
        out_path = f'DN_mallet_{num_topics}.csv'

    # Build the frame and write the file once, after all topics are collected,
    # instead of rewriting the CSV on every loop iteration.
    pd.DataFrame(word_dict).to_csv(out_path)

In [None]:
# Collect the top words of every topic as input for the coherence calculation
def get_lda_topics(model, num_topics):
    """Return one list of the 10 top words per topic, ordered by topic index.

    ``model`` must expose ``show_topic(topic_id, topn=...)``; the word is taken
    from index 0 of every returned item.
    """
    return [
        [entry[0] for entry in model.show_topic(topic_id, topn=10)]
        for topic_id in range(num_topics)
    ]

In [None]:
# Share of each document's topic weight that sits in its top_n strongest topics
def doc_top_relative_sparseness(mod_type, model, corpus, top_n):
    """Return, per document, sum of the ``top_n`` largest topic weights / sum of all weights.

    For ``mod_type == 'gensim'`` the document-topic pairs come from
    ``model.get_document_topics(corpus, ...)``; for any other value they are
    read from the file named by ``model.fdoctopics()`` and ``corpus`` is not
    used. The weight is taken from index 1 of every (topic, weight) pair.
    """
    if mod_type == 'gensim':
        doc_topics = model.get_document_topics(
            corpus, per_word_topics=False, minimum_probability=0.0)
    else:
        doctopics_file = model.fdoctopics()
        doc_topics = model.read_doctopics(doctopics_file, eps=0, renorm=False)

    ratios = []
    for doc in doc_topics:
        weights = [pair[1] for pair in doc]
        weights.sort(reverse=True)
        ratios.append(sum(weights[:top_n]) / sum(weights))
    return ratios

In [None]:
# Path handed to the LdaMallet wrapper (relative to the current working directory)
mallet_path = 'mallet-2.0.8/bin/mallet'

In [None]:
def compute_coherence_values(mod_type, corpus, id2word, data, s, k, alpha, eta, ntw, top_n,
                             random_state=None):
    """Train one topic model with ``k`` topics and score it.

    Besides its arguments this function reads three notebook-level globals:
    ``mallet_path``, ``tok2id`` and ``extrinsic_data``. They must be defined
    before it is called.

    Parameters
    ----------
    mod_type : str
        ``'gensim'`` trains ``gensim.models.LdaModel``; any other value trains
        the ``LdaMallet`` wrapper.
    corpus, id2word
        Bag-of-words corpus and dictionary the model is trained on.
    data
        Tokenised texts used for the ``u_mass`` coherence.
    s
        Unused here; kept so existing positional calls keep working.
    k : int
        Number of topics.
    alpha, eta
        Forwarded to the gensim model only; the mallet branch does not use them.
    ntw : int
        ``topn`` passed to both coherence models. The topic word lists passed
        in hold 10 words each.
    top_n : int
        Number of strongest topics per document used for relative sparseness.
    random_state : int, optional
        Seed for reproducible training. ``None`` (default) keeps the previous
        unseeded behaviour. It is passed as ``random_state`` to the gensim
        model and as ``random_seed`` to the mallet wrapper.
        NOTE(review): confirm the installed gensim version's ``LdaMallet``
        accepts ``random_seed``.

    Returns
    -------
    tuple
        ``(model, c_v coherence per topic, u_mass coherence per topic,
        relative sparseness per document)``.
    """
    if mod_type == 'gensim':
        # LDA implemented with gensim
        model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=k,
                                       alpha=alpha,
                                       eta=eta,
                                       passes=3,
                                       iterations=100,
                                       random_state=random_state)
    else:
        # LDA implemented with mallet; only pass a seed when one was requested
        # so the wrapper's own default stays in effect otherwise.
        mallet_kwargs = {} if random_state is None else {'random_seed': random_state}
        model = models.wrappers.LdaMallet(mallet_path,
                                          corpus=corpus,
                                          id2word=id2word,
                                          num_topics=k,
                                          **mallet_kwargs)

    print_topics_words(model, k)
    topics = get_lda_topics(model, k)

    # Topic terms unknown to the extrinsic corpus are appended to it as
    # one-term documents, so the c_v dictionary contains every topic term.
    flat_topic_terms = [term for topic in topics for term in topic]
    topic_terms_not_in_ext_corpus = [[term] for term in flat_topic_terms if term not in tok2id]

    n_missing_terms = len(topic_terms_not_in_ext_corpus)
    print(f'{n_missing_terms} number of terms not in extrinsic corpus added as separate one-term documents')

    smooth_corpus = extrinsic_data + topic_terms_not_in_ext_corpus
    smooth_gensim_dict = corpora.Dictionary(smooth_corpus)

    # c_v is computed on the (smoothed) extrinsic corpus,
    # u_mass on the texts the model was trained on.
    cv = CoherenceModel(topics=topics,
                        texts=smooth_corpus,
                        dictionary=smooth_gensim_dict,
                        topn=ntw,
                        coherence='c_v')
    umass = CoherenceModel(topics=topics,
                           texts=data,
                           dictionary=id2word,
                           topn=ntw,
                           coherence='u_mass')

    # Calculate relative sparseness
    res = doc_top_relative_sparseness(mod_type, model, corpus, top_n)

    return model, cv.get_coherence_per_topic(), umass.get_coherence_per_topic(), res

In [None]:
def print_coherence_values(mod_type, corpus, id2word, data, s, topics_range, alpha, eta, ntw, top_n, n_grams,
                           results_path='DN_mallet_results.csv'):
    """Train and score one model per topic count and write the results to CSV.

    For every ``k`` in ``topics_range`` this calls ``compute_coherence_values``
    and records the run date, model type, ``s``, ``k``, runtime in seconds and
    ``n_grams``. For each of the three score lists (c_v, u_mass, relative
    sparseness) it also records the rounded mean plus the values returned by
    ``get_top`` and ``get_bot``, which are applied to the ascending-sorted
    scores.
    NOTE(review): get_top / get_bot come from the project's ``evaluation``
    module; their exact semantics are not visible here.

    Parameters
    ----------
    mod_type, corpus, id2word, data, s, alpha, eta, ntw, top_n
        Forwarded unchanged to ``compute_coherence_values``.
    topics_range : sized iterable of int
        Topic counts to evaluate; ``len()`` is used for the progress bar.
    n_grams
        Only stored in the ``n_grams`` result column.
    results_path : str, optional
        CSV file to (over)write. The default is the previous hard-coded
        ``DN_mallet_results.csv``, which is used regardless of ``mod_type``.

    Returns
    -------
    The model trained for the last topic count, or ``None`` when
    ``topics_range`` is empty.
    """
    columns = ['Date', 'Model', '#articles', '#topics', 'time (s)', 'n_grams',
               'cv_avg', 'cv_top', 'cv_bot',
               'umass_avg', 'umass_top', 'umass_bot',
               'rs_avg', 'rs_top', 'rs_bot']
    model_results = {column: [] for column in columns}

    # Stays None for an empty topics_range instead of raising
    # UnboundLocalError at the return statement.
    model = None

    # The context manager closes the progress bar even if training raises.
    with tqdm.tqdm(total=len(topics_range)) as pbar:
        for k in topics_range:
            # get the coherence score for the given parameters
            t0 = time.time()
            model, cv, umass, res = compute_coherence_values(mod_type, corpus, id2word, data, s, k, alpha, eta, ntw, top_n)

            # calculate runtime
            runtime = round((time.time() - t0), 1)

            # get current date and time
            date = datetime.now().strftime("%d/%m/%Y %H:%M")

            row = {'Date': date,
                   'Model': mod_type,
                   '#articles': s,
                   '#topics': k,
                   'time (s)': runtime,
                   'n_grams': n_grams}

            # Same three summary statistics for every metric
            for prefix, scores in (('cv', cv), ('umass', umass), ('rs', res)):
                ordered = sorted(scores)
                row[prefix + '_avg'] = round(np.mean(ordered), 3)
                row[prefix + '_top'] = get_top(ordered)
                row[prefix + '_bot'] = get_bot(ordered)

            # Save the model results
            for column, value in row.items():
                model_results[column].append(value)
            pbar.update(1)

    pd.DataFrame(model_results).to_csv(results_path, index=False)
    return model

In [None]:
#############################################   TEST PARAMETERS  ###################################################

DATASET_TYPE = 'DN'
DATASET_SIZE = '10000'
DATASET_INDEX = '1'


def _load_pickle(path):
    """Unpickle the file at ``path`` and close the file handle afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load pickles from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load pickles: the dataset is stored in five parts, concatenated in order below
c1, c2, c3, c4, c5 = (
    _load_pickle(f'pickles/efselab_{DATASET_TYPE}_{DATASET_SIZE}_{part}.pkl')
    for part in range(1, 6)
)
extrinsic_data = _load_pickle("pickles/efselab_extrinsic_20000.pkl")
data = c1 + c2 + c3 + c4 + c5

# Use n_grams or not
n_grams = False


# If n_grams is True, apply the same phraser to data and extrinsic data
if n_grams is True:
    NE_GRAMS_FILE = f'ngrams_per_dataset/ne_ngrams_{DATASET_TYPE}_{DATASET_SIZE}_{DATASET_INDEX}.pkl'
    # Context manager closes the file handle once the pickle is read.
    # NOTE(review): only unpickle files from a trusted source.
    with open(NE_GRAMS_FILE, 'rb') as ngram_file:
        ne_ngrams = pickle.load(ngram_file)
    gold_ngrams = get_gold_ngrams()

    # Optimized pipeline for applying ne+gold ngrams to a corpus (with ne and gold pickles)
    phraser = build_ne_gold_phraser(ne_ngrams, gold_ngrams)
    data = apply_ngrams(data, phraser)

    # ngram on extrinsic
    extrinsic_data = apply_ngrams(extrinsic_data, phraser)
    
    
# Format Gensim components for data
s = len(data)
id2word = corpora.Dictionary(data)
# no_below=2 is the only filter set explicitly; the other filter_extremes
# defaults stay in effect.
id2word.filter_extremes(no_below=2)
corpus = [id2word.doc2bow(text) for text in data]

# Format Gensim components for extrinsic data
extrinsic_id2word = corpora.Dictionary(extrinsic_data)
extrinsic_corpus = [extrinsic_id2word.doc2bow(text) for text in extrinsic_data]

# Format Gensim components for Coherence metrics.
# Reuse the dictionary built above instead of scanning extrinsic_data a second
# time; both names now refer to the same, unmodified Dictionary object.
ext_gensim_dict = extrinsic_id2word
tok2id = ext_gensim_dict.token2id

# Model type: 'mallet' or 'gensim'
mod_type = 'mallet'

# Topic counts to evaluate: from min_topics up to (not including) max_topics
min_topics, max_topics, step_size = 10, 151, 10
topics_range = range(min_topics, max_topics, step_size)

# Dirichlet hyperparameters
alpha = 'auto'  # alpha: Document-Topic Density
eta = 'auto'  # eta (beta): Word-Topic Density

# Number of top words used to compute coherence
ntw = 10

# Number of topics used for the calculation of matrix sparseness
top_n = 1

# Run the sweep over topics_range; keeps the model returned by print_coherence_values
model = print_coherence_values(
    mod_type=mod_type,
    corpus=corpus,
    id2word=id2word,
    data=data,
    s=s,
    topics_range=topics_range,
    alpha=alpha,
    eta=eta,
    ntw=ntw,
    top_n=top_n,
    n_grams=n_grams,
)