In [10]:
# Necessary for importing modules from a sub-directory
import os
import sys

# Put the parent of the current working directory on the import path so that
# modules located there can be imported from this notebook.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# When the kernel was started inside a folder named 'notebooks', move one level
# up so that relative paths are resolved from the parent directory.
# os.path.basename is used instead of splitting on '/' so the check also works
# where the path separator is not a forward slash (e.g. Windows).
if os.path.basename(os.getcwd()) == 'notebooks':
    print("CHANGE DIR TO ROOT")
    os.chdir(r"../")
    
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import time
import pickle
import tqdm
import nltk
from preprocess import build_ne_gold_phraser, get_gold_ngrams, apply_ngrams
import gensim
from evaluation import get_median ,get_bot, get_top
from gensim import matutils, models, corpora
from gensim.models.coherencemodel import CoherenceModel

In [11]:
# Extracting topic terms to be used for calculation of coherence
def get_lda_topics(model, num_topics, topn=10):
    """Collect the top terms of each topic of a trained topic model.

    Args:
        model: Object exposing ``show_topic(topic_id, topn=...)`` which returns
            a sequence of entries whose first element is the term.
        num_topics: Number of topics to read; topic ids ``0 .. num_topics - 1``
            are queried in order.
        topn: Number of top terms requested per topic. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        A list with one item per topic; each item is the list of that topic's
        terms in the order returned by ``show_topic``.
    """
    return [
        [entry[0] for entry in model.show_topic(topic_id, topn=topn)]
        for topic_id in range(num_topics)
    ]

In [12]:
# Calculate document-topic relative sparseness
def doc_top_relative_sparseness(mod_type, model, corpus, top_n):
    """Share of each document's topic weight held by its ``top_n`` largest topics.

    Args:
        mod_type: ``'gensim'`` reads the per-document topic weights through
            ``model.get_document_topics``; any other value reads them through
            ``model.fdoctopics()`` and ``model.read_doctopics``.
        model: Trained topic model offering the methods named above.
        corpus: Corpus handed to ``get_document_topics``; only used when
            ``mod_type == 'gensim'``.
        top_n: Number of largest topic weights to sum per document.

    Returns:
        A list with one float per document: the sum of the ``top_n`` largest
        weights divided by the sum of all weights. A document whose weights sum
        to zero (including one with no topics at all) yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    if mod_type == 'gensim':
        doc_topics = model.get_document_topics(corpus, per_word_topics=False, minimum_probability=0.0)
    else:
        fname = model.fdoctopics()
        doc_topics = model.read_doctopics(fname, eps=0, renorm=False)

    # Both sources yield, per document, entries whose second element is the
    # topic weight, so a single loop serves both model types.
    res = []
    for doc in doc_topics:
        weights = sorted((topic[1] for topic in doc), reverse=True)
        total = sum(weights)
        res.append(sum(weights[:top_n]) / total if total else 0.0)
    return res

In [13]:
# Set mallet path
# Relative path to the MALLET launcher; compute_coherence_values passes it to
# the LdaMallet wrapper when a non-gensim model type is requested.
mallet_path ='mallet-2.0.8/bin/mallet'

In [14]:
def compute_coherence_values(mod_type, corpus, id2word, data, s, k, alpha, eta, ntw, top_n):
    """Train one topic model and score it with c_v, u_mass and relative sparseness.

    Besides its arguments, the function reads the notebook-level names
    ``mallet_path``, ``tok2id`` and ``extrinsic_data``.

    Args:
        mod_type: ``'gensim'`` trains ``gensim.models.LdaModel``; any other
            value trains the ``LdaMallet`` wrapper.
        corpus: Bag-of-words corpus used for training.
        id2word: Dictionary matching ``corpus``.
        data: Tokenised texts used for the u_mass coherence.
        s: Not used inside this function.
        k: Number of topics.
        alpha: Passed to ``LdaModel`` only (ignored for the MALLET wrapper).
        eta: Passed to ``LdaModel`` only (ignored for the MALLET wrapper).
        ntw: ``topn`` handed to both ``CoherenceModel`` instances.
        top_n: Forwarded to ``doc_top_relative_sparseness``.

    Returns:
        Tuple ``(model, c_v per topic, u_mass per topic, relative sparseness
        per document)``.
    """
    if mod_type != 'gensim':
        # LDA implemented with mallet
        model = models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            id2word=id2word,
            num_topics=k,
            iterations=2000,
        )
    else:
        # LDA implemented with gensim
        model = gensim.models.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=k,
            alpha=alpha,
            eta=eta,
            passes=3,
            iterations=100,
        )

    topics = get_lda_topics(model, k)

    # Every topic term unknown to the extrinsic vocabulary becomes its own
    # one-term document, so the c_v dictionary below contains all topic terms.
    topic_terms_not_in_ext_corpus = []
    for topic in topics:
        for term in topic:
            if term not in tok2id:
                topic_terms_not_in_ext_corpus.append([term])

    missing_topics = len(topic_terms_not_in_ext_corpus)
    print(f'{missing_topics} number of terms not in extrinsic corpus added as separate one-term documents')

    smooth_corpus = extrinsic_data + topic_terms_not_in_ext_corpus
    smooth_gensim_dict = corpora.Dictionary(smooth_corpus)

    # c_v is computed against the extended extrinsic texts, u_mass against the
    # texts passed in as `data`.
    cv = CoherenceModel(
        topics=topics,
        texts=smooth_corpus,
        dictionary=smooth_gensim_dict,
        topn=ntw,
        coherence='c_v',
    )
    umass = CoherenceModel(
        topics=topics,
        texts=data,
        dictionary=id2word,
        topn=ntw,
        coherence='u_mass',
    )

    # Relative sparseness of the document-topic distribution
    res = doc_top_relative_sparseness(mod_type, model, corpus, top_n)

    cv_per_topic = cv.get_coherence_per_topic()
    umass_per_topic = umass.get_coherence_per_topic()
    return model, cv_per_topic, umass_per_topic, res

In [15]:
def print_coherence_values(mod_type, corpus, id2word, data, s, k, alpha, eta, ntw, top_n, n_grams):
    """Train a model via ``compute_coherence_values``, summarise it, return the model.

    A one-row result table (date, settings, runtime and the average/top/bottom
    summaries of c_v, u_mass and relative sparseness) is assembled in
    ``model_results``. Writing that table to CSV is currently commented out,
    so the table is discarded and only the trained model is returned.

    Args:
        mod_type, corpus, id2word, data, s, k, alpha, eta, ntw, top_n:
            Forwarded unchanged to ``compute_coherence_values``; ``mod_type``,
            ``s`` and ``k`` are also recorded in the result table.
        n_grams: Recorded in the result table only.

    Returns:
        The trained model returned by ``compute_coherence_values``.
    """
    def _summarise(values):
        # Sort once, then report the mean (3 decimals) plus the project's
        # get_top / get_bot summaries of the sorted values.
        ordered = sorted(values)
        return round(np.mean(ordered), 3), get_top(ordered), get_bot(ordered)

    # Train and score the model, timing the whole call.
    started = time.time()
    model, cv, umass, res = compute_coherence_values(mod_type, corpus, id2word, data, s, k, alpha, eta, ntw, top_n)
    finished = time.time()
    runtime = round((finished - started), 1)

    # Timestamp of this run
    date = datetime.now().strftime("%d/%m/%Y %H:%M")

    cv_avg, cv_top, cv_bot = _summarise(cv)
    umass_avg, umass_top, umass_bot = _summarise(umass)
    rs_avg, rs_top, rs_bot = _summarise(res)

    # One row per run, stored column-wise so it can be turned into a DataFrame.
    model_results = {
        'Date': [date],
        'Model': [mod_type],
        '#articles': [s],
        '#topics': [k],
        'time (s)': [runtime],
        'n_grams': [n_grams],
        'cv_avg': [cv_avg],
        'cv_top': [cv_top],
        'cv_bot': [cv_bot],
        'umass_avg': [umass_avg],
        'umass_top': [umass_top],
        'umass_bot': [umass_bot],
        'rs_avg': [rs_avg],
        'rs_top': [rs_top],
        'rs_bot': [rs_bot],
    }

    # Export is disabled; uncomment one of the lines below to persist the row.
    #pd.DataFrame(model_results).to_csv('lda_tuning_results_one.csv', index=False)
    #pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False, mode='a', header=False)
    return model

In [16]:
# TEST PARAMETERS

DATASET_TYPE = 'BN'
DATASET_SIZE = '10000'
DATASET_INDEX = '1'


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load pickles: five dataset parts plus the extrinsic reference corpus
c1, c2, c3, c4, c5 = (
    _load_pickle(f'pickles/efselab_{DATASET_TYPE}_{DATASET_SIZE}_{part}.pkl')
    for part in range(1, 6)
)
extrinsic_data = _load_pickle("pickles/efselab_extrinsic_20000.pkl")
data = c1 + c2 + c3 + c4 + c5


# Use n_grams or not
n_grams = False


# If n_grams is True, apply the n-grams to both data and extrinsic data
if n_grams is True:
    NE_GRAMS_FILE = f'ngrams_per_dataset/ne_ngrams_{DATASET_TYPE}_{DATASET_SIZE}_{DATASET_INDEX}.pkl'
    ne_ngrams = _load_pickle(NE_GRAMS_FILE)
    gold_ngrams = get_gold_ngrams()

    # Optimized pipeline for applying ne+gold ngrams to a corpus (with ne and gold pickles)
    phraser = build_ne_gold_phraser(ne_ngrams, gold_ngrams)
    data = apply_ngrams(data, phraser)

    # Apply the same phraser to the extrinsic data
    extrinsic_data = apply_ngrams(extrinsic_data, phraser)


# Format Gensim components for data
s = len(data)
id2word = corpora.Dictionary(data)
id2word.filter_extremes(no_below=2)
corpus = [id2word.doc2bow(text) for text in data]

# Format Gensim components for extrinsic data
extrinsic_id2word = corpora.Dictionary(extrinsic_data)
extrinsic_corpus = [extrinsic_id2word.doc2bow(text) for text in extrinsic_data]

# Format Gensim components for Coherence metrics.
# The extrinsic dictionary built above is reused instead of scanning the
# extrinsic corpus a second time; tok2id is read by compute_coherence_values.
ext_gensim_dict = extrinsic_id2word
tok2id = ext_gensim_dict.token2id

# Model type: 'mallet' or 'gensim'
mod_type = 'mallet'

# Number of topics
k = 40

# Alpha parameter (only forwarded to the gensim LdaModel)
alpha = 'auto' # Dirichlet hyperparameter alpha: Document-Topic Density

# Beta parameter (only forwarded to the gensim LdaModel)
eta = 'auto' # Dirichlet hyperparameter beta: Word-Topic Density

# Number of top words used to compute coherence
ntw = 10

# Number of topics used for calculation of matrix sparseness
top_n = 3

model = print_coherence_values(mod_type, corpus, id2word, data, s, k, alpha, eta, ntw, top_n, n_grams)

0 number of terms not in extrinsic corpus added as separate one-term documents


In [None]:
############################## EVALUATE AND INSPECT TOPICS, REQUIRES THE VARIABLE `model` ##############################

In [1]:
# Print the three highest-weighted topics of every document.
if mod_type == 'gensim':
    doc_topics = model.get_document_topics(corpus, per_word_topics=False, minimum_probability=0.0)
else:
    fname = model.fdoctopics()
    doc_topics = model.read_doctopics(fname, eps=0, renorm=True)

# The loop index is deliberately not called `k`: that name holds the number of
# topics at notebook level and must not be overwritten by this cell.
for doc_idx, doc in enumerate(doc_topics):
    sorted_topics = sorted(doc, key=lambda tup: tup[1], reverse=True)[:3]
    print(f'Doc{doc_idx}: ', sorted_topics)


NameError: name 'mod_type' is not defined

In [18]:
# Print topics with N top words
# Number of topics to tabulate.
# NOTE(review): presumably this should equal the `k` the model was trained with — confirm.
num_topics = 40

def print_topics_words(model, num_topics, topn=10):
    """Tabulate the top terms of each topic as a DataFrame.

    Args:
        model: Object exposing ``show_topic(topic_id, topn=...)`` which returns
            a sequence of entries whose first element is the term.
        num_topics: Number of topics to include; topic ids
            ``0 .. num_topics - 1`` are queried in order.
        topn: Number of top terms requested per topic. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        A ``pandas.DataFrame`` with one column per topic, labelled ``T#00``,
        ``T#01``, ..., holding that topic's terms row by row. With
        ``num_topics == 0`` an empty DataFrame is returned (previously this
        raised ``UnboundLocalError``).
    """
    word_dict = {
        'T#' + '{:02d}'.format(topic_id): [entry[0] for entry in model.show_topic(topic_id, topn=topn)]
        for topic_id in range(num_topics)
    }
    # Build the frame once, after all topics are collected, rather than on
    # every loop iteration.
    return pd.DataFrame(word_dict)

# Build the topic / top-word table for the trained model.
topic_df = print_topics_words(model, num_topics)
# NOTE(review): the file name hard-codes 'BN' and 40 rather than deriving them
# from DATASET_TYPE / num_topics — confirm they match the current settings.
topic_df.to_csv('BN_40_TOPICS.csv')
# Bare expression so the notebook renders the table as the cell output.
topic_df

Unnamed: 0,T#00,T#01,T#02,T#03,T#04,T#05,T#06,T#07,T#08,T#09,...,T#30,T#31,T#32,T#33,T#34,T#35,T#36,T#37,T#38,T#39
0,krona,familj,barn,vatten,match,företag,myndighet,tidning,byrå,bild,...,mat,aktie,polis,mål,malmö,fråga,stockholm,sverige,studie,kommun
1,miljon,liv,skola,skog,lag,kund,utredning,media,kampanj,tal,...,restaurang,bolag,brott,utsläpp,johan,problem,göteborg,land,forskare,förslag
2,peng,mamma,elev,plan,säsong,produkt,information,bonnier,kommunikation,kyrka,...,jul,bank,kvinna,företag,andersson,samhälle,peter,antal,risk,verksamhet
3,miljard,vän,förälder,område,mål,butik,fråga,expressen,kund,utställning,...,kött,dollar,händelse,rapport,anders,exempel,pris,värld,sjukdom,ordförande
4,resultat,pappa,ungdom,grad,spelare,marknad,regel,journalist,varumärke,museum,...,vin,kvartal,tingsrätt,hållbarhet,nilsson,svar,lars,svensk,patient,budget
5,bolag,barn,utbildning,hav,klubb,varumärke,uppgift,aftonbladet,resumé,verk,...,smak,miljard,åklagare,miljö,mikael,ord,björn,siffra,resultat,politiker
6,vinst,hand,lärare,träd,poäng,bolag,lag,metro,director,konstnär,...,ställe,tillväxt,mord,klimat,jonas,ställe,lista,norge,behandling,arbete
7,kostnad,son,student,väder,tränare,konsument,krav,vd,samarbete,konst,...,tips,börs,fängelse,värld,larsson,debatt,stad,danmark,forskning,behov
8,siffra,dotter,förskola,mark,vetlanda,bransch,åtgärd,svt,reklam,historia,...,kök,swedbank,plats,kommentar,andreas,folk,karin,finland,läkemedel,fråga
9,ökning,par,pojke,flygplats,division,marknadsföring,problem,läsare,idé,sten,...,öl,marknad,utredning,plast,persson,tal,mattias,europa,kvinna,peng


In [None]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
import gensim

# pyLDAvis works on a native gensim LdaModel. A model trained through the
# MALLET wrapper is therefore converted first; a gensim model is used as is.
if isinstance(model, gensim.models.wrappers.LdaMallet):
    model_plot = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
else:
    model_plot = model
vis = pyLDAvis.gensim.prepare(model_plot, corpus, dictionary=id2word)
vis

In [None]:
DATASET_TYPE = 'random'
DATASET_SIZE = '4000'
DATASET_INDEX = '3'

# Load a different, already tokenised dataset to try the trained model on.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(f'efselab_mod/pickles/efselab_parsed_{DATASET_TYPE}_{DATASET_SIZE}_{DATASET_INDEX}.pkl', 'rb') as handle:
    other_texts = pickle.load(handle)

new_article = other_texts[10]

# Encode the new texts with the dictionary the model was trained with
# (`id2word`). Building a fresh Dictionary here would assign different token
# ids, so the bag-of-words vectors would not match the model's vocabulary, and
# it would also overwrite the notebook-level `id2word`.
# NOTE(review): if the model was trained with n_grams enabled, the same phraser
# presumably has to be applied to `other_texts` first — confirm.
other_corpus = [id2word.doc2bow(text) for text in other_texts]

new_article_corpus = other_corpus[10]

In [None]:
#model_2 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
#model.update(other_corpus)

In [None]:
# Show the article picked from other_texts (index 10).
new_article

In [None]:
# Show the doc2bow (bag-of-words) encoding of that article.
new_article_corpus

In [None]:
# Apply the trained model to the article's bag-of-words vector.
# NOTE(review): presumably this yields the article's topic distribution — confirm for the model type in use.
vector = model[new_article_corpus]

In [None]:
# Show the model output computed for the article.
vector