In [3]:
import pandas as pd

import nltk

import classla
import gensim
from gensim.utils import simple_preprocess

from utils import classla_lemmatize, preprocess_text, preprocess_and_lemmatize_news, preprocess_and_lemmatize_tweets
SEED = 42
ROOT_PATH = "/home/jhladnik"


#classla.download('sl')        # download non-standard models for Slovenian, use hr for Croatian and sr for Serbian
#classla.download('sl', type='nonstandard')        # download non-standard models for Slovenian, use hr for Croatian and sr for Serbian

#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

## select data that we will work with

In [2]:
%%time
# 12mio tweets
df_all = pd.read_parquet(
     f'{ROOT_PATH}/data/sl-tweets/df_sl_tweets_21.parquet.gzip')

NUM_SAMPLES = 5000
df = df_all.sample(NUM_SAMPLES, random_state=SEED)

CPU times: user 20.1 s, sys: 7.15 s, total: 27.2 s
Wall time: 23.3 s


## preprocess text, tokenize and lemmatize if needed

In [8]:
df = preprocess_and_lemmatize_tweets(df)


2023-01-14 20:48:02 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package     |
---------------------------
| tokenize  | nonstandard |
| pos       | nonstandard |
| lemma     | nonstandard |

2023-01-14 20:48:02 INFO: Use device: gpu
2023-01-14 20:48:02 INFO: Loading: tokenize
2023-01-14 20:48:02 INFO: Loading: pos
2023-01-14 20:48:03 INFO: Loading: lemma
2023-01-14 20:48:05 INFO: Done loading processors!


In [10]:
# save preprocessed df
df.to_parquet(
    f'{ROOT_PATH}/data/sl-tweets/df_tweets_lemmas_{NUM_SAMPLES}.parquet.gzip', compression='gzip')

## LDA on preprocessed data

In [6]:
from gensim.corpora import Dictionary
NUM_SAMPLES = 50000
df = pd.read_parquet(
    f'{ROOT_PATH}/data/sl-tweets/df_tweets_lemmas_{NUM_SAMPLES}.parquet.gzip')

dictionary = Dictionary(list(df['lemmatized_text']))
corpus = [dictionary.doc2bow(text) for text in list(df['lemmatized_text'])]


In [7]:
%%time
# Set training parameters.
num_topics = 6
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

trained_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

CPU times: user 2min 13s, sys: 74.1 ms, total: 2min 13s
Wall time: 2min 13s


In [8]:
trained_model.show_topic(1, topn=25)


[('otrok', 0.020536497),
 ('šola', 0.012518178),
 ('cepljen', 0.011422472),
 ('cepiti', 0.010966514),
 ('maska', 0.008195595),
 ('rabiti', 0.0064248904),
 ('zdravnik', 0.0053547383),
 ('pasti', 0.005289468),
 ('hoditi', 0.004824069),
 ('nizek', 0.004500182),
 ('necepljen', 0.00438563),
 ('stroka', 0.0039608893),
 ('nositi', 0.0038669962),
 ('mama', 0.0037705293),
 ('organizacija', 0.003659485),
 ('cesta', 0.0036273156),
 ('pismo', 0.003429225),
 ('super', 0.0034016853),
 ('testiranje', 0.0033228688),
 ('premier', 0.0033098867),
 ('avto', 0.0032160825),
 ('mati', 0.002968089),
 ('žena', 0.0029330377),
 ('mali', 0.0028669166),
 ('zapis', 0.0027867146)]

In [9]:
from evaluation import get_top_words_for_topics_lda, topic_diversity

topics_representations = get_top_words_for_topics_lda(trained_model, 25)
topics_representations

[['covid',
  'cepljenje',
  'cepivo',
  'ukrep',
  'teden',
  'Ljubljana',
  'protest',
  'virus',
  'policija',
  'epidemija',
  'okužba',
  'oddaja',
  'oseba',
  'direktor',
  'umreti',
  'zdravje',
  'skrbeti',
  'zdravstven',
  'svetoven',
  'popolnoma',
  'starš',
  'obvezen',
  'zdrav',
  'dogajati',
  'pamet'],
 ['otrok',
  'šola',
  'cepljen',
  'cepiti',
  'maska',
  'rabiti',
  'zdravnik',
  'pasti',
  'hoditi',
  'nizek',
  'necepljen',
  'stroka',
  'nositi',
  'mama',
  'organizacija',
  'cesta',
  'pismo',
  'super',
  'testiranje',
  'premier',
  'avto',
  'mati',
  'žena',
  'mali',
  'zapis'],
 ['Slovenija',
  'minister',
  'zakon',
  'evropski',
  'denar',
  'ura',
  'jasen',
  'najti',
  'hud',
  'voda',
  'narod',
  'postati',
  'plača',
  'rdeč',
  'test',
  'vesel',
  'Evropa',
  'družba',
  'dejstvo',
  'sistem',
  'zgodovina',
  'glaven',
  'komunističen',
  'podjetje',
  'predstavljati'],
 ['pisati',
  'a',
  'svoboda',
  'skupina',
  'milijon',
  'zdeti',
  '

### calculate topic diversity

In [10]:


topic_diversity(topics_representations, 25)

1.0

### calculate topic coherence


In [14]:
from gensim.models import CoherenceModel


In [12]:
def topic_coherence(trained_model, topic_top_words, top_k_words=25):
    if topic_top_words is None:
        return -1
    
    if top_k_words > len(topic_top_words[0]):
        print("top_k_words is larger than the number of words in the topic")
        return -1

    coherence_model_lda = CoherenceModel(
        topics=topic_top_words,
        texts=list(df['lemmatized_text']),
        dictionary=Dictionary(list(df['lemmatized_text'])),
        coherence='c_npmi',
        topn=top_k_words)

    return coherence_model_lda.get_coherence()

topic_coherence(trained_model, topics_representations, top_k_words=10)

-0.08182563200260287

In [15]:
topics = trained_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.013*"covid" + 0.012*"cepljenje" + 0.009*"cepivo" + 0.009*"ukrep" + 0.008*"teden" + 0.008*"Ljubljana" + 0.008*"protest" + 0.007*"virus" + 0.006*"policija" + 0.006*"epidemija"')
(1, '0.021*"otrok" + 0.013*"šola" + 0.011*"cepljen" + 0.011*"cepiti" + 0.008*"maska" + 0.006*"rabiti" + 0.005*"zdravnik" + 0.005*"pasti" + 0.005*"hoditi" + 0.005*"nizek"')
(2, '0.030*"Slovenija" + 0.011*"minister" + 0.009*"zakon" + 0.007*"evropski" + 0.007*"denar" + 0.006*"ura" + 0.006*"jasen" + 0.005*"najti" + 0.005*"hud" + 0.005*"voda"')
(3, '0.011*"pisati" + 0.006*"a" + 0.006*"svoboda" + 0.006*"skupina" + 0.006*"milijon" + 0.005*"zdeti" + 0.005*"video" + 0.005*"slika" + 0.005*"evro" + 0.005*"uspeti"')
(4, '0.034*"imeti" + 0.016*"iti" + 0.014*"vlada" + 0.014*"človek" + 0.013*"vedeti" + 0.010*"nov" + 0.009*"država" + 0.008*"slovenski" + 0.008*"čas" + 0.007*"delati"')
(5, '0.013*"državen" + 0.008*"opozicija" + 0.006*"ženska" + 0.005*"pct" + 0.005*"pogoj" + 0.005*"seja" + 0.005*"vseeno" + 0.004*"zmaga" + 0.

In [16]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(model, corpus, dictionary)
pyLDAvis.save_html(lda_viz, 'lda_tweets.html')


  default_term_info = default_term_info.sort_values(


## NMF on preprocessed, lemmatized data

In [16]:
%%time
## NMF on preprocessed, lemmatized tweets


# pickle.dump(corpus, open('corpus.pkl', 'wb'))
# dictionary.save('dictionary.gensim')

dictionary[0]
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

trained_model = gensim.models.Nmf(
    corpus=corpus,
    id2word=id2word,
    #chunksize=chunksize,
    #alpha='auto',
    #eta='auto',
    #iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=SEED
)


CPU times: user 4min 46s, sys: 19min 10s, total: 23min 56s
Wall time: 32.6 s


In [17]:
trained_model.show_topic(1, topn=25)


[('vlada', 0.1827437413576595),
 ('predsednik', 0.04169111147047659),
 ('ukrep', 0.006169626869184837),
 ('lev', 0.00604045382866434),
 ('opozicija', 0.005351711191176553),
 ('Janša', 0.004457527030351594),
 ('volitev', 0.004378331042516606),
 ('Janez', 0.004086266843545166),
 ('Janšev', 0.004081772864714959),
 ('voditi', 0.0032795616087939053),
 ('podpirati', 0.003003499414935349),
 ('podpora', 0.002856673434545596),
 ('oblast', 0.002833980582401722),
 ('protest', 0.002783133044230057),
 ('stranka', 0.0027534942119133736),
 ('odstopiti', 0.002621369509856332),
 ('kul', 0.0025486283990572792),
 ('šarčev', 0.0024392277815469044),
 ('državen', 0.0024002465485157836),
 ('desen', 0.0023919427139053252),
 ('SDS', 0.002382160577759575),
 ('epidemija', 0.002376019395057809),
 ('kriv', 0.002347877325921306),
 ('predlagati', 0.002317068798547082),
 ('skupina', 0.0022530935225341065)]

In [18]:
from evaluation import get_top_words_for_topics_lda, topic_diversity

topics_representations = get_top_words_for_topics_lda(trained_model, 25)
topics_representations

[['čas',
  'epidemija',
  'dolgo',
  'smrt',
  'teden',
  'vzeti',
  'vprašanje',
  'najbolj',
  'kriza',
  'narediti',
  'naprej',
  'skrajen',
  'neznan',
  'imeti',
  'večina',
  'deloven',
  'poslanec',
  'mesto',
  'državen',
  'oblast',
  'ura',
  'spomniti',
  'ukvarjati',
  'svoboda',
  'vrniti'],
 ['vlada',
  'predsednik',
  'ukrep',
  'lev',
  'opozicija',
  'Janša',
  'volitev',
  'Janez',
  'Janšev',
  'voditi',
  'podpirati',
  'podpora',
  'oblast',
  'protest',
  'stranka',
  'odstopiti',
  'kul',
  'šarčev',
  'državen',
  'desen',
  'SDS',
  'epidemija',
  'kriv',
  'predlagati',
  'skupina'],
 ['Slovenija',
  'republika',
  'evropski',
  'Slovenec',
  'Evropa',
  'zgodovina',
  'RTV',
  'samostojen',
  'obletnica',
  'državljan',
  'vojna',
  'podatek',
  'ponosen',
  'stanje',
  'prebivalec',
  'komunist',
  'predsedovanje',
  'Avstrija',
  'hrvaški',
  'via',
  'novinar',
  'gospodarski',
  'Bruselj',
  'medijski',
  'video'],
 ['covid',
  'sodišče',
  'cepljenje',


### calculate topic diversity

In [19]:


topic_diversity(topics_representations, 25)

0.562

### calculate topic coherence


In [21]:

topic_coherence(trained_model, topics_representations, top_k_words=25)

-0.062126078646029495

In [22]:
topics = trained_model.print_topics(num_words=15)
for topic in topics:
    print(topic)


(0, '0.137*"čas" + 0.005*"epidemija" + 0.004*"dolgo" + 0.004*"smrt" + 0.003*"teden" + 0.003*"vzeti" + 0.003*"vprašanje" + 0.003*"najbolj" + 0.003*"kriza" + 0.003*"narediti" + 0.002*"naprej" + 0.002*"skrajen" + 0.002*"neznan" + 0.002*"imeti" + 0.002*"večina"')
(1, '0.183*"vlada" + 0.042*"predsednik" + 0.006*"ukrep" + 0.006*"lev" + 0.005*"opozicija" + 0.004*"Janša" + 0.004*"volitev" + 0.004*"Janez" + 0.004*"Janšev" + 0.003*"voditi" + 0.003*"podpirati" + 0.003*"podpora" + 0.003*"oblast" + 0.003*"protest" + 0.003*"stranka"')
(2, '0.175*"Slovenija" + 0.005*"republika" + 0.005*"evropski" + 0.005*"Slovenec" + 0.003*"Evropa" + 0.003*"zgodovina" + 0.003*"RTV" + 0.003*"samostojen" + 0.003*"obletnica" + 0.002*"državljan" + 0.002*"vojna" + 0.002*"podatek" + 0.002*"ponosen" + 0.002*"stanje" + 0.002*"prebivalec"')
(3, '0.067*"covid" + 0.023*"sodišče" + 0.019*"cepljenje" + 0.018*"cepljen" + 0.017*"cepivo" + 0.014*"ustaven" + 0.012*"ukrep" + 0.011*"cepiti" + 0.008*"okužba" + 0.007*"test" + 0.007*"viru

In [23]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(trained_model, corpus, dictionary)
pyLDAvis.save_html(lda_viz, 'lda_news.html')


  and should_run_async(code)


AttributeError: 'Nmf' object has no attribute 'inference'