In [21]:
import io
import os.path
import re
import tarfile

import smart_open


In [None]:

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    with smart_open.open(url, "rb") as file:
        with tarfile.open(fileobj=file) as tar:
            for member in tar.getmembers():
                if member.isfile() and re.search(r'nipstxt/nips\d+/\d+\.txt', member.name):
                    member_bytes = tar.extractfile(member).read()
                    yield member_bytes.decode('utf-8', errors='replace')

docs = list(extract_documents())

In [4]:
docs[:2]

['387 \nNeural Net and Traditional Classifiers \x7f \nWilliam Y. Huang and Richard P. Lippmann \nMIT Lincoln Laboratory \nLexington, MA 02173, USA \nAbstract\nPrevious work on nets with continuous-valued inputs led to generative \nprocedures to construct convex decision regions with two-layer percepttons (one hidden \nlayer) and arbitrary decision regions with three-layer percepttons (two hidden layers). \nHere we demonstrate that two-layer perceptton classifiers trained with back propagation \ncan form both convex and disjoint decision regions. Such classifiers are robust, train \nrapidly, and provide good performance with simple decision regions. When complex \ndecision regions are required, however, convergence time can be excessively long and \nperformance is often no better than that of k-nearest neighbor classifiers. Three neural \nnet classifiers are presented that provide more rapid training under such situations. \nTwo use fixed weights in the first one or two layers and are s

## Pre-process and vectorize the documents

As part of preprocessing, we will:

Tokenize (split the documents into tokens).

Lemmatize the tokens.

Compute bigrams.

Compute a bag-of-words representation of the data.

In [2]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [7]:
# Lemmatize the documents.
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hladn\AppData\Roaming\nltk_data...


In [8]:
docs[0]

['neural',
 'net',
 'and',
 'traditional',
 'classifier',
 'william',
 'huang',
 'and',
 'richard',
 'lippmann',
 'mit',
 'lincoln',
 'laboratory',
 'lexington',
 'ma',
 'usa',
 'abstract',
 'previous',
 'work',
 'on',
 'net',
 'with',
 'continuous',
 'valued',
 'input',
 'led',
 'to',
 'generative',
 'procedure',
 'to',
 'construct',
 'convex',
 'decision',
 'region',
 'with',
 'two',
 'layer',
 'percepttons',
 'one',
 'hidden',
 'layer',
 'and',
 'arbitrary',
 'decision',
 'region',
 'with',
 'three',
 'layer',
 'percepttons',
 'two',
 'hidden',
 'layer',
 'here',
 'we',
 'demonstrate',
 'that',
 'two',
 'layer',
 'perceptton',
 'classifier',
 'trained',
 'with',
 'back',
 'propagation',
 'can',
 'form',
 'both',
 'convex',
 'and',
 'disjoint',
 'decision',
 'region',
 'such',
 'classifier',
 'are',
 'robust',
 'train',
 'rapidly',
 'and',
 'provide',
 'good',
 'performance',
 'with',
 'simple',
 'decision',
 'region',
 'when',
 'complex',
 'decision',
 'region',
 'are',
 'required',

We find bigrams in the documents. Bigrams are sets of two adjacent words. Using bigrams we can get phrases like “machine_learning” in our output (spaces are replaced with underscores); without bigrams we would only get “machine” and “learning”.

Note that in the code below, we find bigrams and then add them to the original data, because we would like to keep the words “machine” and “learning” as well as the bigram “machine_learning”.

In [9]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [13]:
bigram[docs[0]]

['neural_net',
 'and',
 'traditional',
 'classifier',
 'william',
 'huang',
 'and',
 'richard_lippmann',
 'mit',
 'lincoln_laboratory',
 'lexington',
 'ma',
 'usa_abstract',
 'previous_work',
 'on',
 'net',
 'with',
 'continuous_valued',
 'input',
 'led',
 'to',
 'generative',
 'procedure',
 'to',
 'construct',
 'convex',
 'decision_region',
 'with',
 'two',
 'layer_percepttons',
 'one',
 'hidden_layer',
 'and',
 'arbitrary',
 'decision_region',
 'with',
 'three',
 'layer_percepttons',
 'two',
 'hidden_layer',
 'here',
 'we',
 'demonstrate',
 'that',
 'two',
 'layer',
 'perceptton',
 'classifier',
 'trained',
 'with',
 'back_propagation',
 'can',
 'form',
 'both',
 'convex',
 'and',
 'disjoint',
 'decision_region',
 'such',
 'classifier',
 'are',
 'robust',
 'train',
 'rapidly',
 'and',
 'provide',
 'good',
 'performance',
 'with',
 'simple',
 'decision_region',
 'when',
 'complex',
 'decision_region',
 'are',
 'required',
 'however',
 'convergence',
 'time',
 'can_be',
 'excessively',

In [16]:
docs[0][-100:]

['feature_map',
 'must_be',
 'nearest_neighbor',
 'monte_carlo',
 'error_rate',
 'back_prop',
 'percent_correct',
 'feature_map',
 'feature_map',
 'doe_not',
 'multi_layer',
 'back_propagation',
 'feature_map',
 'error_rate',
 'nearest_neighbor',
 'fewer_than',
 'feature_map',
 'probability_density',
 'non_zero',
 'can_be',
 'feature_map',
 'worse_than',
 'back_propagation',
 'slower_than',
 'maximum_likelihood',
 'multi_layer',
 'feature_map',
 'had_been',
 'split_into',
 'fig_show',
 'decision_region',
 'back_propagation',
 'error_rate',
 'feature_map',
 'le_than',
 'back_propagation',
 'more_than',
 'feature_map',
 'multi_layer',
 'had_been',
 'decision_region',
 'steady_state',
 'feature_map',
 'steady_state',
 'neural_net',
 'can_be',
 'nearest_neighbor',
 'multi_layer',
 'a_well',
 'such_a',
 'feature_map',
 'layer_percepttons',
 'hidden_layer',
 'decision_region',
 'back_propagation',
 'can_be',
 'decision_region',
 'multi_layer',
 'can_be',
 'feature_map',
 'can_be',
 'nearest_

We remove rare words and common words based on their document frequency. Below we remove words that appear in less than 20 documents or in more than 50% of the documents. Consider trying to remove words only based on their frequency, or maybe combining that with this approach.

In [17]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
# Dictionary – a mapping between words and their integer ids.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [33]:
dictionary[80]

'comparative'

In [36]:
texts = [['human', 'interface', 'computer']]
dct = Dictionary(texts)  # initialize a Dictionary
dct.add_documents([["cat", "say", "meow", "human"], ["dog"]])  # add more document (extend the vocabulary)
dct.doc2bow(["dog", "human", "human", "non_existent_word"])

[(1, 2), (6, 1)]

In [18]:
# Bag-of-words representation of the documents.
# doc2bow: Convert document into the bag-of-words (BoW) format = list of (token_id, token_count) tuples.
corpus = [dictionary.doc2bow(doc) for doc in docs] 

In [23]:
corpus[0]

[(0, 2),
 (1, 2),
 (2, 1),
 (3, 1),
 (4, 2),
 (5, 2),
 (6, 1),
 (7, 2),
 (8, 1),
 (9, 3),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 2),
 (15, 2),
 (16, 1),
 (17, 1),
 (18, 3),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 5),
 (26, 4),
 (27, 6),
 (28, 2),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 3),
 (39, 1),
 (40, 1),
 (41, 23),
 (42, 1),
 (43, 20),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 2),
 (49, 2),
 (50, 2),
 (51, 1),
 (52, 1),
 (53, 3),
 (54, 2),
 (55, 1),
 (56, 5),
 (57, 2),
 (58, 2),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 2),
 (63, 3),
 (64, 1),
 (65, 3),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 19),
 (71, 1),
 (72, 4),
 (73, 1),
 (74, 86),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 4),
 (80, 2),
 (81, 1),
 (82, 2),
 (83, 1),
 (84, 13),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 4),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 4),
 (93, 1),
 (94, 1),
 (95, 3),
 (96, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (100

In [19]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 8644
Number of documents: 1740


In [20]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000 #chunksize controls how many documents are processed at a time in the training algorithm
passes = 20 #passes controls how often we train the model on the entire corpus (like epochs)
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [37]:
id2word

{0: '1st',
 1: '5oo',
 2: '7th',
 3: 'a2',
 4: 'a_well',
 5: 'able',
 6: 'adapting',
 7: 'addition',
 8: 'adjust',
 9: 'adjusted',
 10: 'advanced',
 11: 'agency',
 12: 'air',
 13: 'air_force',
 14: 'albus',
 15: 'along',
 16: 'alter',
 17: 'alternately',
 18: 'alternative',
 19: 'american',
 20: 'american_institute',
 21: 'amount',
 22: 'another',
 23: 'appeared',
 24: 'april',
 25: 'arbitrary',
 26: 'architecture',
 27: 'area',
 28: 'arrow',
 29: 'assigned',
 30: 'associative',
 31: 'assp',
 32: 'asymptotic',
 33: 'attained',
 34: 'attempt',
 35: 'august',
 36: 'author',
 37: 'automatically',
 38: 'available',
 39: 'averaged',
 40: 'b2',
 41: 'back',
 42: 'back_prop',
 43: 'back_propagation',
 44: 'because_they',
 45: 'before',
 46: 'behavior',
 47: 'belong',
 48: 'best',
 49: 'better',
 50: 'better_than',
 51: 'bin',
 52: 'bl',
 53: 'book',
 54: 'boolean',
 55: 'bottom',
 56: 'boundary',
 57: 'bp',
 58: 'brain',
 59: 'break',
 60: 'burr',
 61: 'calculate',
 62: 'called',
 63: 'capabi

In [21]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -1.1230.
[([(0.012668772, 'hidden'),
   (0.008279954, 'hidden_unit'),
   (0.008079696, 'layer'),
   (0.005334097, 'rule'),
   (0.005044605, 'net'),
   (0.004333512, 'code'),
   (0.0041409824, 'propagation'),
   (0.003919791, 'gradient'),
   (0.0038634592, 'solution'),
   (0.0036693944, 'back'),
   (0.0034518326, 'prediction'),
   (0.003405478, 'cost'),
   (0.0033953947, 'training_set'),
   (0.0031170342, 'minimum'),
   (0.0029447128, 'back_propagation'),
   (0.0028661392, 'trained'),
   (0.0027028725, 'table'),
   (0.0026447475, 'connection'),
   (0.0026163682, 'region'),
   (0.0025777011, 'procedure')],
  -0.891225547427413),
 ([(0.017644314, 'cell'),
   (0.016823415, 'neuron'),
   (0.008924995, 'response'),
   (0.008490273, 'stimulus'),
   (0.0070334687, 'spike'),
   (0.0067016305, 'activity'),
   (0.0064425184, 'visual'),
   (0.005569585, 'synaptic'),
   (0.00526994, 'firing'),
   (0.0051779062, 'frequency'),
   (0.0051573347, 'signal'),
   (0.0048824996, 'c

# news data LDA

In [47]:
import pandas as pd

import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
#from sklearn.feature_extraction.text import TfidfVectorizer
##from sklearn.manifold import TSNE
#from sklearn.manifold import MDS
#from sklearn.decomposition import PCA
import string
import re
from collections import Counter
import classla
from gensim.utils import simple_preprocess


classla.download('sl')        # download non-standard models for Slovenian, use hr for Croatian and sr for Serbian
#classla.download('sl', type='nonstandard')        # download non-standard models for Slovenian, use hr for Croatian and sr for Serbian

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

Downloading https://raw.githubusercontent.com/clarinsi/classla-resources/main/resources_1.0.2.json: 10.5kB [00:00, 1.05MB/s]
2022-10-02 16:54:32 INFO: Downloading these customized packages for language: sl (Slovenian)...
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| depparse  | standard |
| ner       | standard |
| pretrain  | standard |

2022-10-02 16:54:34 INFO: File exists: C:\Users\hladn\classla_resources\sl\pos\standard.pt.
2022-10-02 16:54:34 INFO: File exists: C:\Users\hladn\classla_resources\sl\lemma\standard.pt.
2022-10-02 16:54:35 INFO: File exists: C:\Users\hladn\classla_resources\sl\depparse\standard.pt.
2022-10-02 16:54:35 INFO: File exists: C:\Users\hladn\classla_resources\sl\ner\standard.pt.
2022-10-02 16:54:35 INFO: File exists: C:\Users\hladn\classla_resources\sl\pretrain\standard.pt.
2022-10-02 16:54:35 INFO: Finished downloading models and saved to C:\Users\hladn\classla_resources.
[nltk

True

### select data that we will work with

In [85]:
df_all = pd.read_parquet('C:/Users/hladn/FAKS/Magistrsko delo/data/eventregistry/df_news_2020.parquet.gzip')

df = df_all[df_all['media']=='MMC RTV Slovenija']
#df

In [86]:
df = df.sample(5000)

### set stopwords list

In [116]:
filepath = 'C:/Users/hladn/FAKS/Magistrsko delo/data/stopwords.txt'
with open(filepath, 'r') as f:
    additional_stopwords = f.read().splitlines()

stop_words=stopwords.words('slovene') + list(string.punctuation) + additional_stopwords
new_sw = ["rt","href", "http", "https", "quot", "nbsp", "mailto", "mail", "getty", "foto", "images", "urbanec", "sportid"]
stop_words.extend(new_sw)
len(set(stop_words))

2044

In [140]:
'treba' in stop_words

False

['a',
 'ali',
 'april',
 'avgust',
 'b',
 'bi',
 'bil',
 'bila',
 'bile',
 'bili',
 'bilo',
 'biti',
 'blizu',
 'bo',
 'bodo',
 'bojo',
 'bolj',
 'bom',
 'bomo',
 'boste',
 'bova',
 'boĹˇ',
 'brez',
 'c',
 'cel',
 'cela',
 'celi',
 'celo',
 'd',
 'da',
 'daleÄŤ',
 'dan',
 'danes',
 'datum',
 'december',
 'deset',
 'deseta',
 'deseti',
 'deseto',
 'devet',
 'deveta',
 'deveti',
 'deveto',
 'do',
 'dober',
 'dobra',
 'dobri',
 'dobro',
 'dokler',
 'dol',
 'dolg',
 'dolga',
 'dolgi',
 'dovolj',
 'drug',
 'druga',
 'drugi',
 'drugo',
 'dva',
 'dve',
 'e',
 'eden',
 'en',
 'ena',
 'ene',
 'eni',
 'enkrat',
 'eno',
 'etc.',
 'f',
 'februar',
 'g',
 'g.',
 'ga',
 'ga.',
 'gor',
 'gospa',
 'gospod',
 'h',
 'halo',
 'i',
 'idr.',
 'ii',
 'iii',
 'in',
 'iv',
 'ix',
 'iz',
 'j',
 'januar',
 'jaz',
 'je',
 'ji',
 'jih',
 'jim',
 'jo',
 'julij',
 'junij',
 'jutri',
 'k',
 'kadarkoli',
 'kaj',
 'kajti',
 'kako',
 'kakor',
 'kamor',
 'kamorkoli',
 'kar',
 'karkoli',
 'katerikoli',
 'kdaj',
 'kdo',
 

### preprocess text, tokenize and lemmatize

In [88]:
def print_some_texts(columns, df):
    text_idxs = [1, 2, 3]#7240, 7241, 8013, 14500, 16500, 16304, 18300,  21750, 34036]
    for i in text_idxs:
        for column in columns:
            print(df[column].iloc[i])
#print_some_texts(['text'])


def preprocess_text(text):
    text = re.sub(r"http\S+", " ", text)            # remove urls
    text = re.compile('\S*@\S*\s?').sub(r'', text)  # remove mails
    text = re.sub("@[A-Za-z0-9]+","", text)         # remove twitter handle
    text = re.sub('\s+', ' ', text)                 # remove new line
    text = re.sub("&amp;","", text)                  # &amp; is a special character for ampersand
    text = re.sub('<USER>', '', text)               # remove '<USER>' as there are some such strings as user or url is masked with this string
    text = re.sub('<URL>', '', text)
    text = re.sub('[^a-zA-Zčšž]', ' ', text)           # Remove punctuations
    text = text.lower()                             # Convert to lowercase
    text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)#remove tags
    text = re.sub("(\\d|\\W)+"," ",text)            # remove special characters and digits
    return text

def tokenize(text):
    #print(text)
    text = preprocess_text(text)
    #print(text)
    tokens = word_tokenize(text)
    filtered_tokens = []
    # Filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation). (adapted from lab example)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if token not in stop_words and len(token) > 2:
                filtered_tokens.append(token)
    return filtered_tokens


def tokenize_lemmatize_classla(text):
    config = {
    #'processors': 'tokenize, lemma', # Comma-separated list of processors to use
    'lang': 'sl', # Language code for the language to build the Pipeline in
    'tokenize_pretokenized': False, # Use pretokenized text as input and disable tokenization
    'use_gpu': True,
    'type':'nonstandard'    # initialize the default non-standard Slovenian pipeline, use hr for Croatian and sr for Serbian
    }
    nlp = classla.Pipeline(**config)

    doc = nlp(text)     # run the pipeline
    lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
    
    return lemmas


def classla_preprocess(text, stop_words, nlp):
    preprocessed_body = []     # a list of words of a single article
    for token in simple_preprocess(text, min_len=3, max_len=25):
      # remove all words shorter than three characters
      if token not in stop_words:
        preprocessed_body.append(token)

    doc = nlp(' '.join(preprocessed_body))
    lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
    return lemmas
    
def stemming(tokens):
    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(token) for token in tokens]
    return stems

def lemmatizing(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmas


In [89]:
%%time
df['preprocessed_text']=df['body'].apply(preprocess_text)
#df['tokenized_text']= df['preprocessed_text'].apply(tokenize) 

config = {
'processors': 'tokenize, pos, lemma', # Comma-separated list of processors to use
'lang': 'sl', # Language code for the language to build the Pipeline in
'tokenize_pretokenized': False, # Use pretokenized text as input and disable tokenization
'use_gpu': True,
'type':'nonstandard'    # initialize the default non-standard Slovenian pipeline, use hr for Croatian and sr for Serbian
}
nlp = classla.Pipeline(**config)
df['lemmatized_text']= df['preprocessed_text'].apply(classla_preprocess, args=(stop_words,nlp)) 


2022-10-02 18:43:33 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package     |
---------------------------
| tokenize  | nonstandard |
| pos       | nonstandard |
| lemma     | nonstandard |

2022-10-02 18:43:33 INFO: Use device: cpu
2022-10-02 18:43:33 INFO: Loading: tokenize
2022-10-02 18:43:33 INFO: Loading: pos
2022-10-02 18:43:34 INFO: Loading: lemma
2022-10-02 18:43:36 INFO: Done loading processors!


CPU times: total: 3h 30min 20s
Wall time: 35min 21s


In [90]:
df

Unnamed: 0,body,media,title,date,preprocessed_text,lemmatized_text
243165,Nacionalni štab civilne zaščite je še v ponede...,MMC RTV Slovenija,"Na Hrvaškem 213 novookuženih, v BiH-u 13 umrlih",2020-09-30,nacionalni štab civilne zaščite je še v ponede...,"[nacionalen, štab, civilen, zaščita, ponedelje..."
68701,Župan Radencev Roman Leljak je odredil zaprtje...,MMC RTV Slovenija,Test starša otroka iz vrtca pri Kapeli je bil ...,2020-05-22,upan radencev roman leljak je odredil zaprtje...,"[upan, radencev, roman, Leljak, odrediti, zapr..."
282889,Nagrado Kristine Brenkove podeljuje Zbornica k...,MMC RTV Slovenija,Nagrada Kristine Brenkove za slikanico Timbukt...,2020-10-26,nagrado kristine brenkove podeljuje zbornica k...,"[nagrada, kristina, Brenkov, podeljevati, zbor..."
420542,"Inšpekcijski nadzor, redarska služba, notranja...",MMC RTV Slovenija,"Pet občin združilo moči, da bi od države dobil...",2020-01-08,inšpekcijski nadzor redarska služba notranja r...,"[inšpekcijski, nadzor, redarski, služba, notra..."
300849,"Za demokrate je to slab obet tudi v prihodnje,...",MMC RTV Slovenija,Izidi volitev v kongrese zveznih držav neprije...,2020-11-07,za demokrate je to slab obet tudi v prihodnje ...,"[demokrat, slab, obet, prihodnji, meja, kongre..."
...,...,...,...,...,...,...
293120,Naj demokrati še tako napadajo ameriškega pred...,MMC RTV Slovenija,"Trump ali Biden, kateri bo (iz)kopal ameriško ...",2020-11-02,naj demokrati še tako napadajo ameriškega pred...,"[demokrat, napadati, ameriški, predsednik, Don..."
413525,"Country pop duet The Shires, ki ga sestavljata...",MMC RTV Slovenija,Radio Si,2020-03-03,country pop duet the shires ki ga sestavljata ...,"[country, pop, duet, the, shires, sestavljati,..."
78352,"""Zavedam se vznemirjenosti zaradi izjav, vezan...",MMC RTV Slovenija,Kacin: Maske bodo postale sestavni del naše ci...,2020-05-29,zavedam se vznemirjenosti zaradi izjav vezani...,"[zavedati, vznemirjenost, izjava, vezan, pripo..."
200607,V tem času so testirali nekaj manj kot 3000 lj...,MMC RTV Slovenija,"Na Hrvaškem 145 novih okužb, iz Istre pozivi L...",2020-09-01,v tem času so testirali nekaj manj kot ljudi t...,"[čas, testirati, manj, človek, trenutno, držav..."


In [69]:
print_some_texts(["lemmatized_text", "body", "preprocessed_text" ], df)

['organizator', 'bovški', 'maraton', 'podpisati', 'pogodba', 'sodelovanje', 'lokalen', 'podjetje', 'mines', 'ukvarjati', 'proizvodnja', 'vodovoden', 'izdelek', 'vodilen', 'program', 'sanitaren', 'armatura', 'bliziti', 'podjetje', 'glaven', 'sponzor', 'omenjen', 'pogodba', 'postati', 'generalen', 'pokrovitelj', 'podpis', 'večleten', 'pogodba', 'generalen', 'pokroviteljstvo', 'prireditev', 'preimenovati', 'blitz', 'Bovec', 'maraton', 'sodelovanje', 'podjetje', 'mines', 'odločiti', 'pomagati', 'lokalen', 'skupnost', 'direktor', 'Matej', 'Klavora', 'stran', 'dosti', 'mednarodno', 'pomemben', 'dogodek', 'vidik', 'blagoven', 'znamka', 'sigurno', 'lahko', 'pripomoči', 'glede', 'prepoznavnost', 'predsednik', 'porten', 'društvo', 'bovec', 'maraton', 'Vasja', 'vitez', 'sodelovanje', 'povedati', 'verjeti', 'doprinesti', 'dober', 'rezultat', 'velik', 'število', 'udeleženec', 'Bovec', 'maraton', 'priprava', 'maraton', 'začeti', 'februar', 'takrat', 'začeti', 'pogovarjati', 'ureditev', 'trasa', 'let

In [91]:
# save preprocessed df
df.to_parquet('C:/Users/hladn/FAKS/Magistrsko delo/data/eventregistry/df_news_lemmas_5000.parquet.gzip',compression='gzip')


In [92]:
from gensim.corpora import Dictionary
dictionary = Dictionary(list(df['lemmatized_text']))
corpus = [dictionary.doc2bow(text) for text in list(df['lemmatized_text'])]

#pickle.dump(corpus, open('corpus.pkl', 'wb'))
#dictionary.save('dictionary.gensim')

In [124]:
%%time
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

CPU times: total: 3min 22s
Wall time: 3min 13s


In [125]:
topics = model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.030*"tekma" + 0.015*"točka" + 0.014*"minuta" + 0.012*"sezona" + 0.009*"zmaga" + 0.008*"liga" + 0.008*"ekipa" + 0.008*"zadnji" + 0.007*"igrati" + 0.007*"igra"')
(1, '0.028*"nov" + 0.027*"okužba" + 0.020*"človek" + 0.018*"koronavirus" + 0.015*"država" + 0.013*"dan" + 0.013*"več" + 0.012*"potrditi" + 0.012*"število" + 0.011*"primer"')
(2, '0.053*"aplikacija" + 0.021*"uporabnik" + 0.018*"telefon" + 0.012*"kralj" + 0.009*"stik" + 0.009*"podatek" + 0.006*"poroka" + 0.004*"Drava" + 0.004*"pameten" + 0.004*"mobilen"')
(3, '0.012*"okužba" + 0.012*"bolnik" + 0.011*"dom" + 0.010*"zdravstven" + 0.009*"imeti" + 0.008*"zaposlen" + 0.008*"bolnišnica" + 0.007*"nov" + 0.007*"lahko" + 0.007*"covid"')
(4, '0.022*"imeti" + 0.017*"lahko" + 0.011*"zelo" + 0.009*"čas" + 0.009*"iti" + 0.007*"veliko" + 0.007*"zdaj" + 0.007*"človek" + 0.007*"velik" + 0.007*"bolj"')
(5, '0.025*"potres" + 0.008*"odpadek" + 0.007*"hrvaški" + 0.007*"dovolilnica" + 0.007*"obveznica" + 0.006*"območje" + 0.006*"poročati" + 0.00

In [126]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(model, corpus, dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [127]:
#pyLDAvis.display(lda_viz)
pyLDAvis.save_html(lda_viz, 'lda.html')

## classla experiments

In [58]:
config = {
'processors': 'tokenize, pos, lemma', # Comma-separated list of processors to use
'lang': 'sl', # Language code for the language to build the Pipeline in
'tokenize_pretokenized': False, # Use pretokenized text as input and disable tokenization
'use_gpu': True,
'type':'nonstandard'    # initialize the default non-standard Slovenian pipeline, use hr for Croatian and sr for Serbian
}
nlp = classla.Pipeline(**config)



2022-10-02 17:08:16 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package     |
---------------------------
| tokenize  | nonstandard |
| pos       | nonstandard |
| lemma     | nonstandard |

2022-10-02 17:08:16 INFO: Use device: cpu
2022-10-02 17:08:16 INFO: Loading: tokenize
2022-10-02 17:08:16 INFO: Loading: pos
2022-10-02 17:08:18 INFO: Loading: lemma
2022-10-02 17:08:20 INFO: Done loading processors!


In [95]:
text_longer = "Tekmovanje za modri trak se je januarja že začelo na polmaratonu v italijanskem Škocjanu ob Soči, drugi del bo aprila na istrskem maratonu, sledita še bovški in goriški maraton. organizatorji bovškega maratona so podpisali pogodbo o sodelovanju z lokalnim podjetjem mines ki se ukvarja s proizvodnjo vodovodnih izdelkov njihov vodilni program so sanitarne armature blizt e pred tem je bilo podjetje eno izmed glavnih sponzorjev z omenjeno pogodbo pa je postalo generalni pokrovitelj s podpisom večletne pogodbe o generalnem pokroviteljstvu se je prireditev preimenovala v blitz bovec maraton za sodelovanje so se v podjetju mines odločili saj želijo s tem pomagati lokalni skupnosti direktor matej klavora po drugi strani je to dosti mednarodno pomemben dogodek in tudi z vidika naše blagovne znamke sigurno nam lahko kaj pripomore glede prepoznavnosti predsednik portnega društva bovec maraton vasja vitez pa je o sodelovanju povedal verjamem da bo to res doprineslo dobre rezultate in še večje število udeležencev na bovec maratonu priprave na maraton so se začele v februarju takrat so se začeli pogovarjati o ureditvi tras na katerih bodo letos tudi blitz točke mi se ukvarjamo s proizvodnjo armatur tako da bomo postavili dve taki točki kjer bodo fizično tuši in se bodo tekmovalci lahko tudi stuširali med tekom nam je pojasnil matej klavora novost letošnjega maratona je pokalno tekmovanje za modri trak ki bo povezalo štiri polmaratone na slovenski in italijanski strani tekmuje se med italijani in slovenci tako da v moški konkurenci šteje prvih šest italijanov in šest slovencev v ženski pa tri ženske na naši strani in tri na sosednji zanimivo je da lahko tudi vsi posamezniki tekmujejo morajo imeti pa vsaj tri udeležbe na tekmovanjih nam je razložil vasja vitez tekmovanje za modri trak se je januarja že začelo na polmaratonu v italijanskem kocjanu ob soči drugi del bo aprila na istrskem maratonu sledita še bovški in goriški maraton"

In [96]:
%%time
for i in range(200):
    doc = nlp(text_longer)     # run the pipeline


CPU times: total: 9min 1s
Wall time: 1min 30s


In [97]:
df_200 = df.sample(200)

In [98]:
%%time
df_200['preprocessed_text'].apply(classla_preprocess, args=(stop_words,nlp)) 

CPU times: total: 8min 27s
Wall time: 1min 25s


33029     [negotovost, kdaj, lahko, vrniti, teren, povzr...
169792    [protestnik, zahtevati, odstop, vlada, Džazira...
325416    [ameriški, različica, britanski, televizijski,...
405613    [predčasen, volitev, odstotek, vprašan, voliti...
10120     [zanimiv, lahko, dojemanje, avtomobilski, znam...
                                ...                        
368760    [televizija, Slovenija, poročati, neuraden, in...
212217    [relativno, miren, etapa, pričakovanje, končat...
79539     [človek, covid, hospitaliziran, intenziven, ne...
356927    [šprint, kontiolahtij, dober, slovenski, biatl...
162755    [družba, news, corporation, ustanoviti, danes,...
Name: preprocessed_text, Length: 200, dtype: object

In [36]:
nlp = classla.Pipeline('sl', type='nonstandard')  # initialize the default non-standard Slovenian pipeline, use hr for Croatian and sr for Serbian
doc = nlp("kva smo mi zural zadn let v zagrebu...")     # run the pipeline
print(doc.to_conll())  

2022-10-02 16:33:48 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package     |
---------------------------
| tokenize  | nonstandard |
| pos       | nonstandard |
| lemma     | nonstandard |
| depparse  | standard    |
| ner       | nonstandard |

2022-10-02 16:33:48 INFO: Use device: cpu
2022-10-02 16:33:48 INFO: Loading: tokenize
2022-10-02 16:33:48 INFO: Loading: pos
2022-10-02 16:33:49 INFO: Loading: lemma
2022-10-02 16:33:51 INFO: Loading: depparse
2022-10-02 16:33:52 INFO: Loading: ner
2022-10-02 16:33:52 INFO: Done loading processors!


# newpar id = 1
# sent_id = 1.1
# text = kva smo mi zural zadn let v zagrebu...
1	kva	kaj	PRON	Pq-nsa	Case=Acc|Gender=Neut|Number=Sing|PronType=Int	4	obj	_	NER=O
2	smo	biti	AUX	Va-r1p-n	Mood=Ind|Number=Plur|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin	4	aux	_	NER=O
3	mi	jaz	PRON	Pp1-sd--y	Case=Dat|Number=Plur|Person=1|PronType=Prs|Variant=Short	4	iobj	_	NER=O
4	zural	zurati	VERB	Vmep-pm	Aspect=Perf|Gender=Masc|Number=Plur|VerbForm=Part	0	root	_	NER=O
5	zadn	zadnji	ADJ	Agpnsa	Case=Acc|Degree=Pos|Gender=Masc|Number=Sing	6	amod	_	NER=O
6	let	leto	NOUN	Ncnsa	Case=Acc|Gender=Neut|Number=Sing	4	obl	_	NER=O
7	v	v	ADP	Sl	Case=Loc	8	case	_	NER=O
8	zagrebu	Zagreb	PROPN	Npmsl	Case=Loc|Gender=Masc|Number=Sing	4	obl	_	NER=B-LOC|SpaceAfter=No
9	...	...	PUNCT	Z	_	4	punct	_	NER=O


