**Procesiranje podatkov**

V tej beležnici preberemo in shranimo podatke iz podatkovne zbirke slovenskih člankov ter jih pred-procesiramo.

# Priprava okolja

In [None]:
!pip install classla

In [None]:
import zipfile
import tarfile
import json
import os

import classla
classla.download('sl')

from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


import sys
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/')
from utils import read_json_file, save_articles, prepare_dataframe, visualize_articles_by_media, read_preprocessed_specific_media, dataframe_info
import pandas as pd




import seaborn as sns
from matplotlib import pyplot as plt
from collections import OrderedDict, defaultdict

# Pomožne funkcije

In [None]:
def save_extracted_articles(articles, dir):
  """
  Write the articles of every media outlet to its own JSON file.

  The file name is the plain concatenation ``dir + media``, so ``dir`` must
  end with a path separator if it is meant to be a directory. The directory
  is not created here.

  :param articles: dictionary mapping a media name to the JSON-serialisable
  articles of that media
  :param dir: path prefix that is prepended to each media name
  """

  for media_name, media_articles in articles.items():
    target_path = dir + media_name
    with open(target_path, mode='w', encoding='utf8') as out_file:
      json.dump(media_articles, out_file)

In [None]:
def read_data_json(json_file, articles_by_media):
  """
  Parse one JSON file and add its articles to the per-media collection.

  The article records are read from ``data['articles']['results']``. Each
  record contributes its ``body`` and ``title`` to the entry of the media
  named in ``record['source']['title']``.

  :param json_file: open file-like object holding the JSON document
  :param articles_by_media: dictionary mapping a media name to a dictionary
  with the lists ``'body'`` and ``'title'``; it is updated in place
  :return: the same articles_by_media dictionary with the new articles added
  """

  records = json.load(json_file)['articles']['results']

  for record in records:
    # Read all fields first, so a record with a missing key fails before
    # anything is stored for it.
    text, outlet, headline = record['body'], record['source']['title'], record['title']

    bucket = articles_by_media.setdefault(outlet, {'body': [], 'title': []})
    bucket['body'].append(text)
    bucket['title'].append(headline)

  return articles_by_media

In [None]:
def read_data_zip(filepath):

  """
  Read the zipped dataset of Slovenian articles and save it per year and media.

  The zip file is expected to contain 7 tar.gz files, one for each year from
  2014 to 2020. Every archive is extracted into the current working directory,
  its JSON members are read with read_data_json and the collected articles are
  written with save_extracted_articles to
  '/content/drive/MyDrive/Colab Notebooks/raw_articles/<year>/'.

  A missing target directory is reported with print and the year is skipped;
  the directory is not created here.

  :param filepath: path to the data zip file
  """

  with zipfile.ZipFile(filepath, 'r') as zip_file:

    # Entries 1..7 of the zip listing are taken as the yearly archives and are
    # labelled 2014, 2015, ... in listing order.
    # NOTE(review): entry 0 is skipped - presumably the top-level folder of
    # the zip; confirm against the actual archive.
    for year, year_file in enumerate(zip_file.namelist()[1:8], start=2014):
      save_path = f'/content/drive/MyDrive/Colab Notebooks/raw_articles/{year}/'

      articles_by_media = {}

      # extract() returns the path it actually wrote, so the tar file is
      # opened from there instead of relying on the raw member name.
      extracted_path = zip_file.extract(year_file)

      # The context managers close the archive and every member handle.
      with tarfile.open(extracted_path) as tar:
        # NOTE(review): the first tar member is skipped as in the original
        # code - presumably the containing folder; confirm.
        for member in tar.getmembers()[1:]:
          json_file = tar.extractfile(member)
          if json_file is None:
            # extractfile() gives None for members that are not regular
            # files (e.g. directories); there is nothing to parse.
            continue
          with json_file:
            articles_by_media = read_data_json(json_file, articles_by_media)

      try:
        save_extracted_articles(articles_by_media, save_path)
      except FileNotFoundError as err:
        print(err)

In [None]:
def preprocess_articles(articles, stop_words, nlp):
  """
  Preprocess a list of raw articles.

  Each article is tokenized with gensim's simple_preprocess, keeping only
  tokens that are 4 to 25 characters long. Tokens found in stop_words are
  dropped, the remaining tokens are joined with single spaces and the
  resulting string is lemmatized with the nlp pipeline.

  Exactly one list of lemmas is produced per input article, in input order.
  An article with no remaining tokens yields an empty list and the pipeline
  is not called for it.

  :param articles: list of strings to preprocess
  :param stop_words: collection of words to be removed from articles
  :param nlp: pipeline for word lemmatization; it is called with a string of
  space-separated tokens and its result must provide ``sentences`` whose
  ``words`` carry a ``lemma`` attribute

  :return preprocessed_articles: a list of preprocessed articles (lists of lemmas)
  """

  # A set makes the per-token stop-word test independent of the list size.
  stop_word_set = set(stop_words)

  preprocessed_articles = []    # one list of lemmas per article

  for article in articles:
    # min_len=4 drops every token shorter than four characters
    kept_tokens = [
      token
      for token in simple_preprocess(article, min_len=4, max_len=25)
      if token not in stop_word_set
    ]

    if not kept_tokens:
      # Keep the output aligned with the input without running the pipeline
      # on an empty string.
      preprocessed_articles.append([])
      continue

    doc = nlp(' '.join(kept_tokens))
    lemmas = [word.lemma for sent in doc.sentences for word in sent.words]

    preprocessed_articles.append(lemmas)

  return preprocessed_articles



In [None]:
def preprocess_media_articles(media_list, load_dir, save_dir):
  """
  Preprocess articles from media_list files in load_dir and save them to save_dir

  For every file in load_dir whose name is in media_list and that has no
  counterpart in save_dir yet, the raw articles are loaded, articles with a
  body of 25 words or fewer and articles with a duplicated title are dropped
  (the last duplicate is kept), and the remaining bodies are passed to
  preprocess_articles. The result is written to save_dir under the same file
  name.

  File paths are built as ``load_dir + file`` and ``save_dir + file``, so both
  directories must end with a path separator.

  :param media_list: a list of media names we want to preprocess
  :param load_dir: a path to directory of files with raw articles
  :param save_dir: a path to directory where preprocessed files will be saved
  """

  # NLTK Slovene stop words plus markup/agency leftovers seen in the articles
  stop_words = stopwords.words('slovene')
  new_sw = ["href", "http", "https", "quot", "nbsp", "mailto", "mail", "getty", "foto", "images", "urbanec", "sportid"]
  stop_words.extend(new_sw)

  # Additional stop words, one per line. The encoding is given explicitly so
  # reading does not depend on the platform default.
  filepath = '/content/drive/MyDrive/Colab Notebooks/stopwords'
  with open(filepath, 'r', encoding='utf8') as f:
    additional_stopwords = f.read().splitlines()

  stop_words.extend(additional_stopwords)
  stop_words = list(set(stop_words))    # remove duplicates

  config = {
    'processors': 'tokenize, lemma', # Comma-separated list of processors to use
    'lang': 'sl', # Language code for the language to build the Pipeline in
    'tokenize_pretokenized': True, # Use pretokenized text as input and disable tokenization
    'use_gpu': True
  }
  nlp = classla.Pipeline(**config)


  for file in os.listdir(load_dir):

    if file not in media_list:
      continue

    # Files that were already preprocessed are not processed again.
    save_filepath = save_dir + file
    if os.path.exists(save_filepath):
      print("File ", file, " already exists")
      continue

    # Also creates missing parent directories, e.g. on the first run for a year.
    os.makedirs(save_dir, exist_ok=True)

    load_filepath = load_dir + file
    articles = read_json_file(load_filepath)

    # Keep bodies longer than 25 whitespace-separated words and one article
    # (the last) per title; the helper column is removed again afterwards.
    df = pd.DataFrame.from_dict(articles)
    df['word_length'] = df.body.apply(lambda x: len(str(x).split()))
    df = df.loc[df['word_length'] > 25]
    df = df.drop_duplicates(subset='title', keep="last")
    df = df.drop('word_length', axis=1)
    articles = df.to_dict('list')


    print(f"Preprocessing file: {file} with {len(articles['body'])} articles")
    preprocessed_articles = preprocess_articles(articles['body'], stop_words, nlp)

    save_articles(preprocessed_articles, save_filepath)
    print(f"File saved to {save_filepath}!\n**********************")

# Main

Nastavljanje konstant

In [None]:
# Year of the dataset that is processed and analysed.
YEAR = 2020

# Media outlets whose articles are preprocessed and analysed.
media_list = [
  'Dnevnik',
  'MMC RTV Slovenija',
  '24ur.com',
  'Siol.net Novice',
  'Nova24TV',
  'Tednik Demokracija',
  'PortalPolitikis',
]

# Directory with the raw articles of YEAR and directory for the preprocessed ones.
load_dir = f'/content/drive/MyDrive/Colab Notebooks/raw_articles/{YEAR}/'
save_dir = f'/content/drive/MyDrive/Colab Notebooks/preprocessed_articles/{YEAR}/'

## **Predprocesiranje člankov**

V tem delu preberemo članke medijev, navedenih v `media_list`, ter odstranimo tiste, katerih besedilo ima 25 besed ali manj, in tiste, ki imajo znotraj posameznega medija enak naslov (duplikati; obdržimo zadnjega).

Nato vsak članek razdelimo na besede (angl. tokenize) ter odstranimo vse besede, ki so v `stop_words` (besede brez samostojnega pomena, npr. da, tako, in ...), in vse besede, krajše od 4 črk. Preostale besede lematiziramo (pretvorimo v osnovno obliko).



In [None]:
# Preprocess the selected media of YEAR; files already present in save_dir are skipped.
preprocess_media_articles(
  media_list=media_list,
  load_dir=load_dir,
  save_dir=save_dir,
)

**Post-procesiranje**

Pri določenih medijih se določeni deli člankov pojavljajo v mnogih člankih, zato je smiselno te ponavljajoče se dele odstraniti vsaj iz že pred-procesiranih člankov.

*Slovenska tiskovna agencija STA:* Vsak članek se začne na način: 'Ljubljana, 29. oktobra (STA)' - 'vsebina članka'. Te dele torej odstranimo.

*24ur.com*: Veliko člankov ima na začetku članka del besedila, ki se nanaša na omogočanje piškotkov spletnega mesta. Ta del odstranimo iz člankov.

*Siol.net Novice*: Veliko člankov se začne z besedilom, ki se nanaša na t. i. *termometer* in bralcu razloži njegovo vlogo pri poročanju o popularnosti članka. Tudi te dele odstranimo iz člankov.


Poleg tega odstranimo tudi članke z manj kot 25 besedami.

In [None]:
# prepare_dataframe comes from utils; its result is used below as a DataFrame
# with (at least) the columns 'media' and 'preprocessed_body'.
df = prepare_dataframe(media_list, YEAR)

In [None]:
# Post-processing of the already preprocessed bodies. The assignments and the
# save calls are commented out on purpose; un-comment the ones that should be
# applied. The active print statements only inspect the current state.

# FOR STA PREPROCESSED BODIES

print(df.loc[df.media == 'Slovenska tiskovna agencija STA', 'preprocessed_body'])
# df.loc[df.media == 'Slovenska tiskovna agencija STA', 'preprocessed_body'] = df.loc[df.media == 'Slovenska tiskovna agencija STA', 'preprocessed_body'].apply(lambda x: x[1:])

# If the output of the next line no longer contains a place name or a month at
# the start of each list, the repeated parts have been removed
print(df.loc[df.media == 'Slovenska tiskovna agencija STA', 'preprocessed_body'])

# # FOR Siol PREPROCESSED BODIES
# df.loc[df.media == 'Siol.net Novice', 'preprocessed_body'] = df.loc[df.media == 'Siol.net Novice', 'preprocessed_body'].apply(lambda x: x[10:] if len(x) > 0 and x[0] == 'termometer' else x)

# If the output of the next line equals "['ne']", the repeated parts of the
# text have been removed. The length check keeps an empty lemma list from
# raising IndexError; such a list is reported as 'ne'.
print(df.loc[df.media == 'Siol.net Novice', 'preprocessed_body'].apply(lambda x: 'ja' if len(x) > 0 and x[0] == 'termometer' else 'ne').unique())

# # FOR 24ur.com PREPROCESSED BODIES
# df.loc[df.media == '24ur.com', 'preprocessed_body'] = df.loc[df.media == '24ur.com', 'preprocessed_body'].apply(lambda x: x[10:] if 'piškotek' in x[:9] else x)

# If the output of the next line equals "['ne']", the repeated parts of the
# text have been removed
print(df.loc[df.media == '24ur.com', 'preprocessed_body'].apply(lambda x: 'ja' if 'piškotek' in x[:9] else 'ne').unique())


# NOTE(review): before enabling the save calls below, confirm that
# save_preprocessed_articles is available (only save_articles is imported from
# utils in this notebook), that '/content/gdrive/' is the intended mount point
# (the rest of the notebook uses '/content/drive/'), and that the hard-coded
# 2017 in the STA line is intended instead of YEAR.
# save_preprocessed_articles(df.loc[df.media == 'Slovenska tiskovna agencija STA', 'preprocessed_body'].to_list(), '/content/gdrive/MyDrive/Colab Notebooks/preprocessed_articles/'+ str(2017) + '/' + 'Slovenska tiskovna agencija STA')
# save_preprocessed_articles(df.loc[df.media == 'Siol.net Novice', 'preprocessed_body'].to_list(), '/content/gdrive/MyDrive/Colab Notebooks/preprocessed_articles/'+ str(YEAR) + '/' + 'Siol.net Novice')
# save_preprocessed_articles(df.loc[df.media == '24ur.com', 'preprocessed_body'].to_list(), '/content/gdrive/MyDrive/Colab Notebooks/preprocessed_articles/'+ str(YEAR) + '/' + '24ur.com')

## **Predstavitev končnih podatkov**

Prikaz števila člankov posameznega medija v izbranem letu (20 medijev z največ članki).

In [None]:
# Number of raw articles per file (media) in load_dir; sub-directories are ignored.
articles_per_file = {}
for entry in os.listdir(load_dir):
  entry_path = f'{load_dir}{entry}'
  if not os.path.isfile(entry_path):
    continue
  articles_per_file[entry] = len(read_json_file(entry_path)['body'])

# Keep the 20 media with the most articles, largest first.
ranked = sorted(articles_per_file.items(), key=lambda pair: pair[1], reverse=True)
count = dict(ranked[:20])

visualize_articles_by_media(list(count.keys()), list(count.values()))

In [None]:
# Same prepare_dataframe call as in the post-processing section; the cells
# below read the 'media' and 'word_length' columns of the result.
df = prepare_dataframe(media_list, YEAR)

Prikaz števila člankov izbranih medijev v izbranem letu

In [None]:
# Number of articles per selected media, ordered as returned by value_counts().
count_articles = df.media.value_counts().to_dict()
media_names = list(count_articles.keys())
counts = list(count_articles.values())
print(f'Število vseh člankov skupaj: {sum(counts)}')
# Pass the list of names (not the dictionary), matching the other call of
# visualize_articles_by_media in this notebook, which passes two lists.
visualize_articles_by_media(media_names, counts)

Prikaz števila besed v člankih izbranih medijev (skupno)

In [None]:
# Word-count overview for all selected media together; dataframe_info (from
# utils) receives the whole dataframe and the name of the 'word_length' column.
dataframe_info(df, 'word_length')

Prikaz števila besed v člankih izbranih medijev (vsak medij posebej)

In [None]:
# One word-count overview per selected media outlet.
for outlet in media_list:
  print(f'\n{outlet}')
  outlet_rows = df.loc[df.media == outlet]
  dataframe_info(outlet_rows, 'word_length', outlet)