In [1]:
import json
import pandas as pd
import numpy as np
import os
import gzip


# Read news data and select the relevant fields into a DataFrame

In [7]:
def read_json_file(filepath):
    """
    Load a UTF-8 encoded JSON file and return its parsed content.
    :param filepath: location of the JSON file to read
    :return: the deserialized JSON document
    """
    with open(filepath, encoding="utf-8") as json_handle:
        return json.load(json_handle)

def read_news_raw_data_json(filepath):
    """
    Extract the article fields we care about from one results page.

    The file is read with ``read_json_file`` and the article entries are taken
    from ``['articles']['results']``. Every entry is reduced to a small
    dictionary with the keys ``body``, ``media`` (the title of the article's
    source), ``title`` and ``date``.

    :param filepath: path to the json file holding one page of results
    :return: list of dictionaries, one per article, in file order
    """
    page = read_json_file(filepath)

    # more fields (such as url, ...) can be added to the dictionary below if needed
    return [
        {
            'body': entry['body'],
            'media': entry['source']['title'],
            'title': entry['title'],
            'date': entry['date'],
        }
        for entry in page['articles']['results']
    ]


def read_year_news_raw_into_df(path, year):
    """
    Creates a dataframe of news articles for a specified year.

    Every regular file directly inside ``<path>/<year>`` is parsed with
    ``read_news_raw_data_json``. The per-file frames are collected in a list
    and concatenated once at the end, instead of re-copying the accumulated
    frame for every file.

    :param path: path to the news data directory
    :param year: name of the year subdirectory (e.g. '2020'); converted with str()
    :return: dataframe with data about news from specified year (empty if the
             directory holds no files), or None if a file contains invalid JSON
    """
    year_path = os.path.join(path, str(year))

    frames = []
    for filename in os.listdir(year_path):
        f = os.path.join(year_path, filename)
        # skip sub-directories (e.g. .ipynb_checkpoints) and other non-file entries
        if not os.path.isfile(f):
            continue
        try:
            articles = read_news_raw_data_json(f)
        except ValueError as e:
            print('invalid json: %s' % e)
            # returning None on the first broken file is kept for backward compatibility
            return None
        frames.append(pd.DataFrame(articles))

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
        
        
    

In [8]:
%%time
# Root directory of the Event Registry data (absolute, machine-specific path).
EVENTREGISTRY_PATH = "C:/Users/hladn/FAKS/Magistrsko delo/data/eventregistry"
# NOTE(review): `filepath` is not used anywhere else in this cell — confirm whether another cell needs it.
filepath = os.path.join(EVENTREGISTRY_PATH, "2020", "clanki.2020.page3.json")
# Read everything under the '2020' subdirectory into a single dataframe.
df_year_news =  read_year_news_raw_into_df(EVENTREGISTRY_PATH, '2020')
# Bare last expression so the notebook renders the resulting frame.
df_year_news

CPU times: total: 2min 48s
Wall time: 2min 56s


Unnamed: 0,body,media,title,date
0,"Ljubljana, 01. januarja (STA) - Vstopili smo v...",Slovenska tiskovna agencija STA,Več desettisoč ljudi novo leto pričakalo na pr...,2020-01-01
1,Številna slovenska večja in manjša mesta že tr...,MMC RTV Slovenija,V Sloveniji številni na prostem pričakali novo...,2020-01-01
2,Zmago Jelinčič Plemeniti je resnično plemenit ...,Časnik.si - Spletni magazin z mero,"Šarec in Jelinčič ljubita Srbe, vohunstvo in k...",2020-01-01
3,Ob vstopu v novo leto je nebo nad Ljutomerom r...,Prlekija-on.net,Vstopili smo v leto 2020. Srečno!,2020-01-01
4,Vstopili smo v leto 2020. Številni so novo let...,Tednik Demokracija,Več desettisoč ljudi novo leto pričakalo na pr...,2020-01-01
...,...,...,...,...
450420,V ZDA se je do ponedeljka večer z novim korona...,PortalPolitikis,V ZDA s koronavirusom okuženih več kot 164.000...,2020-03-31
450421,"""Na začetku mislim, da smo vsi vse skupaj malc...",MMC RTV Slovenija,"Samir Handanović doma karta, igra šah, bere kn...",2020-03-31
450422,Idrija - Območje krajinskega parka Zgornja Idr...,Delo,Krajinski park vendarle dobil upravljavca,2020-03-31
450423,"Singapur, 31. marca (STA) - Cene nafte so se v...",Slovenska tiskovna agencija STA,Cene nafte nekoliko okrevale,2020-03-31


In [13]:
# Store the 2020 news dataframe as a gzip-compressed Parquet file.
df_year_news.to_parquet('C:/Users/hladn/FAKS/Magistrsko delo/data/eventregistry/df_news_2020.parquet.gzip',compression='gzip')


In [14]:
# Read the 2020 news dataframe back from its Parquet file (rebinds df_year_news) and display it.
df_year_news = pd.read_parquet('C:/Users/hladn/FAKS/Magistrsko delo/data/eventregistry/df_news_2020.parquet.gzip')
df_year_news

Unnamed: 0,body,media,title,date
0,"Ljubljana, 01. januarja (STA) - Vstopili smo v...",Slovenska tiskovna agencija STA,Več desettisoč ljudi novo leto pričakalo na pr...,2020-01-01
1,Številna slovenska večja in manjša mesta že tr...,MMC RTV Slovenija,V Sloveniji številni na prostem pričakali novo...,2020-01-01
2,Zmago Jelinčič Plemeniti je resnično plemenit ...,Časnik.si - Spletni magazin z mero,"Šarec in Jelinčič ljubita Srbe, vohunstvo in k...",2020-01-01
3,Ob vstopu v novo leto je nebo nad Ljutomerom r...,Prlekija-on.net,Vstopili smo v leto 2020. Srečno!,2020-01-01
4,Vstopili smo v leto 2020. Številni so novo let...,Tednik Demokracija,Več desettisoč ljudi novo leto pričakalo na pr...,2020-01-01
...,...,...,...,...
450420,V ZDA se je do ponedeljka večer z novim korona...,PortalPolitikis,V ZDA s koronavirusom okuženih več kot 164.000...,2020-03-31
450421,"""Na začetku mislim, da smo vsi vse skupaj malc...",MMC RTV Slovenija,"Samir Handanović doma karta, igra šah, bere kn...",2020-03-31
450422,Idrija - Območje krajinskega parka Zgornja Idr...,Delo,Krajinski park vendarle dobil upravljavca,2020-03-31
450423,"Singapur, 31. marca (STA) - Cene nafte so se v...",Slovenska tiskovna agencija STA,Cene nafte nekoliko okrevale,2020-03-31


# Read tweet data and select the relevant attributes into a DataFrame

In [2]:


def read_tweets_raw_data(filepath):
    """
    Extract the tweet fields we care about from one gzip-compressed JSON file.

    The decompressed, UTF-8 decoded content is parsed as JSON and iterated;
    every tweet is reduced to a small dictionary with the keys ``full_text``,
    ``created_at``, ``user_screen_name``, ``in_reply_to_user_id``,
    ``is_quote_status``, ``user_id`` and ``id``.

    :param filepath: path to a json.gz file containing tweets
    :return: list of dictionaries, one per tweet, in file order
    """
    with gzip.open(filepath, 'rt', encoding='utf-8') as handle:
        raw_tweets = json.load(handle)

    # more fields can be added to the dictionary below if needed
    return [
        {
            'full_text': raw['full_text'],
            'created_at': raw['created_at'],
            'user_screen_name': raw['user']['screen_name'],
            'in_reply_to_user_id': raw['in_reply_to_user_id'],
            'is_quote_status': raw['is_quote_status'],
            'user_id': raw['user']['id'],
            'id': raw['id'],
        }
        for raw in raw_tweets
    ]


def read_tweets_raw_into_df(path):
    """
    Creates a dataframe of tweets.

    Every regular file directly inside ``path`` is parsed with
    ``read_tweets_raw_data``. The per-file frames are collected in a list and
    concatenated once at the end, instead of re-copying the accumulated frame
    for every file. The name of each file is printed before it is read.

    :param path: path to the tweet data directory
    :return: dataframe with data about tweets (empty if the directory holds no
             files), or None if a file contains invalid JSON
    """
    frames = []
    for filename in os.listdir(path):
        f = os.path.join(path, filename)
        # skip sub-directories (e.g. .ipynb_checkpoints) and other non-file entries
        if not os.path.isfile(f):
            continue
        print(filename)
        try:
            tweets = read_tweets_raw_data(f)
        except ValueError as e:
            print('invalid json: %s' % e)
            # returning None on the first broken file is kept for backward compatibility
            return None
        frames.append(pd.DataFrame(tweets))

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
        

In [23]:
TWEETS_PATH = "C:/Users/hladn/FAKS/Magistrsko delo/data/sl-tweets/sl-tweets-2021"
%%time
all_tweets = read_tweets_raw_into_df(TWEETS_PATH)
all_tweets.to_parquet('C:/Users/hladn/FAKS/Magistrsko delo/data/sl-tweets/df_sl_tweets_21.parquet.gzip',compression='gzip')


In [5]:
# Read the tweet dataframe from its Parquet file and display it.
df_tweets = pd.read_parquet('C:/Users/hladn/FAKS/Magistrsko delo/data/sl-tweets/df_sl_tweets_21.parquet.gzip')
df_tweets

Unnamed: 0,full_text,created_at,user_screen_name,in_reply_to_user_id,is_quote_status,user_id,id
0,Skrivnost ženske https://t.co/BcYeRacFol,Sun Jan 17 18:35:26 +0000 2021,Karmen5,,False,36331012,1350874387394797578
1,Duhovni odmik za ženske MARIJA MAGDALENA – Žen...,Tue Apr 06 10:44:07 +0000 2021,Karmen5,,False,36331012,1379384416384999425
2,@Janez_Mezan Kljub temu jabka ne pade daleč od...,Fri Jan 01 14:14:01 +0000 2021,marjandv,2.354632e+09,False,131540541,1345010394352541696
3,@Japreva Si pa res čuden.,Fri Jan 01 21:54:05 +0000 2021,marjandv,4.343839e+08,False,131540541,1345126173571952645
4,@Janez_Mezan @Dnevnik_si Levak odpre usta in s...,Mon Jan 04 18:51:20 +0000 2021,marjandv,2.354632e+09,False,131540541,1346167347560783874
...,...,...,...,...,...,...,...
11920896,RT @VRUNDRAGICA: @strankaSD Vi ste pustili pra...,Thu Mar 18 15:06:37 +0000 2021,MartinTincek,,False,1055402646,1372565109550243851
11920897,"RT @MarkoPoznic: Opozicija ima tisoč, deset ti...",Thu Mar 18 15:07:12 +0000 2021,MartinTincek,,False,1055402646,1372565256212451328
11920898,"RT @JJansaSDS: “Vojska slonov, ki jo vodi lev,...",Thu Mar 18 15:08:32 +0000 2021,MartinTincek,,True,1055402646,1372565592004304902
11920899,RT @PodobnikMar: @JJansaSDS Koliko trpljenja j...,Thu Mar 18 15:09:06 +0000 2021,MartinTincek,,False,1055402646,1372565732668624898
