In [1]:
import json
import logging
import os
from os import path

import numpy as np
import newspaper
from newspaper import Article

In [2]:
# Newspapers considered: front-page URL -> base name (without ".json") of the
# dataset file that append_articles() reads and rewrites for that site.
NEWSPAPERS = {
    'http://libertatea.ro': 'news_libertatea',
    'https://stirileprotv.ro': 'news_protv',
    'https://www.digi24.ro': 'news_digi',
    'https://www.realitatea.net': 'news_realitatea',
    'https://www.mediafax.ro': 'news_mediafax',
    'https://adevarul.ro': 'news_adevarul',
    'https://alephnews.ro': 'news_aleph',
    'https://www.cotidianul.ro': 'news_cotidianul',
    'https://www.zf.ro': 'news_zf',
    'https://evz.ro': 'news_evz'
}
# Directory prefix that append_articles() concatenates with "<name>.json", so
# it has to end with a path separator.  The default is the original author's
# local folder; set the RO_NEWS_DATASETS_DIR environment variable before
# starting the kernel to point the notebook at a different directory.
BASE_PATH = os.path.join(
    os.environ.get('RO_NEWS_DATASETS_DIR')
    or '/Users/hakanmeva/Desktop/ro_news_datasets/RomanianNewsArticlesDataset/datasets/',
    '',  # appends a separator only when one is missing; the default is unchanged
)

In [3]:
# One-site mapping with the same shape as NEWSPAPERS (site URL -> dataset file
# base name).  Presumably meant for quick trial runs; nothing else in the
# visible notebook references it -- TODO confirm it is still needed.
test_dict = {"https://evz.ro": "news_evz"}

In [4]:
def get_articles_urls(news_url):
    """Return the distinct article URLs that ``newspaper`` finds for a site.

    Parameters
    ----------
    news_url
        Address handed to ``newspaper.build``.

    Returns
    -------
    list
        The ``url`` of each entry in the built source's ``articles``, in
        first-seen order, with repeated URLs dropped.
    """
    # memoize_articles=False is forwarded as-is; per the newspaper3k docs it
    # turns off the cache of articles seen on earlier builds.
    source = newspaper.build(news_url, memoize_articles=False)
    # dict keys are unique and keep insertion order, so this removes repeats
    # while preserving the order in which URLs were first encountered.
    return list(dict.fromkeys(entry.url for entry in source.articles))

In [5]:
def get_article_data(article_url):
    """Download one article and return its title, body text and summary.

    The article is created with ``language="ro"`` and then downloaded, parsed
    and run through ``nlp()``.  The function is deliberately best-effort: if
    any of those steps raises, the failure is logged as a warning and three
    empty strings are returned instead of propagating the error.

    Parameters
    ----------
    article_url
        Address passed to ``Article``.

    Returns
    -------
    tuple
        ``(title, text, summary)`` read from the article after ``nlp()``, or
        ``("", "", "")`` when downloading, parsing or summarising failed.
    """
    article = Article(article_url, language="ro")
    try:
        article.download()
        article.parse()
        article.nlp()
        return article.title, article.text, article.summary
    except Exception as exc:
        # Keep the empty-result contract, but leave a trace of which URL
        # failed and why instead of discarding the exception silently.
        logging.getLogger(__name__).warning(
            'Skipping article %s: %s: %s',
            article_url, type(exc).__name__, exc,
        )
        return "", "", ""

In [6]:
def _load_dataset(dataset_path):
    """Return the article list stored at ``dataset_path``; ``[]`` if the file is missing or empty."""
    try:
        if os.stat(dataset_path).st_size == 0:
            return []
    except FileNotFoundError:
        # First run for this newspaper: start a fresh dataset instead of
        # requiring an empty file to be created by hand.
        return []
    with open(dataset_path) as fp:
        return json.load(fp)


def _save_dataset(dataset_path, dataset_list):
    """Write ``dataset_list`` as JSON through a temporary file, then swap it into place."""
    # Opening the real file with 'w' would truncate it before json.dump runs,
    # so a failure mid-dump would destroy everything collected so far.
    tmp_path = dataset_path + '.tmp'
    with open(tmp_path, 'w') as json_file:
        json.dump(dataset_list, json_file,
                  indent=4,
                  separators=(',', ': '))
    os.replace(tmp_path, dataset_path)


def append_articles(NEWSPAPERS, BASE_PATH):
    """Scrape every configured newspaper and append its articles to its JSON dataset.

    For each ``news_url -> name`` pair, the list stored in
    ``BASE_PATH + name + '.json'`` is loaded (a missing or empty file counts
    as an empty list), every URL from ``get_articles_urls(news_url)`` is
    fetched with ``get_article_data``, and each article whose title, text and
    summary are all non-empty is appended as a
    ``{'title', 'text', 'summary'}`` dict.  The file is rewritten once per
    newspaper, after all of its URLs have been processed.  Progress and a
    final per-newspaper summary are printed.

    Parameters
    ----------
    NEWSPAPERS : dict
        Maps a newspaper URL to the base name of its dataset file.
    BASE_PATH : str
        Prefix concatenated directly with ``name + '.json'``, so it must
        already end with a path separator.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): nothing here checks whether an article is already in the
    file, so repeated runs can store the same article more than once --
    confirm whether de-duplication is wanted.
    """
    nr_articles = []
    for news_url, dataset_name in NEWSPAPERS.items():
        print('Working on: {}'.format(news_url))
        dataset_path = BASE_PATH + dataset_name + '.json'
        dataset_list = _load_dataset(dataset_path)

        urls = get_articles_urls(news_url)
        print('Number of urls: {}'.format(len(urls)))
        counter = 0
        for url in urls:
            title, text, summary = get_article_data(url)
            # get_article_data returns empty strings on failure; incomplete
            # articles are skipped entirely.
            if title != "" and text != "" and summary != "":
                dataset_list.append(
                    {
                        'title': title,
                        'text': text,
                        'summary': summary,
                    }
                )
                counter += 1
                if counter % 10 == 0:
                    print(counter)
        _save_dataset(dataset_path, dataset_list)
        nr_articles.append(counter)

    print('____________________________')
    # nr_articles was filled in NEWSPAPERS iteration order, so zip pairs each
    # newspaper with its own count.
    for news_url, appended in zip(NEWSPAPERS, nr_articles):
        print('Articles appended for {} : {}'.format(news_url, appended))
    print('Articles appended in total: {}'.format(sum(nr_articles)))
    print('____________________________')

In [154]:
%%time
# Run for 07.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Working on: https://stirileprotv.ro
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
Working on: https://www.digi24.ro
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
Working on: https://adevarul.ro
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480


In [7]:
%%time
# Run for 08.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 294
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Working on: https://stirileprotv.ro
Number of urls: 372
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
Working on: https://www.digi24.ro
Number of urls: 270
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
Number of urls: 90
10
20
30
40
50
60
70
80
Working on: https://www.mediafax.ro
Number of urls: 452
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
Working on: https://adevarul.ro
Number of urls: 903
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360


In [7]:
%%time
# Run for 09.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 295
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Working on: https://stirileprotv.ro
Number of urls: 531
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
Working on: https://www.digi24.ro
Number of urls: 270
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
Number of urls: 91
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 439
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
Working on: https://adevarul.ro
Number of urls: 903
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
2

In [7]:
%%time
# Run for 10.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 299
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Working on: https://stirileprotv.ro
Number of urls: 375
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
Working on: https://www.digi24.ro
Number of urls: 267
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
Working on: https://www.realitatea.net
Number of urls: 92
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 448
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
Working on: https://adevarul.ro
Number of urls: 899
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
3

In [7]:
%%time
# Run for 11.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 290
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Working on: https://stirileprotv.ro
Number of urls: 528
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
Working on: https://www.digi24.ro
Number of urls: 282
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
Working on: https://www.realitatea.net
Number of urls: 92
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 455
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
Working on: https://adevarul.ro
Number of urls: 919
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
1

In [7]:
%%time
# Run for 12.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 282
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
Working on: https://stirileprotv.ro
Number of urls: 382
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
Working on: https://www.digi24.ro
Number of urls: 271
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
Number of urls: 92
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 456
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
Working on: https://adevarul.ro
Number of urls: 893
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
3

In [7]:
%%time
# Run for 13.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 284
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
Working on: https://stirileprotv.ro
Number of urls: 530
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
Working on: https://www.digi24.ro
Number of urls: 270
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
Number of urls: 92
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 460
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
Working on: https://adevarul.ro
Number of urls: 872
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
2

In [7]:
%%time
# Run for 14.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 282
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
Working on: https://stirileprotv.ro
Number of urls: 371
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
Working on: https://www.digi24.ro
Number of urls: 273
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
Number of urls: 92
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 477
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
Working on: https://adevarul.ro
Number of urls: 868
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
3

In [7]:
%%time
# Run for 15.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 287
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
Working on: https://stirileprotv.ro
Number of urls: 536
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
Working on: https://www.digi24.ro
Number of urls: 283
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
Working on: https://www.realitatea.net
Number of urls: 91
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 470
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
Working on: https://adevarul.ro
Number of urls: 860
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
1

In [7]:
%%time
# Run for 16.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 288
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
Working on: https://stirileprotv.ro
Number of urls: 542
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
Working on: https://www.digi24.ro
Number of urls: 270
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
Number of urls: 91
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 449
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
Working on: https://adevarul.ro
Number of urls: 893
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
2

In [7]:
%%time
# Run for 17.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 298
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Working on: https://stirileprotv.ro
Number of urls: 382
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
Working on: https://www.digi24.ro
Number of urls: 277
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
Number of urls: 91
10
20
30
40
50
60
70
80
90
Working on: https://www.mediafax.ro
Number of urls: 461
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
Working on: https://adevarul.ro
Number of urls: 862
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
3

In [7]:
%%time
# Run for 18.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 298
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Working on: https://stirileprotv.ro
Number of urls: 538
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
Working on: https://www.digi24.ro
Number of urls: 271
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
Working on: https://www.realitatea.net
Number of urls: 91
10
20
30
40
50
60
70
80
Working on: https://www.mediafax.ro
Number of urls: 452
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
Working on: https://adevarul.ro
Number of urls: 882
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200


In [7]:
%%time
# Run for 19.08.22 (DD.MM.YY)
append_articles(NEWSPAPERS, BASE_PATH)

Working on: http://libertatea.ro
Number of urls: 292
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
Working on: https://stirileprotv.ro
Number of urls: 377
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
Working on: https://www.digi24.ro
Number of urls: 280
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
Working on: https://www.realitatea.net
Number of urls: 91
10
20
30
40
50
60
70
80
Working on: https://www.mediafax.ro
Number of urls: 459
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
Working on: https://adevarul.ro
Number of urls: 474
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
