https://www.err.ee/uudised

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import calendar
import datetime

In [2]:
headers = requests.utils.default_headers()
headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'})

In [3]:
col_names = ['id', 'heading', 'lead', 'publishDate', 'updatedDate', 'categoryName', 'isPodcast', 'lang', 'url']
df = pd.DataFrame(columns=col_names)
df

Unnamed: 0,id,heading,lead,publishDate,updatedDate,categoryName,isPodcast,lang,url


In [4]:
year = 2021
month = 12

start_dates = pd.date_range(start=datetime.date(year, month, 1), 
                            end=datetime.date(year, month, calendar.monthrange(year, month)[1]))

end_dates = start_dates + pd.DateOffset(1)

# Scrape article meta info

In [5]:
def fetch_content(start_date, end_date):
    url = f'https://www.err.ee/api/category/latest/109?from={start_date}&to={end_date}&limit=500'
    req = requests.get(url, headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    content_json = json.loads(soup.text)
    return content_json

In [6]:
for s, e in zip(start_dates, end_dates):
    start_date = s.strftime('%d.%m.%Y')
    end_date = e.strftime('%d.%m.%Y')
    content_json = fetch_content(start_date, end_date)
    
    for content in content_json:
        values = [content['id'], 
                  content['heading'],
                  content['lead'],
                  pd.to_datetime(content['publicStart'], unit='s'),
                  pd.to_datetime(content['updated'], unit='s'),
                  content['primaryCategory']['name'],
                  content['primaryCategory']['isPodcast'],
                  content['primaryCategory']['lang'],
                  content['canonicalUrl']
                 ]
        data = dict(zip(df.columns, values))
        df = df.append(data, ignore_index=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3462 entries, 0 to 3461
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            3462 non-null   object        
 1   heading       3462 non-null   object        
 2   lead          3462 non-null   object        
 3   publishDate   3462 non-null   datetime64[ns]
 4   updatedDate   3462 non-null   datetime64[ns]
 5   categoryName  3462 non-null   object        
 6   isPodcast     3462 non-null   object        
 7   lang          3462 non-null   object        
 8   url           3462 non-null   object        
dtypes: datetime64[ns](2), object(7)
memory usage: 243.5+ KB


# Scrape article text

In [8]:
def fetch_article_bodytext(url):
    try:
        req = requests.get(url, headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        body = soup.find('div', {'class':'text'}).text.strip()
        return body
    except:
        return 'Something went wrong'

In [9]:
# df['articleBody'] = df['url'].apply(fetch_article_bodytext)

In [10]:
urls = list(df['url'])
texts = []

for u in urls:
    texts.append(fetch_article_bodytext(u))
    now = datetime.datetime.now().strftime(format='%H:%M:%S')
    percent = round(len(texts)/len(urls)*100, 4)
#     print(f'{now} {len(texts)}/{len(urls)} {percent}%')
    
df['articleText'] = texts

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3462 entries, 0 to 3461
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            3462 non-null   object        
 1   heading       3462 non-null   object        
 2   lead          3462 non-null   object        
 3   publishDate   3462 non-null   datetime64[ns]
 4   updatedDate   3462 non-null   datetime64[ns]
 5   categoryName  3462 non-null   object        
 6   isPodcast     3462 non-null   object        
 7   lang          3462 non-null   object        
 8   url           3462 non-null   object        
 9   articleText   3462 non-null   object        
dtypes: datetime64[ns](2), object(8)
memory usage: 270.6+ KB


In [12]:
df.to_csv(f'data/news-{year}-{month:02d}.csv', index=False)