In [None]:
import sqlite3
from pathlib import Path
from typing import Dict
from urllib.parse import quote
from xml.dom import minidom

import requests
from bs4 import BeautifulSoup
from IPython.display import display, HTML

from secrets.config import config

In [None]:
def get_url(url: str, timeout: float = 30.0) -> str:
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook kernel on a stalled connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        Exception: If ``response.ok`` is false; the response object is
            attached as the second exception argument.
        requests.RequestException: On connection errors or when the timeout
            expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        raise Exception('invalid response code', response)
    return response.content.decode('utf-8')

def get_token(login_url: str='https://commerce.reuters.com/rmd/rest/xml/login?username={username}&password={password}',
              config: Dict[str, str] = None) -> str:
    """Log in and return the authentication token from the login response.

    Args:
        login_url: URL template containing ``{username}`` and ``{password}``
            placeholders.
        config: Mapping with the raw (not percent-encoded) credentials under
            the keys ``'user'`` and ``'password'``. Required despite the
            ``None`` default.

    Returns:
        The text of the ``authtoken`` element found in the login response.

    Raises:
        TypeError: If ``config`` is not supplied.
        KeyError: If ``config`` lacks ``'user'`` or ``'password'``.
        Exception: Propagated from ``get_url`` when the request fails.
    """
    if config is None:
        raise TypeError("get_token() requires a config mapping with 'user' and 'password' keys")
    # Percent-encode the credentials: characters such as '&', '#', '%', '+' or
    # a space would otherwise be read as query-string syntax and corrupt the
    # login request.
    login_url = login_url.format(username=quote(config['user'], safe=''),
                                 password=quote(config['password'], safe=''))
    content = get_url(login_url)
    soup = BeautifulSoup(content, 'lxml')
    return soup.find('authtoken').text

# Log in once; the resulting token is interpolated into every request URL below.
auth_token = get_token(config=config)

In [None]:
# Download the channel listing and pretty-print the raw XML for inspection.
# The parsed DOM gets its own name so that `channels` only ever refers to the
# list of (alias, description) tuples; re-running this cell out of order can
# then no longer clobber that list with a DOM document.
channels_raw = get_url(f'https://rmb.reuters.com/rmd/rest/xml/channels?&token={auth_token}')
channels_dom = minidom.parseString(channels_raw)
print(channels_dom.toprettyxml())

In [None]:
# Parse the channel listing and collect (alias, description) for every
# channel whose <category> element carries the id "OLR".
soup = BeautifulSoup(channels_raw, 'lxml')
channels = []
for channel_info in soup.find_all('channelinformation'):
    if channel_info.find('category').get('id') != 'OLR':
        continue
    channels.append((channel_info.find('alias').text,
                     channel_info.find('description').text))
channels[:5]

In [None]:
# Request the items of the first listed channel (mediaType=T) and
# pretty-print the raw XML for inspection.
# The parsed DOM gets its own name so that `items` only ever refers to the
# list of item tuples; re-running this cell out of order can then no longer
# clobber that list with a DOM document.
channel = channels[0][0]
items_raw = get_url(f'https://rmb.reuters.com/rmd/rest/xml/items?channel={channel}&mediaType=T&token={auth_token}')
items_dom = minidom.parseString(items_raw)
print(items_dom.toprettyxml())

In [None]:
soup = BeautifulSoup(items_raw, 'lxml')
# One (id, headline, guid, version) tuple per <result> element.
item_records = [(result.find('id').text, result.find('headline').text,
                 result.find('guid').text, result.find('version').text)
                for result in soup.find_all('result')]


def _version_key(version: str) -> tuple:
    """Return a sort key that orders version strings numerically.

    The version is read from XML as text, and comparing the raw strings would
    rank '10' below '9'. Versions that are not plain integers fall back to
    text comparison and rank below every numeric version.
    """
    try:
        return (1, int(version))
    except ValueError:
        return (0, version)


# Several versions of the same guid may be listed; remember the newest one.
newest_by_guid = {}
for record in item_records:
    guid, key = record[2], _version_key(record[3])
    if guid not in newest_by_guid or newest_by_guid[guid] < key:
        newest_by_guid[guid] = key

# Keep only the newest version of each guid, in the original listing order.
items = [(item_id, headline)
         for item_id, headline, guid, version in item_records
         if _version_key(version) == newest_by_guid[guid]]

items[:5]

In [None]:
# Download the first item from the filtered list and pretty-print its XML.
first_item = items[0]
item_id = first_item[0]
item_url = f'https://rmb.reuters.com/rmd/rest/xml/item?id={item_id}&token={auth_token}'
item_raw = get_url(item_url)
item = minidom.parseString(item_raw)
item_xml = item.toprettyxml()
print(item_xml)

In [None]:
# Collect the text of every <p> element inside the item's <inlinexml> element.
soup = BeautifulSoup(item_raw, 'lxml')
inline_xml = soup.find('inlinexml')
content = []
for paragraph in inline_xml.find_all('p'):
    content.append(paragraph.text)
content[:5]

In [None]:
# Make sure the local 'sql' directory exists, then open (creating it if
# necessary) the SQLite database file stored inside it.
path_data = Path('sql')
path_data.mkdir(exist_ok=True)
db_file = path_data.joinpath('db.sqlite')
db_location = str(db_file)
conn = sqlite3.connect(db_location)