In [None]:
import sqlite3
from pathlib import Path
import json
import requests
from collections import namedtuple

from IPython.display import display, HTML
from bs4 import BeautifulSoup

from secrets.config import config

In [None]:
def get_url(url: str, timeout: float = 30.0) -> "requests.Response":
    """Fetch `url` with an HTTP GET and return the response object.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled host would block the notebook indefinitely.

    Returns:
        The response object produced by `requests.get` (not a string; callers
        read `.json()` / `.content` from it).

    Raises:
        Exception: with args `('invalid response code', response)` when
            `response.ok` is false.
        Any exception raised by `requests.get` itself propagates unchanged.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        raise Exception('invalid response code', response)
    return response

# Download the current top stories for the daily-mail source from NewsAPI.
# The key is read from the local config and must never be printed or logged.
apiKey = config["newsApiKey"]

response = get_url("https://newsapi.org/v1/articles?source=daily-mail&sortBy=top&apiKey={}".format(apiKey))
j = response.json()
articles = j["articles"]
print(len(articles))
# An empty result is valid; indexing [0] unguarded would raise IndexError.
if articles:
    print(articles[0])

In [None]:
# Record type for one scraped article. `full_text` starts out empty and is
# filled in by a later step that downloads each article page.
article = namedtuple("article", "headline excerpt full_text image_url article_url")

# One record per API entry, mapping the API's field names onto ours.
article_tuples = [
    article(
        headline=entry["title"],
        excerpt=entry["description"],
        full_text="",
        image_url=entry["urlToImage"],
        article_url=entry["url"],
    )
    for entry in articles
]

def get_article_text(a_url: str) -> str:
    """Download an article page and return its body text.

    The body is the `<div itemprop="articleBody">` element; only its direct
    `<p>` children are used, joined with newlines.

    Args:
        a_url: URL of the article page.

    Returns:
        The paragraph texts joined by '\\n', or "" when the page has no
        `articleBody` div.

    Raises:
        Whatever `get_url` raises for a failed request.
    """
    response = get_url(a_url)
    # errors='replace': one malformed byte in a page must not abort the scrape
    # with UnicodeDecodeError; valid UTF-8 decodes exactly as before.
    html = response.content.decode('utf-8', errors='replace')
    soup = BeautifulSoup(html, 'lxml')
    body = soup.find('div', {'itemprop': 'articleBody'})
    if body is None:
        return ""
    # recursive=False keeps only top-level paragraphs of the body div.
    paragraphs = body.find_all('p', recursive=False)
    return '\n'.join(p.text for p in paragraphs)

# Download the page behind every record and build a copy of the record that
# carries the article text in `full_text`; the other fields are unchanged.
article_tuples_full = [
    record._replace(full_text=get_article_text(record.article_url))
    for record in article_tuples
]

#print(article_tuples_full[0])

In [None]:
# Replace everything stored for one news source with the freshly scraped batch.
path_data = Path('sql')
path_data.mkdir(exist_ok=True)

db_file = path_data / 'db.sqlite'
# NOTE(review): this schema script is not executed here; the `company` and
# `article` tables are presumably created beforehand — confirm.
create_tables_file = path_data / 'create_tables.sql'


company_name = 'daily-mail'


def _print_row_count(cursor, table):
    """Print how many rows `table` holds; `table` must be a trusted literal."""
    cursor.execute(f"select count(*) from {table}")
    print(cursor.fetchone()[0])


con = sqlite3.connect(str(db_file))
try:
    # `with con` only manages the transaction (commit on success, rollback on
    # error), so the delete + re-insert below is applied atomically. Closing
    # the connection is done separately in `finally`.
    with con:
        cur = con.cursor()
        _print_row_count(cur, "company")
        _print_row_count(cur, "article")

        # Remove the articles of every company row carrying this name, so no
        # article is left pointing at a company row that is deleted next.
        cur.execute(
            'DELETE FROM article WHERE source_id IN '
            '(SELECT source_id FROM company WHERE name = ?)',
            (company_name,),
        )
        cur.execute('DELETE FROM company WHERE name = ?', (company_name,))
        cur.execute('INSERT INTO company (name) VALUES (?)', (company_name,))

        # Read back the id the database assigned to the new company row.
        cur.execute('SELECT source_id from company WHERE name = ?', (company_name,))
        row = cur.fetchone()
        if row is None:
            raise RuntimeError(f'company {company_name!r} not found after insert')
        company_id = row[0]

        _print_row_count(cur, "article")
        cur.executemany(
            'INSERT INTO article (source_id, headline, excerpt, image_url, article_url, full_text) '
            'VALUES (?, ?, ?, ?, ?, ?)',
            [
                (company_id, a.headline, a.excerpt, a.image_url, a.article_url, a.full_text)
                for a in article_tuples_full
            ],
        )
        _print_row_count(cur, "article")
        _print_row_count(cur, "company")
finally:
    con.close()