In [83]:
import feedparser
import requests
import newspaper
import pandas
import newsgraph
import json
import datetime
from google.cloud import bigquery
from newsgraph.bigquery import insert_into_table, create_table_if_not_exists

In [72]:
feed_url = "http://ep00.epimg.net/rss/elpais/portada.xml"
feed = feedparser.parse(feed_url)
feed.entries[0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'id', 'guidislink', 'authors', 'author', 'author_detail', 'summary', 'summary_detail', 'published', 'published_parsed', 'tags', 'content', 'comments'])

In [73]:
def convert_article_to_dict(article):
    attributes = [ 'additional_data', 'article_html', 'authors', 'canonical_link',
        'download_exception_msg', 'download_state', 'html', 'images', 'imgs', 'keywords',
        'link_hash', 'meta_data', 'meta_description', 'meta_favicon', 'meta_img',
        'meta_keywords', 'meta_lang', 'meta_site_name', 'movies', 'publish_date',
        'source_url', 'summary', 'tags', 'text', 'title', 'top_image', 'top_img', 'url']
    return {attr: getattr(article, attr) for attr in attributes}


def download_article(url):
    article = newspaper.Article(url)
    article.download()
    article.parse()
    # print(article.text)
    return convert_article_to_dict(article)


def prepend_prefix(d, prefix):
    return {prefix+k: d[k] for k in d}


# download_article(feed.entries[0].link).images
# prepend_prefix({'a': 1, 'b': 2}, 'rss_')
article = download_article(feed.entries[0].link)
article.keys()

dict_keys(['additional_data', 'article_html', 'authors', 'canonical_link', 'download_exception_msg', 'download_state', 'html', 'images', 'imgs', 'keywords', 'link_hash', 'meta_data', 'meta_description', 'meta_favicon', 'meta_img', 'meta_keywords', 'meta_lang', 'meta_site_name', 'movies', 'publish_date', 'source_url', 'summary', 'tags', 'text', 'title', 'top_image', 'top_img', 'url'])

In [77]:
ARTICLES_TABLE_SCHEMA = """
[
    {
        "mode": "NULLABLE",
        "name": "feed_url",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "downloaded_at",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "url",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "canonical_link",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "source_url",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "link_hash",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "title",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "text",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "html",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "meta_data",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "meta_description",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "meta_img",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "meta_lang",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "meta_site_name",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "meta_favicon",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "meta_keywords",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "publish_date",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "top_image",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "summary",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "authors",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "images",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "movies",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "keywords",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "tags",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "rss_link",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "rss_title",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "rss_author",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "rss_authors",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "rss_summary",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "rss_published",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "rss_tags",
        "type": "STRING"
    },
    {
        "mode": "NULLABLE",
        "name": "rss_raw_data",
        "type": "STRING"
    }
]
"""

def enrich_article_with_rss_data(article, rss_entry):
    article['rss_link'] = str(rss_entry.get('link'))
    article['rss_title'] = str(rss_entry.get('title'))
    article['rss_author'] = str(rss_entry.get('author'))
    article['rss_authors'] = [author['name'] for author in rss_entry.get('authors', [])]
    article['rss_summary'] = str(rss_entry.get('summary'))
    article['rss_published'] = str(rss_entry.get('published'))
    article['rss_tags'] = [tag['term'] for tag in rss_entry.get('tags', [])]
    article['rss_raw_data'] = str(rss_entry)

def build_bq_row(article, table_schema_json, skip_fields=['html']):
    table_schema = json.loads(table_schema_json)
    row = {}
    for field in table_schema:
        if field['name'] in skip_fields:
            continue
        if field['mode'] == 'REPEATED':
            row[field['name']] = list(article.get(field['name'], []))
        elif field['type'] == 'STRING':
            row[field['name']] = str(article.get(field['name']))
        else:
            row[field['name']] = article.get(field['name'])
    return row

In [79]:
rss_entry = feed.entries[0]

article = download_article(rss_entry.link)
enrich_article_with_rss_data(article, rss_entry)
row = build_bq_row(article, ARTICLES_TABLE_SCHEMA, skip_fields=['html'])

for k, v in row.items():
    print(k, type(v))
    

rows = [row]

client = bigquery.Client('newsgraphapp')

create_table_if_not_exists(client, 'test_dataset', 'articles', ARTICLES_TABLE_SCHEMA)

insert_into_table(client, 'test_dataset', 'articles', rows)

feed_url <class 'str'>
downloaded_at <class 'str'>
url <class 'str'>
canonical_link <class 'str'>
source_url <class 'str'>
link_hash <class 'str'>
title <class 'str'>
text <class 'str'>
meta_data <class 'str'>
meta_description <class 'str'>
meta_img <class 'str'>
meta_lang <class 'str'>
meta_site_name <class 'str'>
meta_favicon <class 'str'>
meta_keywords <class 'list'>
publish_date <class 'str'>
top_image <class 'str'>
summary <class 'str'>
authors <class 'list'>
images <class 'list'>
movies <class 'list'>
keywords <class 'list'>
tags <class 'list'>
rss_link <class 'str'>
rss_title <class 'str'>
rss_author <class 'str'>
rss_authors <class 'list'>
rss_summary <class 'str'>
rss_published <class 'str'>
rss_tags <class 'list'>
rss_raw_data <class 'str'>




In [64]:
rows = []
for entry in feed.entries:
    print(entry.link)
    article = download_article(entry.link)
    enrich_article_with_rss_data(article, entry)
    row = build_bq_row(article, ARTICLES_TABLE_SCHEMA, skip_fields=['html'])
    rows.append(row)

https://elpais.com/internacional/2019/07/07/actualidad/1562510044_592540.html#?ref=rss&format=simple&link=link
https://elpais.com/internacional/2019/07/07/actualidad/1562482149_326365.html#?ref=rss&format=simple&link=link
https://elpais.com/elpais/2019/07/04/icon/1562248238_771513.html#?ref=rss&format=simple&link=link
https://elpais.com/politica/2019/07/06/actualidad/1562437441_877497.html#?ref=rss&format=simple&link=link
https://elpais.com/deportes/2019/07/07/actualidad/1562505010_689635.html#?ref=rss&format=simple&link=link
https://elpais.com/internacional/2019/07/06/actualidad/1562438065_962487.html#?ref=rss&format=simple&link=link
https://elpais.com/politica/2019/07/06/actualidad/1562429540_590293.html#?ref=rss&format=simple&link=link
https://elpais.com/sociedad/2019/07/07/actualidad/1562495757_033586.html#?ref=rss&format=simple&link=link
https://elpais.com/internacional/2019/07/06/actualidad/1562424916_024730.html#?ref=rss&format=simple&link=link
https://elpais.com/ccaa/2019/07/06

In [56]:
rows[0].keys()

dict_keys(['url', 'canonical_link', 'source_url', 'link_hash', 'title', 'text', 'meta_data', 'meta_description', 'meta_img', 'meta_lang', 'meta_site_name', 'meta_favicon', 'meta_keywords', 'publish_date', 'top_image', 'summary', 'authors', 'images', 'movies', 'keywords', 'tags', 'rss_link', 'rss_title', 'rss_author', 'rss_authors', 'rss_summary', 'rss_published', 'rss_tags', 'rss_raw_data'])

In [66]:
create_table_if_not_exists(client, 'test_dataset', 'articles', ARTICLES_TABLE_SCHEMA)
insert_into_table(client, 'test_dataset', 'articles', rows)

In [104]:
ARTICLES_DATASET = 'feed'
ARTICLES_TABLE = 'articles'

def download_feed(feed_url):
    # TODO: sacar de bigquery la lista de links que ya descargue en el pasado, y no volver a hacerlo
    # TODO: se queda pillado a veces en el insert_into_table() o en el create_table()?
    now = str(datetime.datetime.now())
    print('downloading rss feed...')
    feed = feedparser.parse(feed_url)
    rows = []
    print('downloading feed entries...')
    for entry in feed.entries:
        print(entry.link)
        article = download_article(entry.link)
        article['meta_data'] = dict(article['meta_data'])
        enrich_article_with_rss_data(article, entry)
        article['downloaded_at'] = now
        article['feed_url'] = feed_url
        row = build_bq_row(article, ARTICLES_TABLE_SCHEMA, skip_fields=['html'])
        rows.append(row)
    print('saving {} rows to bigquery...'.format(len(rows)))
    # create_table_if_not_exists(client, ARTICLES_DATASET, ARTICLES_TABLE, ARTICLES_TABLE_SCHEMA)
    insert_into_table(client, ARTICLES_DATASET, ARTICLES_TABLE, rows)
    print('done')


# feed_url = 'https://elpais.com/tag/rss/latinoamerica/a/'
feed_url = 'http://elmundotoday.com/rss'
download_feed(feed_url)

downloading rss feed...
downloading feed entries...
https://www.elmundotoday.com/2019/07/estos-son-los-cinco-mejores-pisos-de-idealista-que-ya-vienen-amueblados-y-con-familia/
https://www.elmundotoday.com/2019/07/un-lector-consigue-resolver-en-un-minuto-el-ultimo-crucigrama-de-el-pais-desbloquea-el-logro-y-se-hace-con-la-direccion-del-periodico/
https://www.elmundotoday.com/2019/07/que-te-entre-jabon-en-los-ojos-y-otras-cuatro-experiencias-que-solo-puedes-vivir-en-barcelona/
https://www.elmundotoday.com/2019/07/audiencias-antena-3-sustituye-sus-informativos-por-la-serie-big-little-lies/
https://www.elmundotoday.com/2019/07/vox-culpa-al-cm-de-verano-del-reciente-asesinato-de-15-contrincantes-politicos/
https://www.elmundotoday.com/2019/07/comparativa-trabajo-infantil-vs-ropa-cara/
https://www.elmundotoday.com/2019/07/como-cada-ano-cierran-las-webs-de-pornografia-en-verano-porque-hace-demasiado-calor-para-el-sexo/
https://www.elmundotoday.com/2019/07/los-expertos-confirman-que-tras-un-de

In [84]:
str(datetime.datetime.now())

'2019-07-07 23:48:41.511211'