# New

In [10]:
import re
import json
import codecs
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from tqdm import tqdm


with codecs.open("deepmind.html", 'r', encoding='utf-8') as f:
    deepmind = f.read()
    
soup = BeautifulSoup(deepmind, 'html.parser')
links = sorted(list(set(re.findall(r'/blog/[_\sa-zA-Z0-9/-]+', str(soup)))))    
main_url = 'https://deepmind.com'

In [20]:
results = defaultdict(dict)

for ix, link in enumerate(tqdm(links)):
    html = requests.get(main_url + link).text
    soup = BeautifulSoup(html, 'html.parser')
    results[ix]['title'] = soup.title.text.split('|')[0].strip()
    results[ix]['date'] = soup.find(class_='caption date').text
    if soup.find(class_='authors-list') is not None:
        results[ix]['authors'] = [
            ul.find(class_='author-name small ng-star-inserted').text.strip()
            for ix, ul in enumerate(soup.find(class_='authors-list')) if ix > 0
        ]
    else:
        results[ix]['authors'] = []
    tags = soup.find(class_='tags')
    if tags is not None:
        results[ix]['tags'] = [tag.text.strip() for tag in tags.find_all('li')]
    else:
        results[ix]['tags'] = []
    results[ix]['type'] = link.split('/')[-2]
    results[ix]['url'] = main_url + link

100%|████████████████████████████████████████| 125/125 [01:40<00:00,  1.25it/s]


In [47]:
import numpy as np
import pandas as pd

In [48]:
df = pd.DataFrame([
    [val['title'] for val in results.values()],
    [val['date'] for val in results.values()],
    [val['type'] for val in results.values()],
    [val['url'] for val in results.values()],
]).T
df.columns = ['title', 'date', 'type', 'url']

In [70]:
nums = len(list(set([tag for val in results.values() for tag in val['tags']])))
cols = list(set([tag for val in results.values() for tag in val['tags']]))

tags = pd.DataFrame(np.zeros((len(df), nums)), columns=cols)

for ix, ts in enumerate([val['tags'] for val in results.values()]):
    for t in ts:
        tags.loc[ix, t] = 1

In [77]:
pd.concat([df, tags.replace({1.:'O', 0.:''})], axis=1)

Unnamed: 0,title,date,type,url,Continual & transfer learning,Multi-agent learning,Sciences,Abstraction & concepts,Meta-Learning,Tensorflow,...,Representation learning,Environments,Vision,Self-supervised learning,Theory & foundations,Visualization,Simulations,Imitation,Safety,Performance
0,Retour à Paris / A return to Paris,29 Mar 2018,announcements,https://deepmind.com/blog/announcements/a-retu...,,,,,,,...,,,,,,,,,,
1,Announcing DeepMind Health research partnershi...,05 Jul 2016,announcements,https://deepmind.com/blog/announcements/announ...,,,,,,,...,,,,,,,,,,
2,Announcing the Partnership on AI to Benefit Pe...,28 Sep 2016,announcements,https://deepmind.com/blog/announcements/announ...,,,,,,,...,,,,,,,,,,
3,Applying machine learning to mammography scree...,24 Nov 2017,announcements,https://deepmind.com/blog/announcements/applyi...,,,,,,,...,,,,,,,,,,
4,Applying machine learning to radiotherapy plan...,30 Aug 2016,announcements,https://deepmind.com/blog/announcements/applyi...,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,Unsupervised learning: The curious pupil,25 Jun 2019,article,https://deepmind.com/blog/article/unsupervised...,,,,,,,...,,,,,,,,,,
121,Using machine learning to accelerate ecologica...,08 Aug 2019,article,https://deepmind.com/blog/article/using-machin...,,,,,,,...,,,,,,,,,,
122,WaveNet: A generative model for raw audio,08 Sep 2016,article,https://deepmind.com/blog/article/wavenet-gene...,,,,,,,...,,,,,,,,,,
123,WaveNet launches in the Google Assistant,04 Oct 2017,article,https://deepmind.com/blog/article/wavenet-laun...,,,,,,,...,,,,,,,,,,


In [84]:
df.date.map(pd.to_datetime)

0     2018-03-29
1     2016-07-05
2     2016-09-28
3     2017-11-24
4     2016-08-30
         ...    
120   2019-06-25
121   2019-08-08
122   2016-09-08
123   2017-10-04
124   2019-08-16
Name: date, Length: 125, dtype: datetime64[ns]

In [73]:
pd.concat([df, tags.replace({1.:'O', 0.:''})], axis=1).to_excel('DeepMindBlogList.xlsx')

In [76]:
with open('DeepMindBlogList.json', 'w') as f:
    json.dump(results, f)