In [1]:
import requests
import json
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

We can connect to public APIs and download data. This one corresponds to the international station.

In [2]:
response = requests.get('http://api.open-notify.org/iss-now.json')
response.text

'{"timestamp": 1654017813, "iss_position": {"latitude": "-51.4371", "longitude": "171.4141"}, "message": "success"}'

We can convert a json-formatted string such as the one we get in the response into a Python object with the json library.

In [3]:
response = json.loads(response.text)
response

{'timestamp': 1654017813,
 'iss_position': {'latitude': '-51.4371', 'longitude': '171.4141'},
 'message': 'success'}

In [4]:
response['iss_position']['latitude']

'-51.4371'

In [5]:
response['iss_position']['longitude']

'171.4141'

We also can go in the other direction and generate json-formatted strings from Python objects:

In [6]:
profs = [{'name': 'Dani'}, {'name': 'Toni'}]
json.dumps(profs)

'[{"name": "Dani"}, {"name": "Toni"}]'

In [7]:
response = requests.get('https://api.agify.io?name=jorge')
response.text

'{"name":"jorge","age":62,"count":114531}'

In [8]:
def age(name='Grandma'):
    response = requests.get('https://api.agify.io?name='+name)
    json_age = json.loads(response.text)['age']
    return f'{name} is {json_age} years old.'

In [9]:
age()

'Grandma is 72 years old.'

In [10]:
parameters = {'name': 'Jorge', 'country_id': 'ES'}

In [11]:
response = requests.get('https://api.agify.io?name=jorge', params=parameters)
response.text

'[{"name":"jorge","age":46,"count":36357,"country_id":"ES"},{"name":"Jorge","age":46,"count":36357,"country_id":"ES"}]'

In order to read web pages (html) we need to use BeautifulSoup library.

## Exercise

Get all the articles on the front page os elpais.com (title and URL)

Store it in a csv file.

In [14]:
response = requests.get('https://elpais.com')
soup = BeautifulSoup(response.text)
articles = soup.findAll('article')

titulares = []
enlaces = []
for a in articles:
    # first we find all links on the front page
    link = a.find('a', href=True)
    enlaces.append(link['href'])
    
    # second we get the text from the respective class
    titulo = a.find('h2', attrs={'class': 'c_t'})
    titulares.append(titulo.text)

    # after appending it to empty lists we just need to store it in a csv.


df = pd.DataFrame({'Titular':titulares, 'URL': enlaces})

In [15]:
df.to_csv('elpais.csv', index=False)

### BONUS EXERCISE

Create a field in the csv that specifies if the article is premium or not

In [33]:
response = requests.get('https://elpais.com')
soup = BeautifulSoup(response.text)
articles = soup.findAll('article')


titulares = []
enlaces = []
premium = []
for a in articles:
    # first we find all links on the front page
    link = a.find('a', href=True)
    enlaces.append(link['href'])
    
    # second we get the text from the respective class
    titulo = a.find('h2', attrs={'class': 'c_t'})
    titulares.append(titulo.text)
 
    # third, we look for a subcategory within the articles to determine if it is premium or not.
    vip = a.find('span', attrs={'aria-label': 'Exclusivo suscriptores'})
    premium.append(bool(vip))


# after appending it to empty lists we just need to store it in a csv.

df = pd.DataFrame({'Titular':titulares, 'URL': enlaces, 'Premium': premium})
df.to_csv('elpais.csv', index=False)

In [37]:
df.head(12)

Unnamed: 0,Titular,URL,Premium
0,Sánchez se abre a prorrogar tres meses las ayu...,/espana/2022-05-31/sanchez-apunta-que-prorroga...,False
1,La Unión Europea pacta un rearme conjunto ante...,/internacional/2022-05-31/la-union-europea-pac...,False
2,El sexto golpe de las sanciones de la UE llega...,/internacional/2022-05-31/el-sexto-golpe-de-la...,True
3,Alcaraz gana el tercer ‘set’ del partido y rec...,/deportes/2022-05-31/directo-zverev-alcaraz-en...,False
4,Nadal contra Djokovic y sus dinámicas invertid...,/deportes/2022-05-31/nadal-contra-djokovic-y-s...,False
5,No pensar en lo que no se puede controlar,/deportes/2022-05-31/no-pensar-en-lo-que-no-se...,True
6,Código de honor y barbarie,https://elpais.com/opinion/editoriales/,False
7,Sumar gente o sumar partidos,/opinion/2022-05-31/sumar-gente-o-sumar-partid...,True
8,Hasta los dientes,/opinion/2022-05-31/hasta-los-dientes.html,True
9,"14 Champions, 140 leyes",/opinion/2022-05-31/14-champions-140-leyes.html,True


In [11]:
from selenium import webdriver
import time

In [6]:
driver = webdriver.Chrome(r'C:\Users\Usuario\Desktop\Master DATA SCIENCE\06 APIs y web scrapping con Python\chromedriver_win32\chromedriver.exe')

# webdriver.Firefox(executable_path=r'C:\Users\Usuario\Desktop\Master DATA SCIENCE\06 APIs y web scrapping con Python\geckodriver-v0.31.0-win32\geckodriver.exe')

In [7]:
driver.get('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts')

In [10]:
button = driver.find_element_by_class_name('btn-load-more')
button.click()

In [13]:
for _ in range(5):
    button = driver.find_element_by_class_name('btn-load-more')
    button.click()
    time.sleep(5)

We look for every element in the class that fits best the container of the data we want to scrap.

In [17]:
alert = driver.find_element_by_class_name('content-details')
alert.find_element_by_tag_name('h2').text

AttributeError: 'list' object has no attribute 'find_element_by_tag_name'

In [None]:
alert.find_element_by_tag_name('a').get_property('href')

# Annex: ultra easy scraping with pandas!

When the data we want is already formatted as a table, we can do it even more easily! Just use `pandas.read_html`:

#### Exercise:

Extract the date of the worst aviation disaster from: https://en.wikipedia.org/wiki/List_of_accidents_and_disasters_by_death_toll

Prerequisites: pandas, pd.read_html

In [None]:
import pandas as pd

tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_accidents_and_disasters_by_death_toll')
len(tables)

In [None]:
tables[2]

#### Exercise

Using Selenium, get the link to the video for Pasapalabra's "Rosco" of 2019-08-30.

https://www.telecinco.es/pasapalabra/

In [47]:
driver = webdriver.Chrome(r'C:\Users\Usuario\Desktop\Master DATA SCIENCE\06 APIs y web scrapping con Python\chromedriver_win32\chromedriver.exe')
driver.get('https://www.telecinco.es/pasapalabra/')

One way to do it is the following.
We set a range long enaugh for it to get to the page we look for.
and compare dates in the elements of the page until we find the one we want and exit the loop.

In [None]:
ref_date = "30/08/2019"
for _ in range(1):
    date = driver.find_elements_by_class_name('cards__postcard__content-1w21 ')
    for element in date:
        if ref_date == element.text[0:10]:
            link = element.find_element('a')
            break
        else:
            pass
    button = driver.find_element_by_class_name('pagination__pagination_viewmore-z9ko')
    button.click()        
    time.sleep(5)

-----------

Other way is to define a function that looks for the date and start a while loop with it.

In [None]:
def is_date_present(date, driver):

    date_texts = [element.text for element in driver.find_elements_by_class_name('cards__postcard__date_time-3Ach')]
    
    return any([date in text for text in date_texts])

is_date_present('17/09/1955', driver)

In [None]:
while not is_date_present('30/08/2019', driver):
    button = driver.find_element_by_class_name('pagination__pagination_viewmore-z9ko')
    button.click()