# Trabalho CPA - Webscraping


### Integrantes: 
- Andresssa Moreira ->  19204096
- João Pedro de Moura -> 21100803

## Importação das bibliotecas


In [49]:
from bs4 import BeautifulSoup
import requests
import time
import re

import pandas as pd
import numpy as np

import hashlib

In [50]:
def download(url, user_agent='wswp', num_retries=2, proxies=None):
    """Download a given URL and return the page content.

    args:
        url (str): URL
    kwargs:
        user_agent (str): user agent (default: wswp)
        num_retries (int): # of retries if a 5xx error is seen (default: 2)
        proxies (dict): proxy dict w/ keys 'http' and 'https', values
                        are strs (i.e. 'http(s)://IP') (default: None)
    returns:
        The response body as a str, or None when the request raised a
        ``requests`` exception or the final response had a status >= 400.
    """
    print('Downloading:', url)
    headers = {'User-Agent': user_agent}
    try:
        resp = requests.get(url, headers=headers, proxies=proxies)
        html = resp.text
    except requests.exceptions.RequestException as e:
        print('Download error:', e)
        return None

    if resp.status_code < 400:
        return html

    print('Download error:', resp.text)
    if num_retries and 500 <= resp.status_code < 600:
        # Retry 5xx errors. Everything is forwarded by keyword: passing the
        # counter positionally would land in ``user_agent`` and reset
        # ``num_retries`` to its default on every retry.
        return download(
            url,
            user_agent=user_agent,
            num_retries=num_retries - 1,
            proxies=proxies,
        )
    return None

In [51]:
def save_html(html, name):
    """Save a page's HTML to ``htmls/<name>.html``.

    The file is written as UTF-8 so that it can be read back with
    ``encoding='utf-8'`` regardless of the platform's default encoding.
    The ``htmls`` directory is created when it does not exist yet.

    args:
        html (str): page content to store
        name: identifier used as the file name (converted with ``str``)
    """
    import os

    os.makedirs('htmls', exist_ok=True)
    file_name = f'htmls/{str(name)}.html'
    # ``with`` guarantees the handle is closed even if the write fails.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(html)

In [52]:
def save_in_csv(colunas, D, lista, csv_path='tarefa1.csv'):
    """Write the collected values to a CSV file, one row per entry of ``D``.

    args:
        colunas (list): column headers, one per name in ``lista``
        D (dict): maps each scraped item to a dict of
                  ``field name -> [label, value]``; only the value
                  (index 1) is written
        lista (list): field names, in the order the columns are written
    kwargs:
        csv_path (str): destination file (default: tarefa1.csv)
    """
    # A field missing from an item becomes an empty cell instead of being
    # dropped, so the remaining values stay under the right column.
    linhas = [
        [campos[nome][1] if nome in campos else None for nome in lista]
        for campos in D.values()
    ]

    # Building the frame row-wise with explicit columns avoids the transpose
    # and the ``set_axis(..., inplace=True)`` call, which pandas 2 rejects.
    df = pd.DataFrame(linhas, columns=colunas)
    df.to_csv(csv_path, index=False)
    print('csv is done.')

In [53]:
def pega_paises():
    """Return the list of country page URLs found in the site's sitemap.

    The sitemap is fetched with ``download`` and every ``<loc>`` entry is
    extracted. When the download fails (``download`` returns None) an empty
    list is returned instead of raising inside ``re.findall``.
    """
    downloaded = download('http://127.0.0.1:8000/places/default/sitemap.xml')
    if downloaded is None:
        return []
    return re.findall('<loc>(.*?)</loc>', downloaded)
# Notebook cell call: display the URLs extracted from the sitemap.
pega_paises()

Downloading: http://127.0.0.1:8000/places/default/sitemap.xml


['http://127.0.0.1:8000/places/default/view/Afghanistan-1',
 'http://127.0.0.1:8000/places/default/view/Aland-Islands-2',
 'http://127.0.0.1:8000/places/default/view/Albania-3',
 'http://127.0.0.1:8000/places/default/view/Algeria-4',
 'http://127.0.0.1:8000/places/default/view/American-Samoa-5',
 'http://127.0.0.1:8000/places/default/view/Andorra-6',
 'http://127.0.0.1:8000/places/default/view/Angola-7',
 'http://127.0.0.1:8000/places/default/view/Anguilla-8',
 'http://127.0.0.1:8000/places/default/view/Antarctica-9',
 'http://127.0.0.1:8000/places/default/view/Antigua-and-Barbuda-10',
 'http://127.0.0.1:8000/places/default/view/Argentina-11',
 'http://127.0.0.1:8000/places/default/view/Armenia-12',
 'http://127.0.0.1:8000/places/default/view/Aruba-13',
 'http://127.0.0.1:8000/places/default/view/Australia-14',
 'http://127.0.0.1:8000/places/default/view/Austria-15',
 'http://127.0.0.1:8000/places/default/view/Azerbaijan-16',
 'http://127.0.0.1:8000/places/default/view/Bahamas-17',
 'h

In [54]:
def webscrap():
    """Scrape every country page, save its HTML and write the data to the CSV.

    For each URL returned by ``pega_paises`` the page is downloaded, stored
    with ``save_html`` under its position in the list, and the table rows
    named in ``lista_elementos`` are parsed. A ``time_stamp`` entry holding
    the local time of the scrape is added per country, and everything is
    handed to ``save_in_csv``.

    raises:
        RuntimeError: if a country page cannot be downloaded. Saved HTML
            files and CSV rows are both identified by the page's position
            in the sitemap, so a page cannot simply be skipped.
    """
    print('FAZENDO O DOWNLOAD...')

    all_data = {}  # url -> {row id: [label, value]}; later written to the CSV
    lista_colunas = []  # CSV column names, taken from the first page's labels

    # ids of the table rows that are extracted from each page
    lista_elementos = ['places_national_flag__row',
                    'places_area__row',
                    'places_population__row',
                    'places_iso__row',
                    'places_country__row',
                    'places_capital__row',
                    'places_continent__row',
                    'places_tld__row',
                    'places_currency_code__row',
                    'places_currency_name__row',
                    'places_phone__row',
                    'places_postal_code_format__row',
                    'places_postal_code_regex__row',
                    'places_languages__row',
                    'places_neighbours__row'
                    ]

    # rows whose value is read from the text of <a> links
    linhas_com_links = ('places_neighbours__row', 'places_continent__row')

    for i, url in enumerate(pega_paises()):
        pais = download(url)
        if pais is None:
            raise RuntimeError(f'Could not download {url}; aborting the scrape.')

        # the saved file is named after the page's position in the sitemap
        save_html(pais, i)

        # parse once per page instead of once per field
        soup = BeautifulSoup(pais, 'html.parser')
        campos = {}

        for l in lista_elementos:
            target = soup.find(id=l)
            readonly = target.find(class_='readonly')
            w2p_fw = target.find(class_='w2p_fw')

            # column names only need to be collected once
            if i == 0:
                readonly_regex = re.findall(r'([A-Za-z]*):\s<\/label>', str(readonly))
                lista_colunas.append(''.join(readonly_regex))

            # pick the regex that matches this row's markup; this must be a
            # single if/elif/else chain, otherwise the final else overwrites
            # the result of the link-based branch
            if l in linhas_com_links:
                w2p_fw_regex = re.findall(r'([A-Z]*.)<\/a>', str(w2p_fw))
            elif l == 'places_national_flag__row':
                w2p_fw_regex = re.findall(r'img src=(".*?")', str(w2p_fw))
            else:
                w2p_fw_regex = re.findall(r'w2p_fw">(.*?)<\/td>', str(w2p_fw))

            campos[l] = [readonly.get_text(), ''.join(w2p_fw_regex)]

        # moment the data was obtained, as day/month/year hour:minute
        # (numbers are not zero-padded)
        agora = time.localtime()
        time_stamp = (f'{agora.tm_mday}/{agora.tm_mon}/{agora.tm_year} '
                      f'{agora.tm_hour}:{agora.tm_min}')
        campos['time_stamp'] = ['time_stamp: ', time_stamp]

        # keyed by URL rather than by the full HTML text
        all_data[url] = campos

    lista_elementos.append('time_stamp')
    colunas = lista_colunas + ['Time_stamp']

    # create the CSV from the column names, the collected data and the list
    # of fields to write for each country
    save_in_csv(colunas, all_data, lista_elementos)
# Notebook cell call: run the full scrape (downloads every page, writes the CSV).
webscrap()

FAZENDO O DOWNLOAD...
Downloading: http://127.0.0.1:8000/places/default/sitemap.xml
Downloading: http://127.0.0.1:8000/places/default/view/Afghanistan-1
Downloading: http://127.0.0.1:8000/places/default/view/Aland-Islands-2
Downloading: http://127.0.0.1:8000/places/default/view/Albania-3
Downloading: http://127.0.0.1:8000/places/default/view/Algeria-4
Downloading: http://127.0.0.1:8000/places/default/view/American-Samoa-5
Downloading: http://127.0.0.1:8000/places/default/view/Andorra-6
Downloading: http://127.0.0.1:8000/places/default/view/Angola-7
Downloading: http://127.0.0.1:8000/places/default/view/Anguilla-8
Downloading: http://127.0.0.1:8000/places/default/view/Antarctica-9
Downloading: http://127.0.0.1:8000/places/default/view/Antigua-and-Barbuda-10
Downloading: http://127.0.0.1:8000/places/default/view/Argentina-11
Downloading: http://127.0.0.1:8000/places/default/view/Armenia-12
Downloading: http://127.0.0.1:8000/places/default/view/Aruba-13
Downloading: http://127.0.0.1:8000/

In [55]:
def monitoring_page(intervalo=0):
    """Watch the country pages forever and refresh the CSV when one changes.

    Each page listed in the sitemap is downloaded again and its SHA-224 hash
    is compared with the hash of the copy stored in ``htmls/<index>.html``.
    When they differ, the new HTML is saved, the fields are re-extracted and
    the row at the same index of ``tarefa1.csv`` is replaced, together with a
    fresh time stamp.

    kwargs:
        intervalo (int | float): seconds to sleep after each full pass over
            the pages (default: 0, i.e. no pause)
    """
    lista_elem = ['places_national_flag__row',
                    'places_area__row',
                    'places_population__row',
                    'places_iso__row',
                    'places_country__row',
                    'places_capital__row',
                    'places_continent__row',
                    'places_tld__row',
                    'places_currency_code__row',
                    'places_currency_name__row',
                    'places_phone__row',
                    'places_postal_code_format__row',
                    'places_postal_code_regex__row',
                    'places_languages__row',
                    'places_neighbours__row'
                    ]

    # rows whose value is read from the text of <a> links
    linhas_com_links = ('places_neighbours__row', 'places_continent__row')

    downloaded = download('http://127.0.0.1:8000/places/default/sitemap.xml')
    if downloaded is None:
        print('Sitemap download failed; nothing to monitor.')
        return
    paises = re.findall('<loc>(.*?)</loc>', downloaded)
    if not paises:
        print('No pages listed in the sitemap; nothing to monitor.')
        return

    while True:
        for i, pais in enumerate(paises):
            # previously saved copy; ``with`` closes the handle on every pass
            with open(f'htmls/{str(i)}.html', 'r', encoding='utf-8') as arquivo:
                html_antigo = arquivo.read()
            hash_antigo = hashlib.sha224(html_antigo.encode('utf-8')).hexdigest()

            html_novo = download(pais)
            if html_novo is None:
                # download() already printed the error; try again next pass
                continue
            hash_novo = hashlib.sha224(html_novo.encode('utf-8')).hexdigest()

            if hash_antigo == hash_novo:
                print('Nenhuma mudança detectada.')
                continue

            print('Mudança detectada. Novo HTML salvo.')
            save_html(html_novo, i)

            # re-extract every field from the updated page
            soup = BeautifulSoup(html_novo, 'html.parser')
            dados = []
            for l in lista_elem:
                target = soup.find(id=l)
                w2p_fw = target.find(class_='w2p_fw')
                # single if/elif/else chain so the link-based result is not
                # overwritten by the generic branch
                if l in linhas_com_links:
                    w2p_fw_regex = re.findall(r'([A-Z]*.)<\/a>', str(w2p_fw))
                elif l == 'places_national_flag__row':
                    w2p_fw_regex = re.findall(r'img src=(".*?")', str(w2p_fw))
                else:
                    w2p_fw_regex = re.findall(r'w2p_fw">(.*?)<\/td>', str(w2p_fw))
                dados.append(''.join(w2p_fw_regex))

            # new time stamp, day/month/year hour:minute without zero padding
            agora = time.localtime()
            dados.append(f'{agora.tm_mday}/{agora.tm_mon}/{agora.tm_year} '
                         f'{agora.tm_hour}:{agora.tm_min}')

            # Read every cell as plain text and keep strings such as "NA"
            # as-is; the default parsing would turn them into NaN and they
            # would be written back as empty cells.
            df = pd.read_csv('tarefa1.csv', dtype=str, keep_default_na=False)
            df.iloc[i] = dados
            df.to_csv('tarefa1.csv', index=False)
            print('CSV atualizado.')

        if intervalo:
            time.sleep(intervalo)
            
# Notebook cell call: start monitoring. The function polls in an endless
# loop, so stop it by interrupting the kernel.
monitoring_page()

Downloading: http://127.0.0.1:8000/places/default/sitemap.xml
Downloading: http://127.0.0.1:8000/places/default/view/Afghanistan-1
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/Aland-Islands-2
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/Albania-3
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/Algeria-4
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/American-Samoa-5
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/Andorra-6
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/Angola-7
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/Anguilla-8
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/Antarctica-9
Nenhuma mudança detectada.
Downloading: http://127.0.0.1:8000/places/default/view/Antigua-and-Barbuda-10
Nenhuma muda

KeyboardInterrupt: 