# Data loading

In [7]:

import json
import requests
import time

import pandas as pd
import joblib


### URL Search

In [None]:
# Ask the Arquivo.pt text-search endpoint for the version history of one site.
# `query` is not part of this request; it is only defined here.
query = 'Porto'
websites = 'expresso.pt'
max_items = 2000
from_ = '2021'
to_ = '2022'
link = f"https://arquivo.pt/textsearch?versionHistory={websites}&maxItems={max_items}&dedupValue=1&prettyPrint=true&from={from_}&to={to_}"
# A timeout keeps an unresponsive server from blocking the kernel indefinitely.
f = requests.get(link, timeout=60)
# Stop here on an HTTP error instead of trying to parse an error page as JSON.
f.raise_for_status()
# Parsed JSON payload of the response (not a DataFrame, despite the name).
df = f.json()

In [None]:
# Display the (title, timestamp) pair of every item in the search response.
list(map(lambda hit: (hit['title'], hit['tstamp']), df['response_items']))

### Query Search

In [2]:
# Cities used as search terms by the scraping cells (one set of queries per city).
cities = [
    "Lisboa", "Porto", "Setúbal", "Braga", "Aveiro", "Faro",
    "Leiria", "Santarém", "Coimbra", "Viseu", "Viana do Castelo", "Vila Real",
    "Castelo Branco", "Évora", "Beja", "Guarda", "Bragança", "Portalegre",
]

In [85]:
# Full-text search for `query` on Arquivo.pt, restricted to the sites in `websites`.
query = 'Porto'
# Alternative, wider site list:
#websites = 'expresso.pt,publico.pt,jn.pt,dn.pt,cmjornal.pt,sol.sapo.pt,visao.sapo.pt'
websites = 'nit.pt'
max_items = 2000
from_ = '1996'
to_ = '2022'
link = f"https://arquivo.pt/textsearch?q={query}&siteSearch={websites}&maxItems={max_items}&dedupValue=1&prettyPrint=true&from={from_}&to={to_}"
# A timeout keeps an unresponsive server from blocking the kernel indefinitely.
f = requests.get(link, timeout=60)
# Stop here on an HTTP error instead of trying to parse an error page as JSON.
f.raise_for_status()
# Parsed JSON payload of the response (not a DataFrame, despite the name).
df = f.json()

In [3]:
# Site domains the text search is restricted to: newspaper sites plus two others.
newspapers = [
    'expresso.pt',
    'publico.pt',
    'jn.pt',
    'dn.pt',
    'cmjornal.pt',
    'sol.sapo.pt',
    'visao.sapo.pt',
    'jornaldenegocios.pt',
    'observador.pt',
]
others = [
    'turismodeportugal.pt',
    'nit.pt',
]
# Combined list, newspapers first.
sources = [*newspapers, *others]

In [4]:

# Sequential scrape: for every city and every year, query the Arquivo.pt text
# search restricted to `sources`, download the extracted text of each returned
# item, and collect one row per article. The result is written to
# ../data/scraping_data.csv.
max_items = 2000
from_ = 1996
to_ = 2023

columns=['city', 'title', 'content', 'year', 'tstamp', 'link']

# Rows are collected in a plain list and converted to a DataFrame once at the
# end; concatenating a new one-row frame per article copies the whole table on
# every iteration.
records = []
for city in cities:
    for year in range(from_, to_):

        # Passing the query as `params` lets requests build and encode the URL.
        # The previous f-string used a backslash continuation inside the string
        # literal, which put the next line's indentation into the URL right
        # after the siteSearch value.
        params = {
            'q': city,
            'siteSearch': ','.join(sources),
            'maxItems': max_items,
            'dedupValue': 1,
            'prettyPrint': 'true',
            'from': str(year),
            'to': str(year + 1),
        }

        try:
            # Timeouts turn a stalled connection into an exception handled below.
            r = requests.get('https://arquivo.pt/textsearch', params=params, timeout=60)
            r.raise_for_status()
            payload = r.json()['response_items']

            for article in payload:

                # Pause before each article download.
                time.sleep(1)

                records.append({
                    'city': city,
                    'title': article['title'],
                    'content': requests.get(article['linkToExtractedText'], timeout=60).text,
                    'year': year,
                    'tstamp': article['tstamp'],
                    'link': article['linkToArchive'],
                })

        except Exception:
            # Best effort: any failure for this city/year is recorded as a
            # placeholder row and the scrape moves on to the next year. The
            # handler no longer references the loop index, which was unbound
            # when the search request itself failed before the inner loop ran.
            records.append({
                'city': city, 'title': None, 'content': None,
                'year': year, 'tstamp': None, 'link': None,
            })

data = pd.DataFrame(records, columns=columns)
data.to_csv('../data/scraping_data.csv', index=False)

In [8]:
# Load the scraped articles back from the CSV export.
data = pd.read_csv(
    filepath_or_buffer='../data/scraping_data.csv',
)

In [9]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,city,title,content,year,tstamp,link
0,Lisboa,POL | Local Lisboa,POL | Local Lisboa SECÇÕES 1ª Página Destaque ...,1999,19991111042737,https://arquivo.pt/wayback/19991111042737/http...
1,Lisboa,JN Editorial - Text57,JN Editorial - Text57 26 milhões para dar casa...,1999,19990822002536,https://arquivo.pt/wayback/19990822002536/http...
2,Lisboa,Outras Paginas,Outras Paginas 11 de Novembro de 1999 Igreja d...,1999,19991117215651,https://arquivo.pt/wayback/19991117215651/http...
3,Lisboa,JN Editorial - Texult1,JN Editorial - Texult1 Macau: Rão Kyao é o aut...,1999,19991118004529,https://arquivo.pt/wayback/19991118004529/http...
4,Lisboa,PÚBLICONLINE-Os Destaques da Primeira Página,PÚBLICONLINE-Os Destaques da Primeira Página S...,1999,19991012235908,https://arquivo.pt/wayback/19991012235908/http...


In [11]:
# Also keep a gzip-compressed copy of the table. index=False matches the
# uncompressed export, so the row index is not written as an extra unnamed column.
data.to_csv('../data/scraping_data.csv.gz', compression='gzip', index=False)

### Parallelism

In [4]:
def article_scrapper(city, sources, max_items=10, from_=1996, to_=2023):
    """Scrape Arquivo.pt articles for one city and save them to a per-city CSV.

    For every year in ``range(from_, to_)`` the Arquivo.pt text search is
    queried for ``city`` restricted to ``sources``. The extracted text of each
    returned item is then downloaded, with a one-second pause before each
    download.

    Parameters
    ----------
    city : str
        Search term; also used in the name of the output file.
    sources : iterable of str
        Site domains, joined with commas into the ``siteSearch`` parameter.
    max_items : int, default 10
        Value sent as ``maxItems`` with every search request.
    from_ : int, default 1996
        First year to query.
    to_ : int, default 2023
        End of the year range (exclusive).

    Returns
    -------
    pandas.DataFrame
        One row per article with columns ``city``, ``title``, ``content``,
        ``year``, ``tstamp`` and ``link``. When scraping a year raises, a
        placeholder row holding only ``city`` and ``year`` is added and the
        remaining items of that year are skipped. The same table is written to
        ``../data/scraping_data_<city>.csv``.
    """
    columns = ['city', 'title', 'content', 'year', 'tstamp', 'link']

    # Rows are collected in a list and converted to a DataFrame once at the
    # end; concatenating a one-row frame per article copies the whole table on
    # every iteration.
    records = []
    for year in range(from_, to_):

        # Passing the query as `params` lets requests build and encode the URL.
        # The previous f-string used a backslash continuation inside the string
        # literal, which put the next line's indentation into the URL right
        # after the siteSearch value.
        params = {
            'q': city,
            'siteSearch': ','.join(sources),
            'maxItems': max_items,
            'dedupValue': 1,
            'prettyPrint': 'true',
            'from': str(year),
            'to': str(year + 1),
        }

        try:
            # Timeouts turn a stalled connection into an exception handled below.
            r = requests.get('https://arquivo.pt/textsearch', params=params, timeout=60)
            r.raise_for_status()
            payload = r.json()['response_items']

            for article in payload:

                # Pause before each article download.
                time.sleep(1)

                records.append({
                    'city': city,
                    'title': article['title'],
                    'content': requests.get(article['linkToExtractedText'], timeout=60).text,
                    'year': year,
                    'tstamp': article['tstamp'],
                    'link': article['linkToArchive'],
                })

        except Exception:
            # Best effort: any failure for this year is recorded as a
            # placeholder row and the scrape continues with the next year. The
            # handler no longer references the loop index, which was unbound
            # when the search request itself failed before the inner loop ran.
            records.append({
                'city': city, 'title': None, 'content': None,
                'year': year, 'tstamp': None, 'link': None,
            })

    data = pd.DataFrame(records, columns=columns)

    # One file per city: this function is run for many cities in parallel, and
    # a single shared path would keep only the output of whichever worker
    # wrote last.
    data.to_csv(f'../data/scraping_data_{city}.csv', index=False)
    return data


In [5]:
# Run article_scrapper once per city through joblib, with n_jobs=-1.
joblib.Parallel(n_jobs=-1)(
    [joblib.delayed(article_scrapper)(city=name, sources=sources) for name in cities]
)

In [5]:
# Load the scraped articles from the CSV export.
data = pd.read_csv(
    filepath_or_buffer='../data/scraping_data.csv',
)

In [6]:
# Number of rows per city, most frequent first.
data.loc[:, 'city'].value_counts()

Lisboa              5363
Porto               3835
Coimbra             2080
Braga               2047
Guarda              1969
Setúbal             1792
Aveiro              1471
Beja                1403
Leiria              1344
Bragança            1215
Faro                1208
Viseu               1188
Vila Real           1104
Santarém            1072
Castelo Branco       958
Viana do Castelo     943
Évora                908
Portalegre           859
Name: city, dtype: int64