In [1]:
import os
import requests
import pandas as pd
from urllib.parse import urlparse   # Para parsear las URL 
import time
from datetime import datetime
import random
from time import sleep
import json

from selenium import webdriver
from selenium.webdriver.common.by import By
# Automatiza la búsqueda e instalación de la versión de driver correcta para el navegador disponible en el OS:
from webdriver_manager.firefox import GeckoDriverManager  # https://pypi.org/project/webdriver-manager/
from selenium.webdriver.firefox.options import Options as FirefoxOptions  # Para poder configurar las opciones de Geckdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException  # Import the exception

In [2]:
# Define the URL for the API call
url = "http://web.archive.org/cdx/search/cdx?url=ground.news*&output=txt"

# Make the API call
response = requests.get(url)

# Las líneas de texto de la respuesta son como estas:
# news,ground)/ 20180430191221 https://ground.news/ text/html 200 YMWQUN4TK55LBNTMWOFSIWHOG2S5YR4Q 14044
# news,ground)/ 20180612183329 https://ground.news/ text/html 200 SBAWULRK6FFOHBHBKVVPSR62Q2W5MHNO 14120

In [3]:
if response.status_code == 200:
    # Successfully accessed, split the text into lines based on the newline character
    lines = response.text.split('\n')
    
    # Initialize an empty list to hold each row's data
    rows = []

    # Process each line in the lines list
    for line in lines:
        # Skip empty lines to avoid creating empty rows in the DataFrame
        if not line.strip():
            continue

        # Split the line into components based on whitespace
        components = line.split()
        
        # Add the components as a new row in the rows list
        rows.append(components)

    # Create a DataFrame from the list of rows
    df = pd.DataFrame(rows, columns=["Descript", "Timestamp", "URL", "Content_type", "Status", "Checksum", "Length"])
    
    # Save the DataFrame to a CSV file
    df.to_csv("wayback_snapshots.csv", index=False)
    print("Data saved to wayback_snapshots.csv")
else:
    print("Failed to fetch data from the API")

Data saved to wayback_snapshots.csv


In [4]:
# Para cargarlo de nuevo posteriormente a partir de fichero:

# Load the CSV file into a DataFrame
df = pd.read_csv("wayback_snapshots.csv")


In [5]:
df

Unnamed: 0,Descript,Timestamp,URL,Content_type,Status,Checksum,Length
0,"news,ground)/",20180430191221,https://ground.news/,text/html,200,YMWQUN4TK55LBNTMWOFSIWHOG2S5YR4Q,14044
1,"news,ground)/",20180612183329,https://ground.news/,text/html,200,SBAWULRK6FFOHBHBKVVPSR62Q2W5MHNO,14120
2,"news,ground)/",20180612184710,https://ground.news/,warc/revisit,-,SBAWULRK6FFOHBHBKVVPSR62Q2W5MHNO,758
3,"news,ground)/",20180717061218,https://ground.news/,text/html,200,GKX7Q7WBYEUDWCQCEQN2OSFMYZGFUANN,14127
4,"news,ground)/",20180728142351,https://ground.news/,text/html,200,2LMX765WKYAPDLYPKEKY3CZE5V6KMFVF,14137
...,...,...,...,...,...,...,...
869879,"news,ground)/year-in-review/2022/ground",20240216230442,https://ground.news/year-in-review/2022/Ground,text/html,200,55TWNCOVODTZEFUUROKLLVFOPNG65BMP,6371
869880,"news,ground)/year-in-review/2022/ground?utm_ca...",20221221233411,https://ground.news/year-in-review/2022/Ground...,text/html,200,CK4C64W6NI6YZ5GXRHNOVWVVJRM743HO,6922
869881,"news,ground)/ziroth",20230527002613,https://ground.news/ziroth,unk,307,3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ,643
869882,"news,ground)/zuby",20210426092427,https://ground.news/zuby,unk,307,3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ,336


La primera columna esencialmente es una especie de reformulación de la URL; la eliminamos por superflua.

In [6]:
df = df.drop("Descript", axis = 1)

Ordenamos la tabla de forma creciente según el "Timestamp" (que parece seguir el formato yyyymmddhhmmss). Es decir, ordenaremos de más antiguo a más reciente.

In [7]:
df = df.sort_values(by="Timestamp")
df.reset_index(drop=True, inplace=True)  # Reseteamos el índice.
df

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length
0,20180430191215,https://ground.news/robots.txt,text/plain,200,CZML653JDKEHW3ESO53XY3PDCRH7K3NR,802
1,20180430191221,https://ground.news/,text/html,200,YMWQUN4TK55LBNTMWOFSIWHOG2S5YR4Q,14044
2,20180527061137,https://ground.news/robots.txt,text/plain,200,JZBJEQF6DBCMXULI5RQTV6LFS7SKDPIW,794
3,20180527061141,https://ground.news/home?format=RSS,application/rss+xml,200,ANX3LBVN33EHY7T7XWME4RNPWT2KN4EQ,1130
4,20180527062400,https://ground.news/favicon.ico,image/png,200,A3NNN42JV3JACYSFOTDTSDKHTL5P3XSE,4853
...,...,...,...,...,...,...
869879,20240301102607,https://ground.news/_next/static/chunks/pages/...,application/javascript,200,5ATGPG34RMX7PSZHHXLM6G3LSNMAD4YJ,7066
869880,20240301102607,https://ground.news/_next/static/chunks/6206-c...,application/javascript,200,H3A4UZ3CHMJLWCQYB5NIIUIAZ7JDCEEZ,6678
869881,20240301102607,https://ground.news/_next/static/chunks/pages/...,application/javascript,200,SJML4KB3S7VDZOKZ6GW2V736B3QXREHS,5059
869882,20240301102607,https://ground.news/_next/static/chunks/pages/...,application/javascript,200,RWBQD3E2EXGHGLWNQLK7WJEFTR2YGFN3,3321


In [8]:
# Count NaN values in each column
nan_counts = df.isna().sum()

# Print columns with NaN values and their corresponding counts
print("Columns with NaN values:")
print(nan_counts[nan_counts > 0])

Columns with NaN values:
Series([], dtype: int64)


### Interpretación registros existentes

Interpretemos qué nos dicen estos registros (pero recordemos que estos no son aún nuestros datos de trabajo, sino que son la vía para obtenerlos por posterior scraping). Tenemos:

a) El "timestamp", en formato "yyyymmddhhmmss" (aunque expresado como potencia, en el dataframe).

b) URL, indica una dirección web almacenada en la Internet Archive Wayback Machine.

c) Content_type nos indica si se trata de contenido HTML, otros textos, fotos, aplicaciones, etc.

d) Checksum es un valor que sirve para comprobar la autenticidad y/o integridad de un fichero que se ha descargado.

e) Longitud, nos habla de la extensión del fichero. ¿Son kilobytes?

Bien, ¿y cómo sabemos encontrar cada uno de estos registros almacenados en a qué corresponde esto en en la base de datos de registros de la Internet Archive Wayback Machine? Si vamos al [buscador de Wayback Machine](https://web.archive.org/) para buscar la primera vez que se guardó la web propiamente dicha, encontramos que corresponde a esta URL:

https://web.archive.org/web/20180430191221/https://ground.news/

Este registro se guardó el 30 de abril de 2018, a las 19:12:21. Como podemos deducir, las URL dentro de la Internet Archive Wayback Machine se compondrán de los siguientes framentos:

'https://web.archive.org/web/' + timestamp + '/' + URL_de_interés

Es decir:

https://web.archive.org/web/timestamp/https://ground.news/

Esto será clave más adelante para scapear cada uno de los contenidos de interés.

### Limpieza de la lista de registros de Ground News disponibles en el Internet Archive

Para acelerar el proceso posterior y reducir la carga sobre el servidor, eliminaremos de la lista los registros que previsiblemente no nos será útiles de cara al scraping. De lo contrario, estaríamos haciendo muchísimas llamadas innecesarias al servidor de Internet Archive, que es además muy y muy lento.

De todas las páginas y ficheros de Ground News almacenados en la Internet Archive Wayback Machine, ¿nos interesan todos? Si echamos un vistazo, veremos que claramente no (entre otros, hay imágenes, fuentes, elementos de aplicaciones, etc.):

In [9]:
df.Content_type.unique()

array(['text/plain', 'text/html', 'application/rss+xml', 'image/png',
       'warc/revisit', 'unk', 'image/svg+xml', 'application/json',
       'text/css', 'image/vnd.microsoft.icon', 'application/javascript',
       'application/xml', 'font/woff2', 'font/woff', 'image/jpeg',
       'application/octet-stream', 'image/webp'], dtype=object)

Después de examinar los que generaban algunas dudas sobre si también podrían sernos útiles (p.ej. 'text/plain', aunque al final se refería a ficheros 'robots.txt'), vemos que la información de interés está contenida únicamente entre los registros de tipo "text/html". Nos quedaremos solo con esos registros.

In [10]:
df_only_HTML = df[df['Content_type'] == 'text/html']
len(df_only_HTML)
# 'text/plain', he podido comprobar que solo contenía el fichero robots.txt, y ninguna otra información de interés.
# Es argumentable que en 'text/css' pudiese haber algo indirectamente aprovechable (no he visto cómo), pero a priori son ficheros muy abstrusos

383860

Solo con esto, hemos reducido a menos de la mitad el número de registros. 

Aún así, aquí hay centenares de miles de registros, de los cuales en principio solo una parte contendrá los datos que deseamos recoger. Aplicaremos un filtrado en varios pasos.

Hay ficheros que en realidad a la práctica no son accesibles (Status 200 implica OK), cualquier otro valor implica que hay algún error.

#### DUDA: ¿si en lugar de código 200 nos da un error de otro tipo, es que Internet Archive intentó guardar la web pero falló, o es un error nuestro cuando intentamos acceder a qué tiene guardado Internet Archive?

In [11]:
df_only_HTML = df_only_HTML[df_only_HTML['Status'] == '200']  # Nos quedamos solo con los códigos 200 ("ok").
len(df_only_HTML)  # Solo hemos eliminado una pequeña porción adicional de registros.

375914

Bien, ahora si navegamos a la web de Ground News, veremos que hay una serie de noticias y apartados web que tienen información que nos puede interesar. Las páginas que pueden ser aprovechables, son las ubicadas aquí (y sus variaciones):

Esta nos interesa seguro (es la "landing page"):

https://ground.news/

Pudiéramos potencialmente obtener información extra de estas otras:

https://ground.news/blindspot

    
https://ground.news/article/... [URLS individuales]

https://ground.news/feed/... [URLS individuales]

En todo caso, la información que nos interesa está contenida en el cuerpo del código HTML de las páginas.

#### Registros de la "landing page"

Por lo que respecta a la "landing page", recopilamos cuáles son esos registros, así:

In [12]:
# Nos quedamos con las filas que contengan en el campo URL la página principal de Ground News.

landing_Ground_News = ("https://ground.news/", "http://ground.news/", "https://www.ground.news/", "http://www.ground.news/")
# Nota: he comprobado que no aparece ninguna variación de la URL con www. o sin la antibarra final.

ground_news_landing = df_only_HTML[df_only_HTML["URL"].isin(landing_Ground_News)]

Nota: hasta decidirnos por el enfoque de obtención de datos final, barajamos también el obtener otras URL como por ejemplo "https://ground.news/blindspot", "https://ground.news/feed/", "https://ground.news/article/..." y  "https://ground.news/feed/...", pero finalmente se han descartado, por superfluas.

El motivo es que, en la versión más moderna de la web, en lugar de hacer complejos "scrapings" adaptados a cada sección y a cada versión de la web, hemos hallado incrustado el equivalente a un fichero .json o un diccionario de python, del cual directamente podemos obtener la información de interés, de forma mucho más sencilla.

Es posible que con ello se sacrifique una parte de la información total, pero la inversión de tiempo de hacer un enfoque "tailored" para cosechar por scraping individual la información de cada sección, seleccionar cada "tag" y cada clase CSS, adaptarlo a cada una de las versiones de la web era prohibitiba.

In [13]:
ground_news_landing

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length
1,20180430191221,https://ground.news/,text/html,200,YMWQUN4TK55LBNTMWOFSIWHOG2S5YR4Q,14044
5,20180612183329,https://ground.news/,text/html,200,SBAWULRK6FFOHBHBKVVPSR62Q2W5MHNO,14120
16,20180717061218,https://ground.news/,text/html,200,GKX7Q7WBYEUDWCQCEQN2OSFMYZGFUANN,14127
18,20180728142351,https://ground.news/,text/html,200,2LMX765WKYAPDLYPKEKY3CZE5V6KMFVF,14137
29,20180831144331,https://ground.news/,text/html,200,UFAXG7IDCM467L55HYLMKLNGTBN6QVZV,13980
...,...,...,...,...,...,...
867274,20240224015824,https://ground.news/,text/html,200,P6MMVBVRYWVJLZNVZY6Z4OMEGHU7K4CF,74678
867732,20240225022316,https://ground.news/,text/html,200,ZWD62L5YPLZIO55EHEHFMPXTJYV5PAYY,77408
868049,20240226111410,https://ground.news/,text/html,200,7BYQNZATUTQJJE3IGGZKN2JPFGU5Y2M2,72221
868854,20240228221813,https://ground.news/,text/html,200,CLSVHH5KHXBSA22FXRF7LZWXQBW5HIIQ,72683


Desde el 11 de octubre de 2020 en adelante, el código de la página lleva inserto una suerte de diccionario o .json en la parte de "<script></script>", que contiene bastante de la información de interés.

Esto comienza, por lo tanto, en el timestamp: 
20201011182643

Hay algunos cambios de interfaz, en los cuales puede valer la pena comprobar que se mantiene la estructura del .json.
Un cambio sucede en el timestamp: 
20201112101102

Otro sucede aquí, aunque el .json podría ser igual o al menos muy parecido a los anteriores: 20201212123713
No obstante, a dicho diccionario/json le falta información sí disponible en la siguiente versión.

En todo caso, la versión más completa (a partir de 20 de enero de 2022), se encuentra desde el siguiente timestamp, y en adelante:
20220120212349
Esta última versión del .json incluye una métrica muy interesante ausente en versiones anteriores, que es la "factualidad" cubriendo las noticias (y alguna otra métrica).

Dado que el tiempo es limitado, a que pudiera haber leves cambios en la estructura del .json que nos complique la vida si intentamos reunir toda la información a que realmente el más útil será el más reciente, nos centramos en la última versión, sin cerrar la puerta a cosechar la información de versiones anteriores más adelante si conviene.

Con ello también reduciremos el riesgo de bloqueo del acceso a la página (al acceder menos veces). 

In [14]:
# Convertimos el "Timestamp" a objeto "datetime":
ground_news_landing['Timestamp_datetime'] = pd.to_datetime(ground_news_landing['Timestamp'], format='%Y%m%d%H%M%S')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_news_landing['Timestamp_datetime'] = pd.to_datetime(ground_news_landing['Timestamp'], format='%Y%m%d%H%M%S')


In [15]:
ground_news_landing

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length,Timestamp_datetime
1,20180430191221,https://ground.news/,text/html,200,YMWQUN4TK55LBNTMWOFSIWHOG2S5YR4Q,14044,2018-04-30 19:12:21
5,20180612183329,https://ground.news/,text/html,200,SBAWULRK6FFOHBHBKVVPSR62Q2W5MHNO,14120,2018-06-12 18:33:29
16,20180717061218,https://ground.news/,text/html,200,GKX7Q7WBYEUDWCQCEQN2OSFMYZGFUANN,14127,2018-07-17 06:12:18
18,20180728142351,https://ground.news/,text/html,200,2LMX765WKYAPDLYPKEKY3CZE5V6KMFVF,14137,2018-07-28 14:23:51
29,20180831144331,https://ground.news/,text/html,200,UFAXG7IDCM467L55HYLMKLNGTBN6QVZV,13980,2018-08-31 14:43:31
...,...,...,...,...,...,...,...
867274,20240224015824,https://ground.news/,text/html,200,P6MMVBVRYWVJLZNVZY6Z4OMEGHU7K4CF,74678,2024-02-24 01:58:24
867732,20240225022316,https://ground.news/,text/html,200,ZWD62L5YPLZIO55EHEHFMPXTJYV5PAYY,77408,2024-02-25 02:23:16
868049,20240226111410,https://ground.news/,text/html,200,7BYQNZATUTQJJE3IGGZKN2JPFGU5Y2M2,72221,2024-02-26 11:14:10
868854,20240228221813,https://ground.news/,text/html,200,CLSVHH5KHXBSA22FXRF7LZWXQBW5HIIQ,72683,2024-02-28 22:18:13


In [16]:
comparison_timestamp = datetime.strptime("20220120212349", '%Y%m%d%H%M%S')

# Filter the DataFrame using the datetime comparison
filtered_df = ground_news_landing[ground_news_landing["Timestamp_datetime"] >= comparison_timestamp]

In [17]:
# filtered_df = ground_news_landing[ground_news_landing["Timestamp"] >= "20220120212349"]

In [18]:
filtered_df

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length,Timestamp_datetime
112281,20220120212349,https://ground.news/,text/html,200,624OYQEUHRBXIKOGL476X5EITYJIDED4,31663,2022-01-20 21:23:49
112808,20220122064039,https://ground.news/,text/html,200,2EIGNCGAQ36NNMIKHUHFVW65R5LOO5JX,37527,2022-01-22 06:40:39
113067,20220123022125,https://www.ground.news/,text/html,200,7FF5FCUC54GUPR5DWMT4DZG5S7ODJRZC,28366,2022-01-23 02:21:25
113236,20220123123816,https://ground.news/,text/html,200,4MPDTDZBADTQUOQQOLVMRYQ3SWSLGQBY,36163,2022-01-23 12:38:16
113491,20220124052335,https://ground.news/,text/html,200,M5NQJIYZMEDJXYXZB3SNSQFOPVO3FJLQ,32815,2022-01-24 05:23:35
...,...,...,...,...,...,...,...
867274,20240224015824,https://ground.news/,text/html,200,P6MMVBVRYWVJLZNVZY6Z4OMEGHU7K4CF,74678,2024-02-24 01:58:24
867732,20240225022316,https://ground.news/,text/html,200,ZWD62L5YPLZIO55EHEHFMPXTJYV5PAYY,77408,2024-02-25 02:23:16
868049,20240226111410,https://ground.news/,text/html,200,7BYQNZATUTQJJE3IGGZKN2JPFGU5Y2M2,72221,2024-02-26 11:14:10
868854,20240228221813,https://ground.news/,text/html,200,CLSVHH5KHXBSA22FXRF7LZWXQBW5HIIQ,72683,2024-02-28 22:18:13


In [19]:
filtered_df['time_diff'] = filtered_df['Timestamp_datetime'].diff().fillna(pd.Timedelta(seconds=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['time_diff'] = filtered_df['Timestamp_datetime'].diff().fillna(pd.Timedelta(seconds=0))


In [20]:
filtered_df

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length,Timestamp_datetime,time_diff
112281,20220120212349,https://ground.news/,text/html,200,624OYQEUHRBXIKOGL476X5EITYJIDED4,31663,2022-01-20 21:23:49,0 days 00:00:00
112808,20220122064039,https://ground.news/,text/html,200,2EIGNCGAQ36NNMIKHUHFVW65R5LOO5JX,37527,2022-01-22 06:40:39,1 days 09:16:50
113067,20220123022125,https://www.ground.news/,text/html,200,7FF5FCUC54GUPR5DWMT4DZG5S7ODJRZC,28366,2022-01-23 02:21:25,0 days 19:40:46
113236,20220123123816,https://ground.news/,text/html,200,4MPDTDZBADTQUOQQOLVMRYQ3SWSLGQBY,36163,2022-01-23 12:38:16,0 days 10:16:51
113491,20220124052335,https://ground.news/,text/html,200,M5NQJIYZMEDJXYXZB3SNSQFOPVO3FJLQ,32815,2022-01-24 05:23:35,0 days 16:45:19
...,...,...,...,...,...,...,...,...
867274,20240224015824,https://ground.news/,text/html,200,P6MMVBVRYWVJLZNVZY6Z4OMEGHU7K4CF,74678,2024-02-24 01:58:24,1 days 02:41:58
867732,20240225022316,https://ground.news/,text/html,200,ZWD62L5YPLZIO55EHEHFMPXTJYV5PAYY,77408,2024-02-25 02:23:16,1 days 00:24:52
868049,20240226111410,https://ground.news/,text/html,200,7BYQNZATUTQJJE3IGGZKN2JPFGU5Y2M2,72221,2024-02-26 11:14:10,1 days 08:50:54
868854,20240228221813,https://ground.news/,text/html,200,CLSVHH5KHXBSA22FXRF7LZWXQBW5HIIQ,72683,2024-02-28 22:18:13,2 days 11:04:03


In [21]:
filtered_df['time_diff_hours'] = filtered_df['time_diff'].dt.total_seconds()/3600

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['time_diff_hours'] = filtered_df['time_diff'].dt.total_seconds()/3600


In [22]:
filtered_df

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length,Timestamp_datetime,time_diff,time_diff_hours
112281,20220120212349,https://ground.news/,text/html,200,624OYQEUHRBXIKOGL476X5EITYJIDED4,31663,2022-01-20 21:23:49,0 days 00:00:00,0.000000
112808,20220122064039,https://ground.news/,text/html,200,2EIGNCGAQ36NNMIKHUHFVW65R5LOO5JX,37527,2022-01-22 06:40:39,1 days 09:16:50,33.280556
113067,20220123022125,https://www.ground.news/,text/html,200,7FF5FCUC54GUPR5DWMT4DZG5S7ODJRZC,28366,2022-01-23 02:21:25,0 days 19:40:46,19.679444
113236,20220123123816,https://ground.news/,text/html,200,4MPDTDZBADTQUOQQOLVMRYQ3SWSLGQBY,36163,2022-01-23 12:38:16,0 days 10:16:51,10.280833
113491,20220124052335,https://ground.news/,text/html,200,M5NQJIYZMEDJXYXZB3SNSQFOPVO3FJLQ,32815,2022-01-24 05:23:35,0 days 16:45:19,16.755278
...,...,...,...,...,...,...,...,...,...
867274,20240224015824,https://ground.news/,text/html,200,P6MMVBVRYWVJLZNVZY6Z4OMEGHU7K4CF,74678,2024-02-24 01:58:24,1 days 02:41:58,26.699444
867732,20240225022316,https://ground.news/,text/html,200,ZWD62L5YPLZIO55EHEHFMPXTJYV5PAYY,77408,2024-02-25 02:23:16,1 days 00:24:52,24.414444
868049,20240226111410,https://ground.news/,text/html,200,7BYQNZATUTQJJE3IGGZKN2JPFGU5Y2M2,72221,2024-02-26 11:14:10,1 days 08:50:54,32.848333
868854,20240228221813,https://ground.news/,text/html,200,CLSVHH5KHXBSA22FXRF7LZWXQBW5HIIQ,72683,2024-02-28 22:18:13,2 days 11:04:03,59.067500


In [23]:
sorted(filtered_df.time_diff_hours)

[0.0,
 0.0,
 0.0,
 0.0,
 0.0002777777777777778,
 0.0002777777777777778,
 0.0002777777777777778,
 0.0005555555555555556,
 0.0008333333333333334,
 0.0011111111111111111,
 0.001388888888888889,
 0.0033333333333333335,
 0.003611111111111111,
 0.0044444444444444444,
 0.0077777777777777776,
 0.009166666666666667,
 0.010277777777777778,
 0.014722222222222222,
 0.03166666666666667,
 0.035555555555555556,
 0.051944444444444446,
 0.08888888888888889,
 0.10333333333333333,
 0.11916666666666667,
 0.14416666666666667,
 0.16472222222222221,
 0.17444444444444446,
 0.205,
 0.25333333333333335,
 0.2538888888888889,
 0.26472222222222225,
 0.2847222222222222,
 0.3894444444444444,
 0.41833333333333333,
 0.4525,
 0.4675,
 0.47888888888888886,
 0.7597222222222222,
 0.7633333333333333,
 0.7727777777777778,
 0.7888888888888889,
 0.7911111111111111,
 0.8005555555555556,
 0.8227777777777778,
 0.8844444444444445,
 0.9180555555555555,
 0.9511111111111111,
 0.975,
 0.9752777777777778,
 0.9977777777777778,
 1.04861

In [24]:
filtered_df['full_archive_URL'] = filtered_df['Timestamp'].apply(lambda x: f"https://web.archive.org/web/{x}/https://ground.news/")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['full_archive_URL'] = filtered_df['Timestamp'].apply(lambda x: f"https://web.archive.org/web/{x}/https://ground.news/")


In [25]:
filtered_df

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length,Timestamp_datetime,time_diff,time_diff_hours,full_archive_URL
112281,20220120212349,https://ground.news/,text/html,200,624OYQEUHRBXIKOGL476X5EITYJIDED4,31663,2022-01-20 21:23:49,0 days 00:00:00,0.000000,https://web.archive.org/web/20220120212349/htt...
112808,20220122064039,https://ground.news/,text/html,200,2EIGNCGAQ36NNMIKHUHFVW65R5LOO5JX,37527,2022-01-22 06:40:39,1 days 09:16:50,33.280556,https://web.archive.org/web/20220122064039/htt...
113067,20220123022125,https://www.ground.news/,text/html,200,7FF5FCUC54GUPR5DWMT4DZG5S7ODJRZC,28366,2022-01-23 02:21:25,0 days 19:40:46,19.679444,https://web.archive.org/web/20220123022125/htt...
113236,20220123123816,https://ground.news/,text/html,200,4MPDTDZBADTQUOQQOLVMRYQ3SWSLGQBY,36163,2022-01-23 12:38:16,0 days 10:16:51,10.280833,https://web.archive.org/web/20220123123816/htt...
113491,20220124052335,https://ground.news/,text/html,200,M5NQJIYZMEDJXYXZB3SNSQFOPVO3FJLQ,32815,2022-01-24 05:23:35,0 days 16:45:19,16.755278,https://web.archive.org/web/20220124052335/htt...
...,...,...,...,...,...,...,...,...,...,...
867274,20240224015824,https://ground.news/,text/html,200,P6MMVBVRYWVJLZNVZY6Z4OMEGHU7K4CF,74678,2024-02-24 01:58:24,1 days 02:41:58,26.699444,https://web.archive.org/web/20240224015824/htt...
867732,20240225022316,https://ground.news/,text/html,200,ZWD62L5YPLZIO55EHEHFMPXTJYV5PAYY,77408,2024-02-25 02:23:16,1 days 00:24:52,24.414444,https://web.archive.org/web/20240225022316/htt...
868049,20240226111410,https://ground.news/,text/html,200,7BYQNZATUTQJJE3IGGZKN2JPFGU5Y2M2,72221,2024-02-26 11:14:10,1 days 08:50:54,32.848333,https://web.archive.org/web/20240226111410/htt...
868854,20240228221813,https://ground.news/,text/html,200,CLSVHH5KHXBSA22FXRF7LZWXQBW5HIIQ,72683,2024-02-28 22:18:13,2 days 11:04:03,59.067500,https://web.archive.org/web/20240228221813/htt...


In [26]:
sorted_filtered_df = filtered_df[filtered_df["time_diff_hours"] >= 1].sort_values(by="time_diff_hours")

In [27]:
sorted_filtered_df

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length,Timestamp_datetime,time_diff,time_diff_hours,full_archive_URL
729715,20230728204121,https://ground.news/,text/html,200,LXZYU57HADWIPZQO2VPARSJQTUWSP4VF,80957,2023-07-28 20:41:21,0 days 01:02:55,1.048611,https://web.archive.org/web/20230728204121/htt...
135230,20220313191912,https://ground.news/,text/html,200,L3CEZMWMR4U5CM2HMPZBONFNZUJAIOYC,31898,2022-03-13 19:19:12,0 days 01:08:36,1.143333,https://web.archive.org/web/20220313191912/htt...
840412,20240113212647,https://ground.news/,text/html,200,Z2O54QMPBAT5BD4GNYSWWIX4FPYANMVJ,75626,2024-01-13 21:26:47,0 days 01:09:05,1.151389,https://web.archive.org/web/20240113212647/htt...
139816,20220326132916,https://ground.news/,text/html,200,W4AJ2LKB3SVJ3YYZZ2FE7VAQB7FDAK64,44232,2022-03-26 13:29:16,0 days 01:11:03,1.184167,https://web.archive.org/web/20220326132916/htt...
134799,20220313103145,https://ground.news/,text/html,200,IUGPPMEFM6KSBL3T2RP5JIA7TXLAIU54,30038,2022-03-13 10:31:45,0 days 01:12:37,1.210278,https://web.archive.org/web/20220313103145/htt...
...,...,...,...,...,...,...,...,...,...,...
328511,20221101000834,https://ground.news/,text/html,200,3PP4J6G53552DA33PM3CWHBO443HQEGP,38947,2022-11-01 00:08:34,8 days 18:37:03,210.617500,https://web.archive.org/web/20221101000834/htt...
386166,20221127221246,https://ground.news/,text/html,200,MMRYGY4MD4NL67OFRNGAUBIYBDZTDSMM,59308,2022-11-27 22:12:46,9 days 02:42:37,218.710278,https://web.archive.org/web/20221127221246/htt...
206274,20220725205403,https://ground.news/,text/html,200,B4F5FAMWPFUVMQKGXMS3MKF7UW42I56K,45959,2022-07-25 20:54:03,9 days 10:01:32,226.025556,https://web.archive.org/web/20220725205403/htt...
260752,20221001001902,https://ground.news/,text/html,200,4ZNVLIYLGP4FZ7W5DUC6RLC56ATDW7PV,38444,2022-10-01 00:19:02,13 days 02:59:18,314.988333,https://web.archive.org/web/20221001001902/htt...


In [28]:
# Configure Firefox Options
firefox_options = FirefoxOptions()

# Example: Add argument to run Firefox in headless mode
firefox_options.add_argument("--headless")

# Optionally, configure a Firefox profile as needed
firefox_profile = webdriver.FirefoxProfile()

# Example: Disable images to make it lighter on the server
firefox_profile.set_preference("permissions.default.image", 2)
firefox_profile.set_preference("dom.ipc.plugins.enabled.libflashplayer.so", False)


driver = webdriver.Firefox(
    executable_path='C:\\Users\\karel\\.wdm\\drivers\\geckodriver\\win64\\v0.34.0\\geckodriver.exe',
    firefox_profile=firefox_profile,
    options=firefox_options
)

In [48]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Configure Chrome Options
chrome_options = Options()

# To run in non-headless mode, simply do not add the "--headless" argument
# (The line to add the headless argument is omitted intentionally for non-headless mode)

# Optionally, disable images to make it lighter on the server
chrome_prefs = {
    "profile.managed_default_content_settings.images": 2,
    "plugins.plugins_disabled": ["Chrome PDF Viewer"]  # This disables the internal PDF viewer
    # Additional preferences can be set here as needed
}
chrome_options.add_experimental_option("prefs", chrome_prefs)

# Initialize the Chrome WebDriver with webdriver-manager
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

In [39]:
# Check if the json_data subfolder exists, create it if it doesn't
if not os.path.exists("json_data"):
    os.makedirs("json_data")

In [40]:
sample_df = filtered_df.head(1)

In [41]:
sample_df

Unnamed: 0,Timestamp,URL,Content_type,Status,Checksum,Length,Timestamp_datetime,time_diff,time_diff_hours,full_archive_URL
112281,20220120212349,https://ground.news/,text/html,200,624OYQEUHRBXIKOGL476X5EITYJIDED4,31663,2022-01-20 21:23:49,0 days,0.0,https://web.archive.org/web/20220120212349/htt...


In [42]:
# Lo mezclamos para que no detecten secuencialidad lineal en las peticiones de acceso:
shuffled_filtered_df = sorted_filtered_df.sample(frac=1).reset_index(drop=True)

In [50]:
for index, row in shuffled_filtered_df.iterrows():
    # Define the file name using the Timestamp, now including the subfolder path "json_data"
    file_name = os.path.join("json_data", f"{row['Timestamp']}.json")
    
    # Check if the file already exists
    if os.path.exists(file_name):
        print(f"File {file_name} already exists. Skipping to the next URL.")
        continue  # Skip the rest of the loop and move to the next row
    
    # La url que visitará el webdriver
    url = row['full_archive_URL']
    
    # Tiempo de descanso de entre algo más de un minuto y poco más de cuatro minutos (la web es lenta... para que respire).
    sleep_time = random.uniform(128, 286)

    # Sleep for the randomly generated duration
    sleep(sleep_time)
    
    # Visit the URL
    driver.get(url)
    
    # Wait for the page to load. Adjust the time as necessary for your network speed and page complexity.
    driver.implicitly_wait(20)  # Waits for 20 seconds; mucho tiempo, porque va tan lento como el caballo del malo
    
    # Extract the content of the <script> tag
    script_content = driver.find_element_by_id("__NEXT_DATA__").get_attribute('innerHTML')
    
    # Convert the script content into a dictionary (assuming it's valid JSON)
    data = json.loads(script_content)
    
    # Save the extracted content as a JSON file
    with open(file_name, 'w') as file:
        json.dump(data, file)

    print(f"Data saved to {file_name}")
    
# Make sure to close the driver after completing the tasks
driver.quit()

File json_data\20221104001135.json already exists. Skipping to the next URL.
File json_data\20230614004710.json already exists. Skipping to the next URL.
File json_data\20220303134026.json already exists. Skipping to the next URL.
File json_data\20240112174021.json already exists. Skipping to the next URL.
File json_data\20220701001203.json already exists. Skipping to the next URL.
File json_data\20231222024708.json already exists. Skipping to the next URL.
File json_data\20230908080917.json already exists. Skipping to the next URL.
File json_data\20230628141702.json already exists. Skipping to the next URL.
File json_data\20220201165336.json already exists. Skipping to the next URL.
File json_data\20231214125220.json already exists. Skipping to the next URL.
File json_data\20220128095118.json already exists. Skipping to the next URL.
File json_data\20230401042949.json already exists. Skipping to the next URL.
File json_data\20230503144707.json already exists. Skipping to the next URL.

In [None]:
Parece funcionar. Lo hacemos con el resto del dataframe.

In [47]:
driver.quit()

Hemos logrado 475 ficheros .json desde 20 de enero de 2022, que en principio incorporan una estructura de datos común.

Vemos que "factuality", una métrica de gran interés a priori, solo aparece a partir del fichero "20220222033957.json". Por homogeneizar los datos, nos quedamos con los ficheros a partir de este solamente. Eso nos deja con 430 ficheros .json "modernos" todos ellos con esa métrica.

La parte referida al trabajo con los datos propiamente dicho, la hacemos en un nuevo notebook por claridad.