<a href="https://colab.research.google.com/github/javieraespinosa/lifranum/blob/main/WARC_URLs_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuration

Requirements:
https://aut.docs.archivesunleashed.org/docs/dependencies


In [1]:
import os

In [2]:
# Release identifiers of the Spark distribution and of the Archives Unleashed
# Toolkit (AUT); they are reused to build download URLs and local paths.
SPARK_VERSION = "spark-2.4.7-bin-hadoop2.7"
AUT_VERSION = "aut-0.80.0"

# Directory (under the current working directory) where the tools are installed.
APPS_HOME = f"{os.getcwd()}/apps"

In [3]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q findspark

In [4]:
!mkdir -p "$APPS_HOME"

!wget -q https://archive.apache.org/dist/spark/spark-2.4.7/"$SPARK_VERSION".tgz
!wget -q https://github.com/archivesunleashed/aut/releases/download/"$AUT_VERSION"/"$AUT_VERSION".zip
!wget -q https://github.com/archivesunleashed/aut/releases/download/"$AUT_VERSION"/"$AUT_VERSION"-fatjar.jar
!tar -xf "$SPARK_VERSION".tgz
!mv spark-* aut-* "$APPS_HOME"

!wget -q https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -qq ngrok-stable-linux-amd64.zip
!mv ngrok* "$APPS_HOME"

!rm -rf sample_data

In [5]:
import os
import findspark

# Memory passed to spark-submit's --driver-memory option.
SPARK_DRIVER_MEMORY = "8g"

# Point the environment at the JDK and at the Spark distribution under APPS_HOME.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"{APPS_HOME}/{SPARK_VERSION}"

# Launch arguments for the PySpark shell: driver memory, the AUT fat jar (--jars)
# and the AUT Python package (--py-files).
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    f"--driver-memory {SPARK_DRIVER_MEMORY} "
    f"--jars {APPS_HOME}/{AUT_VERSION}-fatjar.jar "
    f"--py-files {APPS_HOME}/{AUT_VERSION}.zip "
    "pyspark-shell"
)

# Must run after the environment variables above are set and before pyspark is imported.
findspark.init()

In [6]:
import pyspark
from pyspark.sql import SQLContext

# Spark configuration: serve the Spark UI on port 4050.
conf = pyspark.SparkConf()
conf.set("spark.ui.port", "4050")

# Get the running SparkContext (or start one with this configuration) and wrap it
# in the SQLContext that is handed to WebArchive further down.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

In [7]:
# Show the active SparkContext as the cell output (quick check that Spark started).
sc

## Bug: Correcting URLs in Wget's WARCs

In [9]:
from pyspark.sql.functions import udf

def _strip_angle_brackets(s):
    """Return `s` without a surrounding '<' ... '>' pair.

    Values that are not wrapped in angle brackets are returned unchanged.
    None (a null column value) and the empty string are passed through as-is
    instead of raising, so a single null URL cannot fail the whole Spark job.
    """
    if s and s.startswith('<') and s.endswith('>'):
        return s[1:-1]
    return s


# Spark UDF that removes the enclosing angle brackets from a URL column.
url_correction = udf(_strip_angle_brackets)

# Extracting web pages' URLs from WARC files

In [16]:
# WARC archive to analyse.
# NOTE(review): the path is inside Google Drive; no Drive mount step is visible in
# this notebook, so it presumably has to be mounted at /content/drive beforehand.
WARCs_PATH = "/content/drive/MyDrive/LIFRANUM-DRIVE/jecritures.blogspot.com-H-L1/out-00000.warc.gz"

In [37]:
# Temporary folder for the CSV part files that hold the partial results
OUTPUT_PATH = "full-links-all/"

In [38]:
from aut import *
from pyspark.sql.functions import desc, col

# Count how many times each web page URL appears in the archive and write the
# (domain, url, count) rows, most frequent first, as CSV part files in OUTPUT_PATH.
#
# - url_correction strips the angle brackets around URLs before the domain is derived.
# - mode("overwrite") makes the cell re-runnable: with the default save mode the
#   write fails when OUTPUT_PATH is left over from a previous (or interrupted) run.
(
    WebArchive(sc, sqlContext, WARCs_PATH)
    .webpages()
    .withColumn("url", url_correction("url"))
    .select(remove_prefix_www(extract_domain(col("url"))).alias("domain"), "url")
    .groupBy("domain", "url")
    .count()
    .sort(desc("count"))
    .write
    .mode("overwrite")
    .csv(OUTPUT_PATH)
)


In [39]:
# Concatenate all CSV part files from OUTPUT_PATH into one file, full-links-all.csv,
# in the current working directory (any existing file with that name is overwritten).
!cat "$OUTPUT_PATH"/*.csv > full-links-all.csv

In [40]:
# Delete temporal folder
!rm -r "$OUTPUT_PATH"