<a href="https://colab.research.google.com/github/javieraespinosa/lifranum/blob/main/LIFRANUM_UAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuration

Requirements:
https://aut.docs.archivesunleashed.org/docs/dependencies


In [None]:
import os

In [None]:
# Release names of the Spark distribution and of the Archives Unleashed
# Toolkit (AUT); the download and environment cells build file names from them.
SPARK_VERSION = "spark-2.4.7-bin-hadoop2.7"
AUT_VERSION = "aut-0.80.0"

# Directory, under the current working directory, that holds the installed tools.
APPS_HOME = os.getcwd() + "/apps"

In [None]:
# Install a headless Java 8 JDK (JAVA_HOME is pointed at it further down);
# -qq and the redirect keep the cell output empty.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q findspark

In [None]:
# Everything is downloaded and unpacked directly inside APPS_HOME, so this
# cell can be re-run: -nc skips archives that are already present, and the
# extraction steps overwrite in place instead of moving directories around.
# NOTE(review): -nc also keeps a partially downloaded file; delete it by hand
# if a download was interrupted.
!mkdir -p "$APPS_HOME"

# Spark distribution plus the AUT Python package (.zip) and fat jar.
!wget -q -nc -P "$APPS_HOME" https://archive.apache.org/dist/spark/spark-2.4.7/"$SPARK_VERSION".tgz
!wget -q -nc -P "$APPS_HOME" https://github.com/archivesunleashed/aut/releases/download/"$AUT_VERSION"/"$AUT_VERSION".zip
!wget -q -nc -P "$APPS_HOME" https://github.com/archivesunleashed/aut/releases/download/"$AUT_VERSION"/"$AUT_VERSION"-fatjar.jar
!tar -xf "$APPS_HOME"/"$SPARK_VERSION".tgz -C "$APPS_HOME"

# ngrok binary, used later to expose the Spark web UI.
!wget -q -nc -P "$APPS_HOME" https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -qq -o "$APPS_HOME"/ngrok-stable-linux-amd64.zip -d "$APPS_HOME"

# Remove the sample_data directory from the working directory.
!rm -rf sample_data

In [None]:
import os
import findspark

# Value passed to spark-submit's --driver-memory option.
SPARK_DRIVER_MEMORY = "8g"

# Tell PySpark where Java and Spark live, and have it start with the AUT
# fat jar on --jars and the AUT zip on --py-files.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "{}/{}".format(APPS_HOME, SPARK_VERSION),
    'PYSPARK_SUBMIT_ARGS': '--driver-memory {0} --jars {2}/{1}-fatjar.jar --py-files {2}/{1}.zip pyspark-shell'.format(SPARK_DRIVER_MEMORY, AUT_VERSION, APPS_HOME),
})

# Let findspark locate the Spark installation so that the `import pyspark`
# in the next cell succeeds.
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SQLContext

# Serve the Spark web UI on port 4050 (the ngrok tunnel below targets it).
conf = pyspark.SparkConf()
conf.set("spark.ui.port", "4050")

# Reuse an already running context if there is one; otherwise start one
# with the configuration above.
sc = pyspark.SparkContext.getOrCreate(conf)

# SQL entry point handed to WebArchive in the example cells.
sqlContext = SQLContext(sc)

In [None]:
# Echo the SparkContext as the cell output to confirm that it was created.
sc

In [None]:
# Start ngrok in the background (trailing `&`) so the cell returns at once;
# it opens an HTTP tunnel to local port 4050, the port set for the Spark UI.
ngrok_command = '{}/ngrok http 4050 &'.format(APPS_HOME)
get_ipython().system_raw(ngrok_command)

In [None]:
# Fetch the tunnel list from http://localhost:4040/api/tunnels (presumably
# ngrok's local API - TODO confirm), parse the JSON response and print the
# public_url of the first tunnel.
# NOTE(review): the one-liner indexes ['tunnels'][0], so it fails if the
# tunnel has not been registered yet; re-run the cell after a few seconds.
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

http://e507cf101822.ngrok.io


# Data

In [None]:
# -p: do not fail when data/ already exists, so the cell can be re-run.
!mkdir -p data

# Sample ARC and WARC files from the aut-resources repository.
# NOTE(review): the example cells read /content/drive/MyDrive/WARC, not data/;
# point them at "data" to run the examples on these samples instead.
!wget -q "https://github.com/archivesunleashed/aut-resources/blob/master/Sample-Data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz?raw=true" -O data/ARCHIVEIT-227-UOFTORONTO-CANPOLPINT-20060622205612-00009-crawling025.archive.org.arc.gz
!wget -q "https://github.com/archivesunleashed/aut-resources/blob/master/Sample-Data/ARCHIVEIT-227-QUARTERLY-XUGECV-20091218231727-00039-crawling06.us.archive.org-8091.warc.gz?raw=true" -O data/ARCHIVEIT-227-QUARTERLY-XUGECV-20091218231727-00039-crawling06.us.archive.org-8091.warc.gz

# Examples

## Extract All URLs

In [None]:
from aut import *
from pyspark.sql.functions import desc

# Open the archive collection and keep only the URL of each web page.
archive = WebArchive(sc, sqlContext, "/content/drive/MyDrive/WARC")
page_urls = archive.webpages().select("url")

# Print the first 10 rows; False turns off truncation of long values.
page_urls.show(10, False)

+-------------------------------------------------------------------------------------------------------------------------------------+
|url                                                                                                                                  |
+-------------------------------------------------------------------------------------------------------------------------------------+
|http://haikuduvidetdelaplenitude.blogspot.com/2016/02/nahaiwrimo-j-11.html                                                           |
|https://www.tempslibres.org/tl/tlphp/dbhk03.php?id=8763&lg=f                                                                         |
|https://cheminsbattus.wordpress.com/2018/12/27/distance/                                                                             |
|https://wordpress.com/log-in?redirect_to=https%3A%2F%2Fcheminsbattus.wordpress.com%2F2018%2F12%2F27%2Fdistance%2F&signup_flow=account|
|https://www.tempslibres.org/tl/tlphp/dbhk03.php

## Extract Top-Level Domains

In [None]:
from aut import *
from pyspark.sql.functions import desc

archive = WebArchive(sc, sqlContext, "/content/drive/MyDrive/WARC")

# One row per web page, reduced to the domain extracted from its URL.
page_domains = archive.webpages().select(extract_domain("url").alias("domain"))

# Number of pages per domain, largest count first.
domain_counts = page_domains.groupBy("domain").count().sort(desc("count"))

# Print the 10 most frequent domains without truncating their names.
domain_counts.show(10, False)

+--------------------------------------+-----+
|domain                                |count|
+--------------------------------------+-----+
|www.tempslibres.org                   |3811 |
|haikuduvidetdelaplenitude.blogspot.com|2138 |
|twitter.com                           |1067 |
|wordpress.com                         |995  |
|cheminsbattus.wordpress.com           |856  |
|lefeucentral.blogspot.com             |658  |
|janickbelleau.ca                      |482  |
|lalitoutsimplement.com                |471  |
|www.youtube.com                       |96   |
|poesiemuziketc.fr                     |10   |
+--------------------------------------+-----+
only showing top 10 rows



## Extract Simple Site Link Structure

In [None]:
from aut import *
from pyspark.sql.functions import col, explode

# SQL LIKE pattern: keep only pages whose content contains "radio".
content = "%radio%"

(
    WebArchive(sc, sqlContext, "/content/drive/MyDrive/WARC")
    .webpages()
    .filter(col("content").like(content))
    # One row per link extracted from a page.
    .select(explode(extract_links("url", "content")).alias("links"))
    # Reduce the first two fields of each link to domains, via remove_prefix_www.
    .select(remove_prefix_www(extract_domain(col("links._1"))).alias("src"), remove_prefix_www(extract_domain(col("links._2"))).alias("dest"))
    .groupBy("src", "dest")
    .count()
    # Keep only domain pairs linked more than 5 times.
    .filter(col("count") > 5)
    # Overwrite mode lets the cell be re-run; the default mode raises an
    # error when the output directory already exists.
    # NOTE(review): the directory name says "apple" while the pattern is "radio".
    .write.mode("overwrite").csv("links-all-apple-df/")
)


## Finding Hyperlinks within Collection on Pages with Certain Keyword

In [None]:
from aut import *
from pyspark.sql.functions import col, explode_outer

# Build the DataFrame first and display it afterwards: show() returns None,
# so chaining it onto the assignment would leave `webpages` bound to None.
webpages = (
    WebArchive(sc, sqlContext, "/content/drive/MyDrive/WARC")
    .webpages()
    # One row per extracted link; explode_outer also keeps pages without links.
    .select(remove_prefix_www(extract_domain("url")).alias("domain"), "url", "crawl_date", explode_outer(extract_links("url", "content")).alias("link"))
    # NOTE(review): `content` is not among the columns selected above; the
    # recorded output shows that Spark still resolves it for this filter.
    .filter(col("content").like("%food%"))
    .select("url", "domain", "crawl_date", col("link._1").alias("destination_page"))
)

# Print the first 10 rows (long values are truncated by default).
webpages.show(10)

+--------------------+--------------------+----------+--------------------+
|                 url|              domain|crawl_date|    destination_page|
+--------------------+--------------------+----------+--------------------+
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haikuduvid...|haikuduvidetdelap...|  20201028|http://haikuduvid...|
|http://haik

## Export to Gephi

In [None]:
from pyspark.sql.functions import col, desc

# Grouping keys: the domain of each end of a link, passed through remove_prefix_www.
src_domain = remove_prefix_www(extract_domain("src")).alias("src_domain")
dest_domain = remove_prefix_www(extract_domain("dest")).alias("dest_domain")

# Number of links per (crawl date, source domain, destination domain).
link_counts = (
    WebArchive(sc, sqlContext, "/content/drive/MyDrive/WARC")
    .webgraph()
    .groupBy("crawl_date", src_domain, dest_domain)
    .count()
)

# Row predicates: both domains must be present and non-empty.
has_dest_domain = (col("dest_domain").isNotNull()) & (col("dest_domain") !="")
has_src_domain = (col("src_domain").isNotNull()) & (col("src_domain") !="")

# Keep edges seen more than 5 times, strongest first, and bring the rows
# to the driver as a list.
graph = (
    link_counts
    .filter(has_dest_domain)
    .filter(has_src_domain)
    .filter(col("count") > 5)
    .orderBy(desc("count"))
    .collect()
)

# Write the collected rows to a GEXF file for Gephi.
WriteGEXF(graph, "links-for-gephi.gexf")

# Resources

* [A Must-Read Guide on How to Work with PySpark on Google Colab for Data Scientists!](https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/)
* [Archives Unleashed Toolkit](https://aut.docs.archivesunleashed.org/docs/dependencies)
