# Exploring WebArchives using GraphFrames (Docker)

## ENV config

Initialize Spark in a [single-node cluster](https://docs.databricks.com/clusters/single-node.html) with the AUT and GraphFrames libraries.


In [None]:
# Execute the Spark initialization notebook.
# NOTE(review): later cells use `spark` and `sqlContext`, neither of which is
# defined in this notebook -- presumably the init notebook creates them; confirm.
%run ../scripts/spark-init-docker.ipynb
# Bare expression: show the `spark` object as this cell's output.
spark

## LIFRANUM dataset

In [None]:
%%capture
# The %%capture magic above suppresses this cell's output.
# Local directory that receives the downloaded collections.
DIR="LIFRANUM"
# -p: create the directory if needed, do nothing if it already exists.
!mkdir -p $DIR

# --------------------------------------------------------
# Uncomment the line(s) below to download the WARC
# collection(s) of your choice.
# Note: the links-extraction cell reads
# LIFRANUM/repo-ecritures-num/out-00000.warc.gz, so the
# "repo-ecritures-num" collection must be present locally.
# --------------------------------------------------------

# !gsutil -m cp -r gs://cpe-lyon/LIFRANUM/autre $DIR
# !gsutil -m cp -r gs://cpe-lyon/LIFRANUM/cartoweb $DIR
# !gsutil -m cp -r gs://cpe-lyon/LIFRANUM/lifranum-method $DIR
# !gsutil -m cp -r gs://cpe-lyon/LIFRANUM/repo-ecritures-num $DIR

## WebArchive links extraction

Note:

* **AUT generates dataframes**. See the [AUT dataframe schemas](https://aut.docs.archivesunleashed.org/docs/dataframe-schemas) for more info.
* Check the [AUT documentation](https://aut.docs.archivesunleashed.org/docs/home) for more examples.



In [None]:
from aut import *
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import desc, col, udf

# WARC file to analyze (path relative to the working directory).
WARCs_path = "LIFRANUM/repo-ecritures-num/out-00000.warc.gz"

# Load the WARC(s) with AUT and build the webgraph dataframe. The "src" column
# is renamed to "orig"; the edges query reads it under that name.
# Tip: append .limit(1000) to the chain to experiment on a smaller dataframe.
df = (
    WebArchive(spark.sparkContext, sqlContext, WARCs_path)
    .webgraph()
    .withColumnRenamed("src", "orig")
)

# Preview the first rows.
df.show(10)

## GraphFrame Generation

In [None]:
import tldextract

@udf("string")
def extract_domain(s):
    """Return the fully qualified domain name of the URL `s`.

    Args:
        s: URL string, or None when the column value is NULL.

    Returns:
        The `fqdn` value computed by tldextract, or None when `s` is None.
        NULL and empty results are both discarded by the
        `src != "" AND dst != ""` filter of the edges query.
    """
    # A NULL column value reaches a Python UDF as None; guard against it so a
    # single missing URL cannot raise inside tldextract and abort the Spark job.
    if s is None:
        return None
    return tldextract.extract(s).fqdn


# Make the UDF callable from Spark SQL under the name "extract_domain".
sqlContext.udf.register("extract_domain", extract_domain)

In [None]:
# --- Edges: one row per (source domain, destination domain) pair -------------
# Expose the link dataframe to Spark SQL.
df.createOrReplaceTempView("webgraph")

# Map both ends of every link to its domain, drop links where either domain is
# empty, and count how many links connect each pair of domains.
sql = '''
    SELECT src, dst, count(*) AS count
    FROM (
        SELECT extract_domain(orig) AS src,
               extract_domain(dest) AS dst
        FROM webgraph
    )
    WHERE src != "" AND dst != ""
    GROUP BY src, dst
'''

# cache() returns the same dataframe, so it can be chained onto the query.
edges = sqlContext.sql(sql).cache()
edges.show(10)

# --- Vertices: every domain appearing at either end of an edge ---------------
edges.createOrReplaceTempView("edges")

# The column is named "id" after the first SELECT of the UNION.
sql = '''
    SELECT DISTINCT src as id
    FROM   edges
    UNION
        SELECT dst
        FROM   edges
'''

vertices = sqlContext.sql(sql).cache()
vertices.show(10)

In [None]:
from graphframes import GraphFrame

# Assemble the graph from the domain vertices and the aggregated domain edges,
# and cache it because every following cell queries it.
webGraph = GraphFrame(vertices, edges)
webGraph.cache()

# Report the size of the graph.
n_vertices = webGraph.vertices.count()
n_edges = webGraph.edges.count()
print("Nodes: " + str(n_vertices))
print("Edges: " + str(n_edges))

In [None]:
# Ten vertices with the highest in-degree, printed without truncating values.
webGraph.inDegrees.sort(col("inDegree").desc()).show(10, truncate=False)

In [None]:
# Ten vertices with the highest out-degree, printed without truncating values.
webGraph.outDegrees.sort(col("outDegree").desc()).show(10, truncate=False)

In [None]:
# Ten vertices with the highest total degree, printed without truncating values.
webGraph.degrees.sort(col("degree").desc()).show(10, truncate=False)

## Graph Visualization

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Keep the drawing small: take at most 1000 edges and sample roughly half of
# them. The fixed seed makes the sampling step repeatable between runs.
dfx = edges.limit(1000).sample(fraction=0.5, seed=42)

# Links are directed (src -> dst), so build a DiGraph. The default undirected
# Graph would merge A->B with B->A into a single edge and keep only one of the
# two "count" values, while the widget below is asked to draw directed edges.
# `edge_key` is not passed: networkx only uses it when building a multigraph.
G = nx.from_pandas_edgelist(
    dfx.toPandas(),
    source="src",
    target="dst",
    edge_attr="count",
    create_using=nx.DiGraph,
)

In [None]:
from ipycytoscape import *

# Interactive view of the sampled graph; directed=True tags every edge with the
# "directed" class so it can be styled with an arrow.
cytoscapeobj = CytoscapeWidget()
cytoscapeobj.graph.add_graph_from_networkx(G, directed=True)

# NOTE(review): set_style appears to replace the widget's default stylesheet
# (including its arrow rule for directed edges) -- confirm. The explicit
# "edge.directed" rule below keeps link direction visible either way.
cytoscapeobj.set_style([
    {
        # Label each node with its id (the domain name).
        'selector': 'node',
        'css': {
            'content': 'data(id)'
        }
    },
    {
        # Arrow heads are only rendered with a non-default curve style.
        'selector': 'edge.directed',
        'css': {
            'curve-style': 'bezier',
            'target-arrow-shape': 'triangle'
        }
    }
])

# See https://blog.js.cytoscape.org/2020/05/11/layouts/
cytoscapeobj.set_layout(name='concentric')
cytoscapeobj

## PageRank

In [None]:
# Run PageRank for a fixed number of iterations.
# A single iteration only propagates rank across direct links, so the scores
# are far from converged; 10 iterations is used instead. Lower the value if
# the run takes too long on your collection.
ranks = webGraph.pageRank(
    resetProbability=0.15,
    maxIter=10
)

# The result is queried by the following cell, so keep it cached.
ranks.cache()

In [None]:
# Ten vertices with the highest PageRank score, printed without truncation.
(
    ranks.vertices
    .select("id", "pagerank")
    .sort(col("pagerank").desc())
    .show(10, truncate=False)
)

## Community Detection

In [None]:
# Run label propagation for 5 iterations and sort the result by label so that
# vertices sharing a label are listed together.
communities = webGraph.labelPropagation(maxIter=5).sort("label")
communities.cache()

In [None]:
# Preview the first ten (vertex, label) rows.
communities.show(n=10)