# Author:

- Huu Khang Nguyen - 7402909
- hkn878@uowmail.edu.au


# Environment:

- Python 3.10.6
- Ubuntu 22.04.2 LTS x86_64

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)
# Handle to the session's SparkContext, used below for the RDD API.
spark_context = spark.sparkContext

23/05/28 14:07:41 WARN Utils: Your hostname, huukhang1512-B550I-AORUS-PRO-AX resolves to a loopback address: 127.0.1.1; using 192.168.0.162 instead (on interface wlp6s0)
23/05/28 14:07:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/28 14:07:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Load data into RDD

In [20]:
# Location of the web-page dataset. Nothing is downloaded by this cell:
# the file has to be present at this path before the cell is run.
WEB_PAGE_DATASET = "./gr0.California.txt"

# Load the text file into an RDD of raw lines.
lines = spark_context.textFile(WEB_PAGE_DATASET)

In [21]:
def process_line(line):
    """Split a raw dataset line on single spaces and return the list of fields."""
    fields = line.split(" ")
    return fields

In [22]:
# Tokenise every raw line into its space-separated fields.
# NOTE(review): lines_map is not referenced by any later cell visible here;
# confirm it is still needed.
lines_map = lines.map(process_line)

In [83]:
def add_1(line):
    """Turn an edge line into a ``(source, destination, 1)`` triple.

    The line is split on single spaces and must yield exactly three fields.
    The first field (the record tag) is discarded; the trailing ``1`` is a
    unit count that callers can sum per key.

    Raises:
        ValueError: if the line does not split into exactly three fields.
    """
    fields = line.split(' ')
    _tag, source_id, target_id = fields
    return (source_id, target_id, 1)

In [84]:
def process_web_page(line):
    """Turn a node line into a ``(page_id, url)`` pair.

    The line is expected to look like ``"<tag> <page_id> <url>"``. The tag is
    discarded and both returned values are strings.

    Args:
        line: one raw node record from the dataset.

    Returns:
        A ``(page_id, url)`` tuple.

    Raises:
        ValueError: if the line has fewer than three space-separated fields.
    """
    # maxsplit=2 keeps everything after the second space as the URL, so a URL
    # that itself contains a space no longer breaks the three-way unpacking.
    # The id is bound to page_id rather than id to avoid shadowing the builtin.
    _tag, page_id, url = line.split(' ', 2)
    return (page_id, url)

In [85]:
# Split the raw lines by record type. str.startswith is used instead of
# indexing line[0] so that an empty line is simply filtered out rather than
# raising IndexError inside the Spark task.
# Node records (first character "n") become (id, url) pairs.
web_page = lines.filter(lambda line: line.startswith("n")).map(process_web_page)
# Edge records (first character "e") become (src, dest, 1) triples.
web_page_graph = lines.filter(lambda line: line.startswith("e")).map(add_1)


In [86]:
# Number of node records. count() returns only the total to the driver,
# instead of collecting every record just to measure the list length.
web_page.count()

9664

In [87]:
# Number of edge records. count() returns only the total to the driver,
# instead of collecting every record just to measure the list length.
web_page_graph.count()

16150

# Reduce Step

## Reduce by degree

### Out-degree

In [147]:
from operator import add

# Out-degree of every page as (page_id, out_degree), largest first.
out_degree_map = (
    web_page_graph
    .map(lambda edge: (edge[0], edge[2]))            # (src, 1) for each edge
    .union(web_page.map(lambda page: (page[0], 0)))  # add a 0 for every listed page so pages with no outgoing edge are kept
    .reduceByKey(add)                                # sum the counts per page id
    .sortBy(lambda pair: pair[1], ascending=False)
    .cache()  # several later cells run actions on this RDD; keep the result instead of recomputing the shuffle and sort each time
)

#### Web pages with largest out-degree

##### Get top 5

In [156]:
# out_degree_map is already sorted by out-degree in descending order, so its
# first five elements are the top five. take(5) fetches only those elements
# instead of collecting the whole RDD to the driver and slicing it.
top_five_out_largest = out_degree_map.take(5)
top_five_out_largest

[('235', 164), ('1437', 46), ('1627', 45), ('257', 43), ('1235', 43)]

In [161]:
# Print "<id> <url> <out-degree>" for each of the top five pages.
# The loop variables avoid the name `id`: at notebook top level that would
# overwrite the builtin id() for every cell executed afterwards.
for page_id, out_degree in top_five_out_largest:
    matching_urls = web_page.filter(lambda page: page[0] == page_id)
    for matched_id, url in matching_urls.collect():
        print(matched_id, url, out_degree)


235 http://www.water.ca.gov/www.gov.sites.html 164
1437 http://www.igs.berkeley.edu:8880/library/cgpp.html 46
1627 http://www.ca.gov/s/search/servers.html 45
257 http://www.seismo.berkeley.edu/seismo/Homepage.html 43
1235 http://california.findlaw.com/CA10_california_governemnt/index.html 43


##### Get largest

In [172]:
# (page_id, out_degree) entry with the highest out-degree.
largest = out_degree_map.max(lambda pair: pair[1])
# Look up the URL(s) recorded for that page id.
matching_urls = web_page.filter(lambda page: page[0] == largest[0])
# The loop variable is page_id rather than id so the builtin id() is not
# overwritten in the notebook's global namespace.
for page_id, url in matching_urls.collect():
    print(page_id, url, largest[1])

235 http://www.water.ca.gov/www.gov.sites.html 164


#### Number of webpages whose out-degree is 0

Count the pages whose total out-degree is 0, i.e. pages that never appear as the source of an edge.

In [129]:
# Pages whose summed out-degree is exactly 0.
pages_without_out_links = out_degree_map.filter(lambda pair: pair[1] == 0)
pages_without_out_links.count()

4637

#### Average out-degree

In [138]:
# Mean out-degree over every page, including those with out-degree 0.
out_degrees = out_degree_map.map(lambda pair: pair[1])
avg = out_degrees.mean()
print(f"Average out-degree (includes those with no out degree) = {avg}")

Average out-degree (includes those with no out degree) = 1.671150662251659


In [139]:
# Mean out-degree over pages that appear as the source of at least one edge.
# No zero entries are unioned in here, so pages without outgoing edges are
# left out of the average.
out_degree_per_source = (
    web_page_graph
    .map(lambda edge: (edge[0], edge[2]))
    .reduceByKey(add)
)
avg = out_degree_per_source.map(lambda pair: pair[1]).mean()
print(f"Average out-degree (does not includes those with no out degree) = {avg}")

Average out-degree (does not includes those with no out degree) = 3.2126516809230137


### In-degree

In [153]:
# In-degree of every page as (page_id, in_degree), largest first.
in_degree_map = (
    web_page_graph
    .map(lambda edge: (edge[1], edge[2]))            # (dest, 1) for each edge
    .union(web_page.map(lambda page: (page[0], 0)))  # add a 0 for every listed page so pages with no incoming edge are kept
    .reduceByKey(add)                                # sum the counts per page id
    .sortBy(lambda pair: pair[1], ascending=False)
    .cache()  # several later cells run actions on this RDD; keep the result instead of recomputing the shuffle and sort each time
)

#### Average in-degree

In [137]:
# Mean in-degree over every page, including those with in-degree 0.
in_degrees = in_degree_map.map(lambda pair: pair[1])
avg = in_degrees.mean()
print(f"Average in-degree = {avg}")

Average in-degree = 1.6711506622516576


In [140]:
# Mean in-degree over pages that appear as the destination of at least one
# edge. No zero entries are unioned in here, so pages without incoming edges
# are left out of the average.
avg = web_page_graph \
    .map(lambda x: (x[1], x[2])) \
    .reduceByKey(add) \
    .map(lambda x: x[1]).mean()
# The label names in-degree: the excluded pages are those with no incoming
# edges, not those with no outgoing edges.
print(f"Average in-degree (does not includes those with no in degree) = {avg}")

Average in-degree (does not includes those with no out degree) = 7.6941400666984245


#### Web pages with largest in-degree

##### Top 5

In [170]:
# in_degree_map is already sorted by in-degree in descending order, so its
# first five elements are the top five. take(5) fetches only those elements
# instead of collecting the whole RDD to the driver and slicing it.
top_five_in_largest = in_degree_map.take(5)
top_five_in_largest

[('1806', 199), ('1079', 169), ('9', 155), ('2078', 134), ('0', 126)]

In [160]:
# Print "<id> <url> <in-degree>" for each of the top five pages.
# The loop variables avoid the name `id`: at notebook top level that would
# overwrite the builtin id() for every cell executed afterwards.
for page_id, in_degree in top_five_in_largest:
    matching_urls = web_page.filter(lambda page: page[0] == page_id)
    for matched_id, url in matching_urls.collect():
        print(matched_id, url, in_degree)


1806 http://www.yahoo.com/ 199
1079 http://www.ca.gov/ 169
9 http://www.leginfo.ca.gov/calaw.html 155
2078 http://www.linkexchange.com/ 134
0 http://www.berkeley.edu/ 126


##### Largest

In [171]:
# (page_id, in_degree) entry with the highest in-degree.
largest = in_degree_map.max(lambda pair: pair[1])
# Look up the URL(s) recorded for that page id.
matching_urls = web_page.filter(lambda page: page[0] == largest[0])
# The loop variable is page_id rather than id so the builtin id() is not
# overwritten in the notebook's global namespace.
for page_id, url in matching_urls.collect():
    print(page_id, url, largest[1])

1806 http://www.yahoo.com/ 199
