In [None]:
from graphframes import *

In [None]:
import requests
# Download the file from Google Drive
def download_file(file_id, dest="./file.txt", timeout=60):
    """Download a Google Drive file and save its bytes to local disk.

    Args:
        file_id: Google Drive file identifier; it is interpolated into the
            ``https://drive.google.com/uc?id=...`` download URL.
        dest: Local path the response body is written to. Defaults to
            ``./file.txt``, the location the rest of the notebook reads from.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"https://drive.google.com/uc?id={file_id}"
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently saving an HTTP error page as the dataset.
    response.raise_for_status()
    with open(dest, "wb") as f:
        f.write(response.content)

# Fetch the input file by its Google Drive id; download_file saves it as ./file.txt.
download_file("1Dx_wKGvcdlKvfEaM2Z_6C0gJkTn181sK")

In [None]:
# Read the downloaded file through the SparkContext; each RDD element is one text line.
rdd = sc.textFile("./file.txt")

In [None]:
def _parse_edge(line):
    """Turn one tab-separated line into a (src, dst) pair of string ids."""
    # Split once and reuse the result instead of splitting the line per field.
    fields = line.split("\t")
    return (fields[0], fields[1])


# Build the edge list. Blank lines and '#' header/comment lines are dropped
# first: they carry no edge and would otherwise raise IndexError (or produce a
# bogus edge) when the job actually runs.
edges_rdd = (
    rdd
    .filter(lambda line: bool(line.strip()) and not line.startswith("#"))
    .map(_parse_edge)
)

In [None]:
# Every endpoint of every edge becomes a one-element tuple (a single-column
# row); distinct() then keeps each vertex id once.
vertices_rdd = (
    edges_rdd
    .flatMap(lambda edge: [(edge[0],), (edge[1],)])
    .distinct()
)

In [None]:
# Edge DataFrame: one row per edge with columns "src" and "dst".
e = spark.createDataFrame(edges_rdd, schema=["src", "dst"])
# Vertex DataFrame: one row per distinct vertex with the single column "id".
v = spark.createDataFrame(vertices_rdd, schema=["id"])

In [None]:
# Build the graph from the vertex DataFrame `v` (column "id") and the edge
# DataFrame `e` (columns "src", "dst").
g = GraphFrame(v, e)



In [None]:
# Out-degree per vertex, as a DataFrame with columns "id" and "outDegree".
outdegrees = g.outDegrees



In [None]:
# Preview the out-degree table; show() prints only the first 20 rows.
outdegrees.show()

+----+---------+
|  id|outDegree|
+----+---------+
| 296|       10|
| 467|       12|
| 675|        1|
| 691|       24|
|1159|        2|
|1090|        2|
|1436|        1|
|1512|        1|
|1572|        2|
|2069|       17|
|2088|        6|
|2136|        1|
|2294|        9|
| 125|       29|
| 451|        1|
| 800|        2|
| 853|       16|
| 944|        5|
|1394|        9|
|2110|        3|
+----+---------+
only showing top 20 rows



In [None]:
# Five vertices with the largest out-degree. The column is referenced with the
# exact casing of the schema ("outDegree") so the lookup also resolves when
# spark.sql.caseSensitive is enabled.
nodes_with_highest_outdegree = outdegrees.orderBy(outdegrees["outDegree"].desc()).limit(5)

In [None]:
# Print the five highest out-degree vertices.
nodes_with_highest_outdegree.show()

+----+---------+
|  id|outDegree|
+----+---------+
|2565|      893|
| 766|      773|
|  11|      743|
| 457|      732|
|2688|      618|
+----+---------+



In [None]:
# In-degree per vertex, as a DataFrame with columns "id" and "inDegree".
indegrees = g.inDegrees

In [None]:
# Five vertices with the largest in-degree. The column is referenced with the
# exact casing of the schema ("inDegree") so the lookup also resolves when
# spark.sql.caseSensitive is enabled.
nodes_with_highest_indegrees = indegrees.orderBy(indegrees["inDegree"].desc()).limit(5)

In [None]:
# Print the five highest in-degree vertices.
nodes_with_highest_indegrees.show()

+----+--------+
|  id|inDegree|
+----+--------+
|4037|     457|
|  15|     361|
|2398|     340|
|2625|     331|
|1297|     309|
+----+--------+



In [None]:
# Run PageRank on the graph with reset probability 0.15 and tol=0.01.
# NOTE(review): tol is presumably the convergence tolerance, so the run stops
# once scores change by less than this — confirm against the GraphFrames docs.
pagerank_results = g.pageRank(resetProbability=0.15, tol=0.01)

In [None]:
# Vertex DataFrame of the PageRank result; it has the columns "id" and "pagerank".
pagerank_values = pagerank_results.vertices

In [None]:
# Rank vertices by PageRank score, highest first, and keep the top five.
top_5_pagerank_nodes = (
    pagerank_values
    .orderBy(pagerank_values["pagerank"].desc())
    .limit(5)
)

In [None]:
# Print the five vertices with the highest PageRank score.
top_5_pagerank_nodes.show()

+----+------------------+
|  id|          pagerank|
+----+------------------+
|4037| 32.84294411698262|
|  15|  26.1242509353023|
|6634|25.324758401496755|
|2625|23.143419134686606|
|2398|18.458414261565846|
+----+------------------+



In [None]:
# Directory handed to sc.setCheckpointDir for Spark checkpoint data.
checkpoint_dir = "/tmp/checkpoint"

In [None]:
# Register the checkpoint directory on the SparkContext.
# NOTE(review): this is set right before connectedComponents(), which
# presumably requires a checkpoint directory — confirm against GraphFrames docs.
sc.setCheckpointDir(checkpoint_dir)

In [None]:
# Label every vertex with the id of its connected component; the result is a
# DataFrame with a "component" column.
connected_components = g.connectedComponents()

In [None]:
# Number of vertices in each connected component (columns "component", "count").
component_sizes = (
    connected_components
    .groupBy("component")
    .count()
)

In [None]:
# Sort components from largest to smallest and keep the five biggest.
top_5_components = (
    component_sizes
    .orderBy(component_sizes["count"].desc())
    .limit(5)
)

In [None]:
# Print the five largest connected components with their vertex counts.
top_5_components.show()

+------------+-----+
|   component|count|
+------------+-----+
|           0| 7066|
|532575944741|    3|
|592705486870|    3|
|936302870556|    3|
|369367187471|    2|
+------------+-----+



In [None]:
# Triangle count per vertex, as a DataFrame with columns "count" and "id".
triangle_counts = g.triangleCount()

In [None]:
# Preview the triangle-count table; show() prints only the first 20 rows.
triangle_counts.show()

+-----+----+
|count|  id|
+-----+----+
|  280|   3|
|   57|  30|
| 2309|  28|
| 1899|  35|
|   43| 300|
|  420|  75|
| 3143|   6|
|   50| 604|
|  661|  23|
|  948|  55|
|   26| 349|
|  748|  25|
| 1879| 271|
|  471| 178|
|   68| 567|
|  345| 152|
|  955| 371|
|  368|  39|
|    4|8283|
|  805| 182|
+-----+----+
only showing top 20 rows



In [None]:
# Sort vertices by triangle count, highest first, and keep the top five.
top_5_vertices = (
    triangle_counts
    .orderBy(triangle_counts["count"].desc())
    .limit(5)
)


In [None]:
# Print the five vertices with the highest triangle counts.
top_5_vertices.show()

+-----+----+
|count|  id|
+-----+----+
|30940|2565|
|22003|1549|
|18204| 766|
|17361|1166|
|14220|2688|
+-----+----+



In [None]:
# Destination paths for the five result sets. They differ only in the trailing
# index (res1 .. res5), so they are generated instead of being listed by hand.
output_file_paths = [
    f"dbfs:/FileStore/shared_uploads/otaruntejaa@gmail.com/Assignment3Part2res{index}.csv"
    for index in range(1, 6)
]

In [None]:
# Persist each result set as CSV with a header row. mode="overwrite" makes the
# cell re-runnable: without it the write fails as soon as the target path
# exists from a previous run.
nodes_with_highest_outdegree.write.csv(output_file_paths[0], header=True, mode="overwrite")
nodes_with_highest_indegrees.write.csv(output_file_paths[1], header=True, mode="overwrite")
top_5_pagerank_nodes.write.csv(output_file_paths[2], header=True, mode="overwrite")
top_5_components.write.csv(output_file_paths[3], header=True, mode="overwrite")
top_5_vertices.write.csv(output_file_paths[4], header=True, mode="overwrite")