In [30]:
!pip install pyspark



In [31]:
!pip install -q findspark
import findspark
findspark.init()

In [32]:
!wget -q https://storage.googleapis.com/public_lddm_data/small_page_links.nt
!ls

sample_data	     small_page_links.nt.1  small_page_links.nt.3
small_page_links.nt  small_page_links.nt.2


In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Créez une session Spark
spark = SparkSession.builder.appName("PageRankExample").getOrCreate()


In [34]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
schema = StructType([
    StructField("source", StringType(), nullable=True),
    StructField("predicate", StringType(), nullable=True),
    StructField("target", StringType(), nullable=True)
])

In [35]:
# Chargez vos données web en tant que DataFrame
# Supposons que vous ayez un DataFrame avec deux colonnes : 'source' et 'target' représentant les liens entre les pages web
# Par exemple, vous pouvez le charger à partir d'un fichier CSV
data = spark.read.option("delimiter"," ").csv("small_page_links.nt", header=False, schema=schema)
data.show(5,truncate=200)

+-------------------------------------------------------+--------------------------------------+-----------------------------------------------------------+
|                                                 source|                             predicate|                                                     target|
+-------------------------------------------------------+--------------------------------------+-----------------------------------------------------------+
|       <http://dbpedia.org/resource/AfghanistanHistory>|<http://dbpedia.org/property/wikilink>|       <http://dbpedia.org/resource/History_of_Afghanistan>|
|     <http://dbpedia.org/resource/AfghanistanGeography>|<http://dbpedia.org/property/wikilink>|     <http://dbpedia.org/resource/Geography_of_Afghanistan>|
|      <http://dbpedia.org/resource/AccessibleComputing>|<http://dbpedia.org/property/wikilink>|       <http://dbpedia.org/resource/Computer_accessibility>|
|        <http://dbpedia.org/resource/AfghanistanPeople>|<

In [36]:
# Créez un DataFrame contenant le nombre de liens sortants pour chaque page
outdegrees = data.groupBy("source").count().withColumnRenamed("source", "page").withColumnRenamed("count", "outDegree")

damping_factor = 0.85
initial_pagerank = 1.0

# Créez un DataFrame contenant les valeurs de PageRank initiales
pagerank = outdegrees.withColumn("pagerank", col("outDegree") / initial_pagerank)

pagerank.show(5,truncate=100)


+-----------------------------------------------------+---------+--------+
|                                                 page|outDegree|pagerank|
+-----------------------------------------------------+---------+--------+
|         <http://dbpedia.org/resource/Actinopterygii>|      197|   197.0|
|<http://dbpedia.org/resource/AtlasShruggedCharacters>|        1|     1.0|
|             <http://dbpedia.org/resource/Allegiance>|       35|    35.0|
|                  <http://dbpedia.org/resource/AbboT>|        1|     1.0|
|        <http://dbpedia.org/resource/AcademicElitism>|        1|     1.0|
+-----------------------------------------------------+---------+--------+
only showing top 5 rows



In [37]:
import time
max_iterations = 10
debut = time.time()

# Effectuez des itérations pour calculer le PageRank

for iteration in range(max_iterations):
  # Rejoignez le DataFrame pagerank avec le DataFrame data pour calculer la contribution à partir des liens entrants
  contrib = data.join(pagerank, data.target == pagerank.page, "left").select("source", "pagerank")

  new_pagerank = contrib.groupBy("source").sum("pagerank").withColumnRenamed("source", "page").withColumnRenamed("sum(pagerank)", "pagerank")

  # Joignez le DataFrame "new_pagerank" avec le DataFrame "outdegrees" pour obtenir les "outDegree" appropriés
  pagerank = new_pagerank.join(outdegrees, new_pagerank.page == outdegrees.page, "left").select(new_pagerank.page, new_pagerank.pagerank, outdegrees.outDegree)

  # Appliquez la formule du PageRank
  pagerank = pagerank.withColumn("pagerank", (1 - damping_factor) + damping_factor * col("pagerank") / col("outDegree"))


# Affichez les résultats
pagerank.select("page", "pagerank").show()
fin = time.time()

#Affiche le lien avec le plus haut page rank
page_max_pagerank = pagerank.orderBy("pagerank", ascending=False).first()
print(f"Temps d'exécution : {fin-debut} secondes / {page_max_pagerank}")
# Arrêtez la session Spark
# spark.stop()

+--------------------+-------------------+
|                page|           pagerank|
+--------------------+-------------------+
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|  0.281117088459783|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...| 0.2798907386775123|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|0.15577653242948128|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|0.15589785108855858|
|<http://dbpedia.o...|0.15595562103330207|
|<http://dbpedia.o...|0.15521238381859923|
|<http://dbpedia.o...| 0.1512481644640235|
|<http://dbpedia.o...|0.15288306779854716|
|<http://dbpedia.o...|0.15363072921406706|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|0.15071076181142568|
|<http://dbpedia.o...|0.15159132688616334|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|               NULL|
+----------