Este notebook muestra ejemplos de análisis de grafos utilizando GraphFrames y el dataset MovieLens.


In [0]:
%pip install graphframes

In [0]:
%restart_python

In [0]:
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains, lit
from graphframes import GraphFrame

In [0]:
# Obtain the Spark session used by every cell below, requesting the GraphFrames
# JAR through the `spark.jars.packages` setting.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists — confirm the package setting actually takes effect on this runtime.
spark = (
    SparkSession.builder
    .appName("GraphFrames")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12")
    .getOrCreate()
)

In [0]:
# Ratings table.
# Explicit schema: userId is read as an integer, movieId as a string, rating as
# a DECIMAL(2,1) and timestamp as a long.
ratings_schema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", StringType(), True)
    .add("rating", DecimalType(precision=2, scale=1), True)
    .add("timestamp", LongType(), True)
)

# Read the CSV (first line is a header) with the schema above, then replace the
# raw `timestamp` column with a `date` string formatted as yyyyMMdd.
# NOTE(review): from_unixtime interprets its input as seconds since the epoch —
# confirm the file's timestamps are in seconds.
ratingsDf = (
    spark.read
    .option("header", True)
    .option("dateFormat", "yyyyMMdd")
    .schema(ratings_schema)
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/ratings_full.csv")
    .withColumn("date", from_unixtime("timestamp", "yyyyMMdd"))
    .drop('timestamp')
)

# Movies table.
# All three columns are read as plain strings; `genres` and `title` are parsed
# further in the transformation step that follows.
movies_schema = (
    StructType()
    .add("movieId", StringType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)

# Read the CSV (first line is a header) using the schema above.
moviesDf = (
    spark.read
    .option("header", True)
    .schema(movies_schema)
    .csv("/Volumes/big_data_ii_2025/spark_examples/spark_data/movies.csv")
)

# Reshape the movies table:
#   * genres: the pipe-separated string becomes an array of genre names;
#   * year:   the digits inside the trailing "(...)" of the raw title, cast to
#             an integer with try_cast;
#   * title:  the raw title without its trailing " (digits)" suffix.
#
# The regex patterns are raw strings: "\|" and "\(" are not valid Python escape
# sequences, so writing them in ordinary string literals triggers a
# SyntaxWarning on recent Python versions. The pattern text sent to Spark is
# unchanged.
#
# NOTE(review): regexp_extract yields an empty string when the pattern does not
# match, so a title that does not end in "(digits)" ends up with an empty
# `title` — confirm that is acceptable for this dataset.
moviesDf = (
    moviesDf
    .withColumn("genresSplit", split(moviesDf["genres"], r"\|"))
    .drop('genres')
    .withColumnRenamed("genresSplit", "genres")
    .withColumn(
        "year",
        regexp_extract(moviesDf["title"], r"^.+\(([0-9]+)\)$", 1)
        .try_cast(IntegerType()),
    )
    .withColumn(
        "title_temp",
        # Non-greedy group so the title stops right before " (digits)".
        regexp_extract(moviesDf["title"], r"^(.+?) \([0-9]+\)$", 1),
    )
    .drop('title')
    .withColumnRenamed("title_temp", "title")
)

In [0]:
users = ratingsDf.select(
    col("userId").alias("id")
).distinct().withColumn("type", lit("user"))

movie_vertices = moviesDf.select(
    col("movieId").alias("id"),
    lit("movie").alias("type")
)

In [0]:
vertices = users.union(movie_vertices)

In [0]:
#usuario -> película con peso rating
edges = ratingsDf.select(
    col("userId").alias("src"),
    col("movieId").alias("dst"),
    col("rating").alias("weight")
)

In [0]:
# Crear GraphFrame
g = GraphFrame(vertices, edges)

In [0]:
# Run PageRank on the graph (10 iterations, reset probability 0.15).
pr_results = g.pageRank(maxIter=10, resetProbability=0.15)

# Keep only the movie vertices, attach their titles by joining on the vertex
# id, and keep the ten rows with the highest pagerank.
pagerank_movies = pr_results.vertices.where(col("type") == "movie")
top10 = (
    pagerank_movies
    .join(moviesDf.withColumnRenamed("movieId", "id"), on="id")
    .select("title", "pagerank")
    .orderBy(desc("pagerank"))
    .limit(10)
)

top10.show(truncate=False)

In [0]:
# Compute the per-vertex triangle count.
# NOTE(review): the edges built in this notebook only go from a user to a
# movie; if no other edges exist the graph may contain no triangles at all —
# confirm this metric is meaningful here.
tc_results = g.triangleCount()

# Show the ten user vertices with the highest triangle count.
user_triangles = (
    tc_results
    .where(col("type") == "user")
    .orderBy(desc("count"))
    .limit(10)
)
user_triangles.show()

In [0]:
# Shortest paths relative to an example landmark vertex with id "1".
# NOTE(review): the original comment described these as paths *from* a user;
# GraphFrames documents shortestPaths as computing, for every vertex, the
# distance *to* each landmark. Confirm that "1" is the intended vertex id
# given how the vertex ids of this graph are built.
landmarks = ["1"]
sp_results = g.shortestPaths(landmarks)

# Show the first 10 rows of the per-vertex distance maps, untruncated.
sp_results.select(col("id"), col("distances")).show(n=10, truncate=False)