In [None]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 KB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=176f03ed88921066c53261617e83058a7d90c91508068d91d4998a6aa03033bd
 

In [None]:
import numpy as np
from scipy.sparse.linalg import svds
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import functions
from pyspark.sql.functions import col, monotonically_increasing_id

In [None]:
# Number of rows kept from each CSV so the notebook stays quick to run.
MAX_ROWS = 10000
# Upper bound on distinct pivot values; raised above Spark's default (10000)
# for the user x book pivot built later in the notebook.
PIVOT_MAX_VALUES = 100000

spark = SparkSession.builder.appName("sistema_recomendacion").getOrCreate()
spark.conf.set("spark.sql.pivotMaxValues", PIVOT_MAX_VALUES)


def _read_csv_head(path, max_rows=MAX_ROWS):
    """Read a CSV with a header row and inferred schema, keeping at most `max_rows` rows.

    Args:
        path: Location of the CSV file, as accepted by `spark.read.csv`.
        max_rows: Maximum number of rows to keep from the file.

    Returns:
        A Spark DataFrame limited to `max_rows` rows.
    """
    return spark.read.csv(path, header=True, inferSchema=True).limit(max_rows)


books = _read_csv_head("Books.csv")
ratings = _read_csv_head("Ratings.csv")
users = _read_csv_head("Users.csv")

In [None]:
# Preview the first rows of the books table.
books.show(n=5)

+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari

In [None]:
# Preview the first rows of the users table.
users.show(n=5)

+-------+--------------------+----+
|user_id|            Location| Age|
+-------+--------------------+----+
|      1|  nyc, new york, usa|null|
|      2|stockton, califor...|  18|
|      3|moscow, yukon ter...|null|
|      4|porto, v.n.gaia, ...|  17|
|      5|farnborough, hant...|null|
+-------+--------------------+----+
only showing top 5 rows



In [None]:
# Inner-join users with ratings on user_id so every rating row carries the
# attributes of the user who gave it.
usuarios_califs = users.join(ratings, on='user_id', how='inner')

In [None]:
# Preview the joined user/rating rows.
usuarios_califs.show(n=5)

+-------+--------------------+----+----------+-----------+
|user_id|            Location| Age|      ISBN|Book-Rating|
+-------+--------------------+----+----------+-----------+
|      2|stockton, califor...|  18| 195153448|          0|
|      7| washington, dc, usa|null|  34542252|          0|
|      8|timmins, ontario,...|null|1881320189|          7|
|      8|timmins, ontario,...|null|1575663937|          6|
|      8|timmins, ontario,...|null|1567407781|          6|
+-------+--------------------+----+----------+-----------+
only showing top 5 rows



In [None]:
# Join on ISBN to obtain the books reviewed by each user.
# The ratings side shows ISBNs with their leading zeros stripped (e.g.
# 195153448 where the books table has 0195153448), so those ratings would
# never match a book and would be silently dropped by the inner join.
# Restore the 10-character form first; values that already have 10 or more
# characters are left untouched because lpad would otherwise truncate them.
isbn_text = col('ISBN').cast('string')
isbn_padded = functions.when(
    functions.length(isbn_text) < 10, functions.lpad(isbn_text, 10, '0')
).otherwise(isbn_text)
book_califs = usuarios_califs.withColumn('ISBN', isbn_padded).join(books, ['ISBN'])

In [None]:
# Preview the rated books together with their metadata.
book_califs.show(n=5)

+----------+-------+--------------------+----+-----------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      ISBN|user_id|            Location| Age|Book-Rating|          Book-Title|       Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+-------+--------------------+----+-----------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|1881320189|      8|timmins, ontario,...|null|          7|Goodbye to the Bu...|      Julia Oliver|               1994|      River City Pub|http://images.ama...|http://images.ama...|http://images.ama...|
|1575663937|      8|timmins, ontario,...|null|          6|More Cunning Than...|Robert Hendrickson|               1999|Kensington Publis...|http://images.ama...|http://images.ama...|http://

In [None]:
# Build a lookup table that assigns a numeric surrogate key to every distinct
# ISBN, then attach that key to each rating row.
lb = (
    book_califs
    .select('ISBN')
    .distinct()
    .withColumn('unique_id_book', monotonically_increasing_id())
)
book_user_rating = book_califs.join(lb, how='left', on='ISBN')

In [None]:
# Preview the rating rows with their numeric book id attached.
book_user_rating.show(n=5)

+----------+-------+--------------------+----+-----------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|      ISBN|user_id|            Location| Age|Book-Rating|          Book-Title|       Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|unique_id_book|
+----------+-------+--------------------+----+-----------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------+
|080652121X|      8|timmins, ontario,...|null|          0|Hitler's Secret B...|        Adam Lebor|               2000|       Citadel Press|http://images.ama...|http://images.ama...|http://images.ama...|             1|
|1552041778|      8|timmins, ontario,...|null|          5|            Jane Doe|      R. J. Kaiser|               1999|          

In [None]:
from pyspark.sql.functions import coalesce

# Reshape the long rating table into a wide one: one row per user, one column
# per book id, each cell holding the first rating found for that pair.
ratings_by_user = book_user_rating.groupBy('user_id')
mat = ratings_by_user.pivot('unique_id_book').agg(functions.first('Book-Rating'))

In [None]:
# Books a user never rated come out of the pivot as null; treat them as 0.
mat = mat.fillna(0)

In [None]:
# Bring the pivoted table to the driver and drop its first column (user_id),
# leaving only the rating columns.
collected_rows = mat.collect()
fc = np.array(collected_rows)[:, 1:]

In [None]:
# The factorisation below works on floating-point data, so convert the
# rating matrix to float64.
fc = np.array(fc, dtype=np.float64)

In [None]:
# Number of latent factors requested from the truncated SVD.
N_LATENT_FACTORS = 10

# svds (default ARPACK solver) requires 1 <= k < min(fc.shape). The rating
# matrix can be small after the joins, so clamp k instead of letting a
# hard-coded value raise ValueError on a slightly different data sample.
n_factors = min(N_LATENT_FACTORS, min(fc.shape) - 1)
U, sigma, vt = svds(fc, k=n_factors)

In [None]:
# Shapes of the three factors returned by svds.
tuple(factor.shape for factor in (U, sigma, vt))

((25, 10), (10,), (10, 65))

In [None]:
# Rebuild the low-rank approximation of the rating matrix from its factors:
# U . diag(sigma) . Vt (evaluated left to right).
all_user_predicted_ratings = U @ np.diag(sigma) @ vt
all_user_predicted_ratings

array([[ 1.69354293e-16, -2.22625978e-34,  1.69354293e-16, ...,
        -4.61848864e-16, -5.38823675e-16,  0.00000000e+00],
       [ 2.05357749e-16,  1.34903873e-34,  2.05357749e-16, ...,
        -6.62777253e-17, -7.73240129e-17,  0.00000000e+00],
       [ 5.89837486e-16,  6.56705102e-34,  5.89837486e-16, ...,
         1.37979332e-16,  1.60975887e-16,  0.00000000e+00],
       ...,
       [ 4.08350162e-16, -2.03870525e-31,  4.08350162e-16, ...,
         2.39541604e-15,  2.79465204e-15,  0.00000000e+00],
       [-2.76021750e-18, -1.11947141e-35, -2.80781074e-18, ...,
        -6.41891494e-18, -7.48873410e-18,  0.00000000e+00],
       [ 8.67361738e-17,  6.28634843e-35,  8.67361738e-17, ...,
        -2.08166817e-17, -2.42861287e-17,  0.00000000e+00]])