In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
import pyspark
from pyspark.sql import SparkSession

In [14]:
spark = SparkSession.builder.appName('Movie Recommendation').getOrCreate()

from pyspark.mllib.recommendation import Rating
from pyspark.mllib.recommendation import ALS

raw = spark.sparkContext.textFile("/content/drive/MyDrive/B Data/Minggu14/ratings.dat")

mydata = [(2, 0.01)]

mydatardd = spark.sparkContext.parallelize(mydata).map(lambda x: Rating(0, x[0], x[1]))

def parseRating(str):
    fields = str.split("::")
    assert(len(fields) == 4)
    return Rating(int(fields[0]), int(fields[1]), float(fields[2]))

ratings = raw.map(parseRating)
totalratings = ratings.union(mydatardd)
model = ALS.train(totalratings, rank=8, iterations=5, lambda_=1.0)
products = model.recommendProducts(1, 10)

products

[Rating(user=1, product=3382, rating=4.4600147771921215),
 Rating(user=1, product=989, rating=4.017066506042651),
 Rating(user=1, product=557, rating=3.9018863893516014),
 Rating(user=1, product=787, rating=3.8958963808781406),
 Rating(user=1, product=1830, rating=3.8944250945887506),
 Rating(user=1, product=3233, rating=3.8942927733712454),
 Rating(user=1, product=3607, rating=3.8424242087724636),
 Rating(user=1, product=3172, rating=3.820022091036847),
 Rating(user=1, product=2503, rating=3.7849626614846987),
 Rating(user=1, product=3656, rating=3.7849422122473477)]

In [15]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

vectorRDD = totalratings.map(lambda rating: Vectors.dense(rating.user, rating.product, rating.rating))
summary = Statistics.colStats(vectorRDD)

print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

[3024.50932404 1865.53803501    3.58156088]
[2.98741660e+06 1.20130746e+06 1.24792804e+00]
[1000209. 1000210. 1000210.]


In [16]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors

data = spark.read.text("/content/drive/MyDrive/B Data/Minggu14/kmeans_data.txt")
parsedData = data.rdd.map(lambda s: Vectors.dense([float(x) for x in s.value.split(' ')]))

# Convert RDD to DataFrame
parsedData = parsedData.map(lambda x: (x, )).toDF(["features"])

# Cluster the data into two classes using KMeans
numClusters = 2
numIterations = 20
kmeans = KMeans().setK(numClusters).setMaxIter(numIterations)
model = kmeans.fit(parsedData)

# Predict the cluster for each data point
predictions = model.transform(parsedData)

# Compute Within Set Sum of Squared Errors
wssse = model.summary.trainingCost
print("Within Set Sum of Squared Errors = " + str(wssse))

# Print the predicted cluster for each data point
predictions.select("prediction").show()

Within Set Sum of Squared Errors = 0.1199999999999996
+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         0|
|         0|
|         0|
+----------+



In [18]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt


# Load and parse the data
data = spark.sparkContext.textFile("/content/drive/MyDrive/B Data/Minggu14/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, k=2, maxIterations=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x ** 2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)

print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(spark.sparkContext, "zadah_prak")
sameModel = KMeansModel.load(spark.sparkContext, "zadah_prak")

Within Set Sum of Squared Error = 0.6928203230275529
