In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pandas as pd
from pyspark.sql import SQLContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

## Classification

### Decision trees
Cũng giống như các thuật toán phân loại Spark MLlib yêu cầu RDD của Đối tượng LabeledPoint, thuật toán Spark ML yêu cầu DataFrame of Row các đối tượng, bao gồm cả nhãn và tính năng. 

Cột nhãn chỉ định phân loại cho quan sát và cột tính năng chứa một SparseVector hoặc một đối tượng DenseVector. Một DenseVector được sử dụng khi mỗi quan sát chứa các tính năng giống nhau, trong khi một SparseVector được sử dụng khi các tính năng có thể thay đổi theo từng trường hợp — nghĩa là, một số tính năng có thể null hoặc không được điền cho một số trường hợp nhất định.

Ưu điểm chính của SparseVector là nó chỉ lưu trữ các tính năng có giá trị, yêu cầu ít dung lượng hơn trong tập dữ liệu có chứa giá trị rỗng.

In [2]:
from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import Row # Prepare DataFrame of labeled observations
outlook = {"sunny": 0.0, "overcast": 1.0, "rainy": 2.0}
observations = [
Row(label=0, features=DenseVector([outlook["sunny"],85,85,False])),
Row(label=0, features=DenseVector([outlook["sunny"],80,90,True])),
Row(label=1, features=DenseVector([outlook["overcast"],83,86,False])),
Row(label=1, features=DenseVector([outlook["rainy"],70,96,False])),
Row(label=1, features=DenseVector([outlook["rainy"],68,80,False])),
Row(label=0, features=DenseVector([outlook["rainy"],65,70,True])),
Row(label=1, features=DenseVector([outlook["overcast"],64,65,True])),
Row(label=0, features=DenseVector([outlook["sunny"],72,95,False])),
Row(label=1, features=DenseVector([outlook["sunny"],69,70,False])),
Row(label=1, features=DenseVector([outlook["sunny"],75,80,False])),
Row(label=1, features=DenseVector([outlook["sunny"],75,70,True])),
Row(label=1, features=DenseVector([outlook["overcast"],72,90,True])),
Row(label=1, features=DenseVector([outlook["overcast"],81,75,False])),
Row(label=0, features=DenseVector([outlook["rainy"],71,91,True]))
]
rdd = sc.parallelize(observations)
data = spark.createDataFrame(rdd) 

In [3]:
data.show()

+-----+-------------------+
|label|           features|
+-----+-------------------+
|    0|[0.0,85.0,85.0,0.0]|
|    0|[0.0,80.0,90.0,1.0]|
|    1|[1.0,83.0,86.0,0.0]|
|    1|[2.0,70.0,96.0,0.0]|
|    1|[2.0,68.0,80.0,0.0]|
|    0|[2.0,65.0,70.0,1.0]|
|    1|[1.0,64.0,65.0,1.0]|
|    0|[0.0,72.0,95.0,0.0]|
|    1|[0.0,69.0,70.0,0.0]|
|    1|[0.0,75.0,80.0,0.0]|
|    1|[0.0,75.0,70.0,1.0]|
|    1|[1.0,72.0,90.0,1.0]|
|    1|[1.0,81.0,75.0,0.0]|
|    0|[2.0,71.0,91.0,1.0]|
+-----+-------------------+



In [4]:
# Chia dữ liệu thành các tập training và thử ngiệm
(trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train decision tree model
dt = DecisionTreeClassifier()
model = dt.fit(trainingData)

In [5]:
trainingData.show()

+-----+-------------------+
|label|           features|
+-----+-------------------+
|    0|[0.0,85.0,85.0,0.0]|
|    1|[1.0,83.0,86.0,0.0]|
|    0|[2.0,65.0,70.0,1.0]|
|    1|[2.0,70.0,96.0,0.0]|
|    0|[0.0,72.0,95.0,0.0]|
|    1|[0.0,69.0,70.0,0.0]|
|    1|[1.0,64.0,65.0,1.0]|
|    1|[0.0,75.0,70.0,1.0]|
|    1|[0.0,75.0,80.0,0.0]|
|    1|[1.0,72.0,90.0,1.0]|
|    1|[1.0,81.0,75.0,0.0]|
+-----+-------------------+



In [6]:
testData.show()

+-----+-------------------+
|label|           features|
+-----+-------------------+
|    0|[0.0,80.0,90.0,1.0]|
|    1|[2.0,68.0,80.0,0.0]|
|    0|[2.0,71.0,91.0,1.0]|
+-----+-------------------+



In [7]:
model

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2a70b5c7b84a, depth=5, numNodes=11, numClasses=2, numFeatures=4

In [8]:
# Đưa ra dự đoán bằng cách sử dụng tập dữ liệu thử nghiệm
predictions = model.transform(testData)
predictions.show()

+-----+-------------------+-------------+-----------+----------+
|label|           features|rawPrediction|probability|prediction|
+-----+-------------------+-------------+-----------+----------+
|    0|[0.0,80.0,90.0,1.0]|    [0.0,4.0]|  [0.0,1.0]|       1.0|
|    1|[2.0,68.0,80.0,0.0]|    [0.0,2.0]|  [0.0,1.0]|       1.0|
|    0|[2.0,71.0,91.0,1.0]|    [0.0,2.0]|  [0.0,1.0]|       1.0|
+-----+-------------------+-------------+-----------+----------+



In [9]:
# Đánh giá độ chính xác của mô hình
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.666667 


In [13]:
sc.stop()

## Collaborative Filtering Using Spark ML

In [16]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row 
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [29]:
# load and prepare data, split data into training and test datasets
data = spark.sparkContext.textFile('data/movielens.dat')
ratings_rdd = data.map(lambda x: x.split('\t')) \
 .map(lambda x: Row(userId = int(x[0]), movieId=int(x[1]), rating=float(x[2]), timestamp=int(x[3])))

In [30]:
ratings_rdd.first()

Row(userId=196, movieId=242, rating=3.0, timestamp=881250949)

In [31]:
ratings = spark.createDataFrame(ratings_rdd)
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|   3.0|881250949|
|   186|    302|   3.0|891717742|
|    22|    377|   1.0|878887116|
|   244|     51|   2.0|880606923|
|   166|    346|   1.0|886397596|
|   298|    474|   4.0|884182806|
|   115|    265|   2.0|881171488|
|   253|    465|   5.0|891628467|
|   305|    451|   3.0|886324817|
|     6|     86|   3.0|883603013|
|    62|    257|   2.0|879372434|
|   286|   1014|   5.0|879781125|
|   200|    222|   5.0|876042340|
|   210|     40|   3.0|891035994|
|   224|     29|   3.0|888104457|
|   303|    785|   3.0|879485318|
|   122|    387|   5.0|879270459|
|   194|    274|   2.0|879539794|
|   291|   1042|   4.0|874834944|
|   234|   1184|   2.0|892079237|
+------+-------+------+---------+
only showing top 20 rows



In [36]:
# train model
(training, test) = ratings.randomSplit([0.7, 0.3]) 

[Row(userId=1, movieId=1, rating=5.0, timestamp=874965758),
 Row(userId=1, movieId=9, rating=5.0, timestamp=878543541),
 Row(userId=1, movieId=22, rating=4.0, timestamp=875072404),
 Row(userId=1, movieId=23, rating=4.0, timestamp=875072895),
 Row(userId=1, movieId=36, rating=2.0, timestamp=875073180)]

In [37]:
training.take(5)

[Row(userId=1, movieId=2, rating=3.0, timestamp=876893171),
 Row(userId=1, movieId=5, rating=3.0, timestamp=889751712),
 Row(userId=1, movieId=6, rating=5.0, timestamp=887431973),
 Row(userId=1, movieId=8, rating=1.0, timestamp=875072484),
 Row(userId=1, movieId=10, rating=3.0, timestamp=875693118)]

In [38]:
test.take(5)

[Row(userId=1, movieId=1, rating=5.0, timestamp=874965758),
 Row(userId=1, movieId=9, rating=5.0, timestamp=878543541),
 Row(userId=1, movieId=22, rating=4.0, timestamp=875072404),
 Row(userId=1, movieId=23, rating=4.0, timestamp=875072895),
 Row(userId=1, movieId=36, rating=2.0, timestamp=875073180)]

In [39]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training) # evaluate model
model

ALSModel: uid=ALS_f5be48135097, rank=10

In [40]:
predictions = model.transform(test)
predictions.take(5)

[Row(userId=251, movieId=148, rating=2.0, timestamp=886272547, prediction=3.593282461166382),
 Row(userId=633, movieId=148, rating=1.0, timestamp=875326138, prediction=3.942725419998169),
 Row(userId=406, movieId=148, rating=3.0, timestamp=879540276, prediction=2.1272995471954346),
 Row(userId=26, movieId=148, rating=3.0, timestamp=891377540, prediction=2.504910945892334),
 Row(userId=27, movieId=148, rating=3.0, timestamp=891543129, prediction=2.211848258972168)]

In [44]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

TypeError: float() argument must be a string or a number, not 'DataFrame'

In [42]:
# movie recommendations for each user
model.recommendForAllUsers(3).show(3)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[{1311, 10.595923...|
|   463|[{961, 6.297265},...|
|   833|[{1085, 5.700978}...|
+------+--------------------+
only showing top 3 rows



In [43]:
# user recommendations for each movie
model.recommendForAllItems(3).show(3)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[{153, 2.1682522}...|
|    471|[{88, 6.0313516},...|
|   1591|[{575, 11.41467},...|
+-------+--------------------+
only showing top 3 rows



In [1]:
sc.stop()

## Clustering Using Spark ML

### k-Means Clustering with Spark ML

In [2]:
from pyspark.ml.clustering import KMeans
from pyspark import SparkConf, SparkContext
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [5]:
# load data
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
# train a k-means model
kmeans = KMeans().setK(2).setSeed(1)
kmeans

KMeans_ca36e93c9116

In [8]:
model = kmeans.fit(dataset)
model

KMeansModel: uid=KMeans_ca36e93c9116, k=2, distanceMeasure=euclidean, numFeatures=3

In [9]:
# evaluate using Within Set Sum of Squared Errors
wssse = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(wssse))
# returns:
# Within Set Sum of Squared Errors = 0.11999999999994547
# show results

AttributeError: 'KMeansModel' object has no attribute 'computeCost'

In [10]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
 print(center)
# returns:
# [ 0.1 0.1 0.1]
# [ 9.1 9.1 9.1]

Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


## Spark ML Pipelines

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [12]:
# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
 (0, "a b c d e spark", 1.0),
 (1, "b d", 0.0),
 (2, "spark f g h", 1.0),
 (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])
training.show()

+---+----------------+-----+
| id|            text|label|
+---+----------------+-----+
|  0| a b c d e spark|  1.0|
|  1|             b d|  0.0|
|  2|     spark f g h|  1.0|
|  3|hadoop mapreduce|  0.0|
+---+----------------+-----+



In [13]:
# Configure an ML pipeline, which consists of 3 stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)

In [14]:
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
pipeline

Pipeline_c00632044f0c

In [15]:
# Fit the pipeline to training documents.
model = pipeline.fit(training) # Make predictions on test documents ...
model

PipelineModel_597fd6e56eaa