In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

conf = SparkConf().setMaster("local").setAppName("MyApp")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

## Data type

### Local Vector

In [2]:
import numpy as np
from pyspark.mllib.linalg import Vectors

# Use a NumPy array as a dense vector.(recommend)
dv1 = np.array([1.0, 0.0, 3.0])
# Use a Python list as a dense vector.
dv2 = [1.0, 0.0, 3.0]

dv3 = Vectors.dense(1.0, 0.0, 3.0)

sv1 = Vectors.sparse(3, [(0,2), (1.0, 3.0)])

sv2 = Vectors.sparse(3, {0:1.0, 2:3.0})

sv3 = Vectors.sparse(3, [0,2], [1.0, 3.0])

### Labeled Point

In [3]:
from pyspark.mllib.regression import LabeledPoint

pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))

neg = LabeledPoint(0.0, Vectors.sparse(3, [0,2], [1.0, 3.0]))

### Local Matrix

In [4]:
from pyspark.ml.linalg import Matrices
# [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
# dense matrix,注意构造的方式是列优先的
dense_matrix = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])

# [[9.0, 0.0], [0.0, 8.0], [0.0, 6.0]]
# 参数依次是:行数，列数，列的第一个非0行+ ",n"，非0行标，非0数据
sparse_matrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9.0, 6.0, 8.0])

## 特征提取

### TF-IDF

In [5]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [6]:
# 生成测试文本
sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"), 
                                      (0, "I wish Java could use case classes"),
                                      (1, "Logistic regression models are neat")]).toDF('label', 'sentence')

In [7]:
# 使用Tokenizer把句子分解成单词
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
wordsData = tokenizer.transform(sentenceData)
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|    0|I heard about Spa...|[i, heard, about,...|
|    0|I wish Java could...|[i, wish, java, c...|
|    1|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [8]:
# 使用HashingTF把单词哈希成特征向量,设置哈希的桶数为2000(这里等于是可容纳的单词数)
hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=2000)
featurizedData = hashingTF.transform(wordsData)
featurizedData.select("words","rawFeatures").show(truncate=False)

+---------------------------------------------+---------------------------------------------------------------------+
|words                                        |rawFeatures                                                          |
+---------------------------------------------+---------------------------------------------------------------------+
|[i, heard, about, spark, and, i, love, spark]|(2000,[240,333,1105,1329,1357,1777],[1.0,1.0,2.0,2.0,1.0,1.0])       |
|[i, wish, java, could, use, case, classes]   |(2000,[213,342,489,495,1329,1809,1967],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|[logistic, regression, models, are, neat]    |(2000,[286,695,1138,1193,1604],[1.0,1.0,1.0,1.0,1.0])                |
+---------------------------------------------+---------------------------------------------------------------------+



In [9]:
# idf训练
idf = IDF(inputCol='rawFeatures', outputCol='features')
idfModel = idf.fit(featurizedData)

In [10]:
# 使用IDFModel的transform方法获取TF-ITF值
rescaledData = idfModel.transform(featurizedData)
rescaledData.select('features', 'label').show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                                                       |label|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(2000,[240,333,1105,1329,1357,1777],[0.6931471805599453,0.6931471805599453,1.3862943611198906,0.5753641449035617,0.6931471805599453,0.6931471805599453])                       |0    |
|(2000,[213,342,489,495,1329,1809,1967],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453])|0    |
|(2000,[286,695,1138,1193,1604],[0.6931471805599453,0.6931471805599453,0.6931471

## 特征转换

### StringIndexer

In [11]:
from pyspark.ml.feature import StringIndexer

In [12]:
# 生成测试数据
df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"])

In [13]:
# string2index　特征编号是按照特征出现的频率高低来排序的
indexer = StringIndexer(inputCol='category', outputCol='categoryIndex')
model = indexer.fit(df)

In [14]:
# transform
indexed = model.transform(df)
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



### IndexToString

In [15]:
from pyspark.ml.feature import IndexToString, StringIndexer

In [16]:
toString = IndexToString(inputCol='categoryIndex', outputCol='originalCategory')
indexString = toString.transform(indexed)
indexString.select('id', 'originalCategory').show()

+---+----------------+
| id|originalCategory|
+---+----------------+
|  0|               a|
|  1|               b|
|  2|               c|
|  3|               a|
|  4|               a|
|  5|               c|
+---+----------------+



### VectorIndexer

In [17]:
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

In [18]:
# 生成测试数据,注意df必须是key-value(tuple)类型
df = spark.createDataFrame([(Vectors.dense(-1.0, 1.0, 1.0),),\
                            (Vectors.dense(-1.0, 3.0, 1.0),),\
                            (Vectors.dense(0.0, 5.0, 1.0),)], ['features'])

In [19]:
# 这里的maxCategories是指的种类小于2的特征才被视为类别型特征,否则被视作连续型特征
indexer = VectorIndexer(inputCol='features', outputCol='indexed', maxCategories=2)

In [20]:
indexerModel = indexer.fit(df)

In [21]:
categoricalFeatures = indexerModel.categoryMaps.keys()
print('choose '+str(len(categoricalFeatures))+", features:"+str(categoricalFeatures))

choose 2, features:KeysView({0: {0.0: 0, -1.0: 1}, 2: {1.0: 0}})


In [22]:
indexed = indexerModel.transform(df)
indexed.show()

+--------------+-------------+
|      features|      indexed|
+--------------+-------------+
|[-1.0,1.0,1.0]|[1.0,1.0,0.0]|
|[-1.0,3.0,1.0]|[1.0,3.0,0.0]|
| [0.0,5.0,1.0]|[0.0,5.0,0.0]|
+--------------+-------------+



## 特征选择

In [23]:
# 一共有VectorSlicer, RFormula, ChiSqSelector三种特征选择方法,这里使用ChiSqSelector
from pyspark.ml.feature import ChiSqSelector, ChiSqSelectorModel
from pyspark.ml.linalg import Vectors

In [24]:
# 生成测试数据
df = spark.createDataFrame([
    (1, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1),
    (2, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0),
    (3, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0)
], ['id', 'features', 'label'])
df.show()

+---+------------------+-----+
| id|          features|label|
+---+------------------+-----+
|  1|[0.0,0.0,18.0,1.0]|    1|
|  2|[0.0,1.0,12.0,0.0]|    0|
|  3|[1.0,0.0,15.0,0.1]|    0|
+---+------------------+-----+



In [25]:
# 特征选择,通过numTopFeatures方法设置选择与标签关联性最强的n个特征
selector = ChiSqSelector(
    numTopFeatures= 1,
    featuresCol= 'features',
    labelCol= 'label',
    outputCol= 'selected-feature'
)

In [26]:
selector_model = selector.fit(df)
result = selector_model.transform(df)
result.show()

+---+------------------+-----+----------------+
| id|          features|label|selected-feature|
+---+------------------+-----+----------------+
|  1|[0.0,0.0,18.0,1.0]|    1|          [18.0]|
|  2|[0.0,1.0,12.0,0.0]|    0|          [12.0]|
|  3|[1.0,0.0,15.0,0.1]|    0|          [15.0]|
+---+------------------+-----+----------------+



## 分类算法

### 逻辑回归算法

In [27]:
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row, functions
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, HashingTF, Tokenizer
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, BinaryLogisticRegressionSummary

In [28]:
# 生成训练数据 Row(**_dict)--> Row(key1=value1, key2=value2)
def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]), float(x[3]))
    rel['label'] = str(x[4])
    return rel

data = spark.sparkContext.textFile('./iris.txt').\
    map(lambda line: line.split(',')).\
    map(lambda p: Row(**f(p))).\
    toDF()
data.show()

+-----------------+-----------+
|         features|      label|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
|[5.4,3.9,1.7,0.4]|Iris-setosa|
|[4.6,3.4,1.4,0.3]|Iris-setosa|
|[5.0,3.4,1.5,0.2]|Iris-setosa|
|[4.4,2.9,1.4,0.2]|Iris-setosa|
|[4.9,3.1,1.5,0.1]|Iris-setosa|
|[5.4,3.7,1.5,0.2]|Iris-setosa|
|[4.8,3.4,1.6,0.2]|Iris-setosa|
|[4.8,3.0,1.4,0.1]|Iris-setosa|
|[4.3,3.0,1.1,0.1]|Iris-setosa|
|[5.8,4.0,1.2,0.2]|Iris-setosa|
|[5.7,4.4,1.5,0.4]|Iris-setosa|
|[5.4,3.9,1.3,0.4]|Iris-setosa|
|[5.1,3.5,1.4,0.3]|Iris-setosa|
|[5.7,3.8,1.7,0.3]|Iris-setosa|
|[5.1,3.8,1.5,0.3]|Iris-setosa|
+-----------------+-----------+
only showing top 20 rows



In [30]:
# 对标签和特征增加索引
labelIndexer = StringIndexer().\
    setInputCol('label').\
    setOutputCol('indexedLabel').\
    fit(data)

featureIndexer = VectorIndexer().\
    setInputCol('features').\
    setOutputCol('indexedFeatures').\
    fit(data)

In [32]:
# 设置LogisticRegression
lr = LogisticRegression().\
    setLabelCol('indexedLabel').\
    setFeaturesCol('indexedFeatures').\
    setMaxIter(100).\
    setRegParam(0.3).\
    setElasticNetParam(0.8)

print('params:\n', lr.explainParams()) # 打印参数

params:
 aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.8)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: indexedFeatures)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: indexedLabel)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting unde

In [33]:
# 设置IndexToString
labelConverter = IndexToString().\
    setInputCol('prediction').\
    setOutputCol('predictedLabel').\
    setLabels(labelIndexer.labels)

In [35]:
# 构建pipeline
lrPipeline = Pipeline().\
    setStages([labelIndexer, featureIndexer, lr, labelConverter])

In [41]:
# 划分训练集和测试集进行模型训练
trainingData, testData = data.randomSplit([0.7, 0.3])
lrPipelineModel = lrPipeline.fit(trainingData)
lrPredictions = lrPipelineModel.transform(testData)

In [42]:
# 输出预测结果
preRe1 = lrPredictions.select(
    'predictedLabel',
    'label',
    'features',
    'probability').\
    collect()
for item in preRe1:
    print('label: ',str(item['label'])+',\n'\
          'features: ', str(item['features'])+',\n'\
          'prob: ', str(item['probability'])+',\n'\
          'predict_label: '+ str(item['predictedLabel']))

label:  Iris-setosa,
features:  [4.4,2.9,1.4,0.2],
prob:  [0.517512490453518,0.2797617539385136,0.20272575560796846],
predict_label: Iris-setosa
label:  Iris-setosa,
features:  [4.4,3.2,1.3,0.2],
prob:  [0.5239524330049404,0.2760276684175376,0.20001989857752178],
predict_label: Iris-setosa
label:  Iris-setosa,
features:  [4.6,3.2,1.4,0.2],
prob:  [0.517512490453518,0.2797617539385136,0.20272575560796846],
predict_label: Iris-setosa
label:  Iris-setosa,
features:  [4.7,3.2,1.6,0.2],
prob:  [0.5046172775706185,0.2872388125196186,0.20814390990976306],
predict_label: Iris-setosa
label:  Iris-setosa,
features:  [4.8,3.0,1.4,0.1],
prob:  [0.5268542955807591,0.27837294823879616,0.19477275618044484],
predict_label: Iris-setosa
label:  Iris-setosa,
features:  [4.8,3.4,1.9,0.2],
prob:  [0.4852683017985809,0.2984579700973703,0.21627372810404868],
predict_label: Iris-setosa
label:  Iris-setosa,
features:  [4.9,3.0,1.4,0.2],
prob:  [0.517512490453518,0.2797617539385136,0.20272575560796846],
predict

In [43]:
# 对模型进行评估
evaluator = MulticlassClassificationEvaluator().\
    setLabelCol('indexedLabel').\
    setPredictionCol('prediction')

lrAccuracy = evaluator.evaluate(lrPredictions)
lrAccuracy

0.7795481829095274

In [44]:
# 通过model获取模型
lrModel = lrPipelineModel.stages[2]

print('Coefficients:\n' + str(lrModel.coefficientMatrix)+
     "\nIntercept: " + str(lrModel.interceptVector)+
     "\n numClasses: " + str(lrModel.numClasses)+
     "\n numFeatures: " + str(lrModel.numFeatures))

Coefficients:
3 X 4 CSRMatrix
(0,2) -0.258
(0,3) -0.2287
(1,3) 0.3504
Intercept: [0.8121188673353097,-0.20997333175016653,-0.6021455355851429]
 numClasses: 3
 numFeatures: 4


### 决策树算法

In [45]:
from pyspark.ml.classification import DecisionTreeClassificationModel
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

In [46]:
# 生成训练数据 Row(**_dict)--> Row(key1=value1, key2=value2)
def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]), float(x[3]))
    rel['label'] = str(x[4])
    return rel

data = spark.sparkContext.textFile('./iris.txt').\
    map(lambda line: line.split(',')).\
    map(lambda p: Row(**f(p))).\
    toDF()
data.show()

+-----------------+-----------+
|         features|      label|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
|[5.4,3.9,1.7,0.4]|Iris-setosa|
|[4.6,3.4,1.4,0.3]|Iris-setosa|
|[5.0,3.4,1.5,0.2]|Iris-setosa|
|[4.4,2.9,1.4,0.2]|Iris-setosa|
|[4.9,3.1,1.5,0.1]|Iris-setosa|
|[5.4,3.7,1.5,0.2]|Iris-setosa|
|[4.8,3.4,1.6,0.2]|Iris-setosa|
|[4.8,3.0,1.4,0.1]|Iris-setosa|
|[4.3,3.0,1.1,0.1]|Iris-setosa|
|[5.8,4.0,1.2,0.2]|Iris-setosa|
|[5.7,4.4,1.5,0.4]|Iris-setosa|
|[5.4,3.9,1.3,0.4]|Iris-setosa|
|[5.1,3.5,1.4,0.3]|Iris-setosa|
|[5.7,3.8,1.7,0.3]|Iris-setosa|
|[5.1,3.8,1.5,0.3]|Iris-setosa|
+-----------------+-----------+
only showing top 20 rows



In [47]:
# 对标签和特征增加索引
labelIndexer = StringIndexer().\
    setInputCol('label').\
    setOutputCol('indexedLabel').\
    fit(data)

featureIndexer = VectorIndexer().\
    setInputCol('features').\
    setOutputCol('indexedFeatures').\
    fit(data)

In [48]:
# 设置IndexToString
labelConverter = IndexToString().\
    setInputCol('prediction').\
    setOutputCol('predictedLabel').\
    setLabels(labelIndexer.labels)

In [49]:
# 划分训练集和测试集
trainingData, testData = data.randomSplit([0.7, 0.3])

In [50]:
# 创建决策树模型
dtClassifier = DecisionTreeClassifier().\
    setLabelCol('indexedLabel').\
    setFeaturesCol('indexedFeatures')

In [51]:
# 构建pipeline
dtPipeline = Pipeline().\
    setStages([labelIndexer, featureIndexer, dtClassifier, labelConverter])

dtPipelineModel = dtPipeline.fit(trainingData)
dtPredictions = dtPipelineModel.transform(testData)

dtPredictions.select('predictedLabel', 'label', 'features').show(20)

+---------------+---------------+-----------------+
| predictedLabel|          label|         features|
+---------------+---------------+-----------------+
|    Iris-setosa|    Iris-setosa|[4.4,2.9,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[4.5,2.3,1.3,0.3]|
|    Iris-setosa|    Iris-setosa|[4.6,3.2,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[4.8,3.0,1.4,0.3]|
|    Iris-setosa|    Iris-setosa|[4.8,3.4,1.6,0.2]|
|    Iris-setosa|    Iris-setosa|[4.8,3.4,1.9,0.2]|
|    Iris-setosa|    Iris-setosa|[5.0,3.5,1.3,0.3]|
|Iris-versicolor|Iris-versicolor|[5.1,2.5,3.0,1.1]|
|    Iris-setosa|    Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-versicolor|Iris-versicolor|[5.4,3.0,4.5,1.5]|
|    Iris-setosa|    Iris-setosa|[5.4,3.9,1.3,0.4]|
|    Iris-setosa|    Iris-setosa|[5.4,3.9,1.7,0.4]|
|Iris-versicolor|Iris-versicolor|[5.5,2.3,4.0,1.3]|
|Iris-versicolor|Iris-versicolor|[5.5,2.4,3.7,1.0]|
|    Iris-setosa|    Iris-setosa|[5.5,3.5,1.3,0.2]|
|Iris-versicolor|Iris-versicolor|[5.6,2.5,3.9,1.1]|
|Iris-versic

In [53]:
evaluator = MulticlassClassificationEvaluator().\
    setLabelCol('indexedLabel').\
    setPredictionCol('prediction')

dtAccuracy = evaluator.evaluate(dtPredictions)
dtAccuracy

0.9762706429373097

In [54]:
# 查看决策树模型结构
treeModelClassifier = dtPipelineModel.stages[2]
print('tree model:\n'+
     str(treeModelClassifier.toDebugString))

tree model:
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_7f2669283bfa) of depth 4 with 15 nodes
  If (feature 2 <= 2.5999999999999996)
   Predict: 0.0
  Else (feature 2 > 2.5999999999999996)
   If (feature 3 <= 1.75)
    If (feature 2 <= 4.95)
     If (feature 3 <= 1.65)
      Predict: 1.0
     Else (feature 3 > 1.65)
      Predict: 2.0
    Else (feature 2 > 4.95)
     If (feature 3 <= 1.55)
      Predict: 2.0
     Else (feature 3 > 1.55)
      Predict: 1.0
   Else (feature 3 > 1.75)
    If (feature 2 <= 4.85)
     If (feature 0 <= 5.95)
      Predict: 1.0
     Else (feature 0 > 5.95)
      Predict: 2.0
    Else (feature 2 > 4.85)
     Predict: 2.0



### 聚类算法

In [55]:
from pyspark.sql import Row
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.linalg import Vectors

In [56]:
# 生成训练数据 Row(**_dict)--> Row(key1=value1, key2=value2)
def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]), float(x[3]))
    rel['label'] = str(x[4])
    return rel

data = spark.sparkContext.textFile('./iris.txt').\
    map(lambda line: line.split(',')).\
    map(lambda p: Row(**f(p))).\
    toDF()
data.show()

+-----------------+-----------+
|         features|      label|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
|[5.4,3.9,1.7,0.4]|Iris-setosa|
|[4.6,3.4,1.4,0.3]|Iris-setosa|
|[5.0,3.4,1.5,0.2]|Iris-setosa|
|[4.4,2.9,1.4,0.2]|Iris-setosa|
|[4.9,3.1,1.5,0.1]|Iris-setosa|
|[5.4,3.7,1.5,0.2]|Iris-setosa|
|[4.8,3.4,1.6,0.2]|Iris-setosa|
|[4.8,3.0,1.4,0.1]|Iris-setosa|
|[4.3,3.0,1.1,0.1]|Iris-setosa|
|[5.8,4.0,1.2,0.2]|Iris-setosa|
|[5.7,4.4,1.5,0.4]|Iris-setosa|
|[5.4,3.9,1.3,0.4]|Iris-setosa|
|[5.1,3.5,1.4,0.3]|Iris-setosa|
|[5.7,3.8,1.7,0.3]|Iris-setosa|
|[5.1,3.8,1.5,0.3]|Iris-setosa|
+-----------------+-----------+
only showing top 20 rows



In [57]:
# 创建聚类模型并训练
kmeansmodel = KMeans().\
    setK(3).\
    setFeaturesCol('features').\
    setPredictionCol('prediction').\
    fit(data)

In [58]:
# 对数据集进行分类
results = kmeansmodel.transform(data).collect()
for item in results:
    print(str(item[0]) + 'is predicted as cluster' + str(item[1]))

[5.1,3.5,1.4,0.2]is predicted as clusterIris-setosa
[4.9,3.0,1.4,0.2]is predicted as clusterIris-setosa
[4.7,3.2,1.3,0.2]is predicted as clusterIris-setosa
[4.6,3.1,1.5,0.2]is predicted as clusterIris-setosa
[5.0,3.6,1.4,0.2]is predicted as clusterIris-setosa
[5.4,3.9,1.7,0.4]is predicted as clusterIris-setosa
[4.6,3.4,1.4,0.3]is predicted as clusterIris-setosa
[5.0,3.4,1.5,0.2]is predicted as clusterIris-setosa
[4.4,2.9,1.4,0.2]is predicted as clusterIris-setosa
[4.9,3.1,1.5,0.1]is predicted as clusterIris-setosa
[5.4,3.7,1.5,0.2]is predicted as clusterIris-setosa
[4.8,3.4,1.6,0.2]is predicted as clusterIris-setosa
[4.8,3.0,1.4,0.1]is predicted as clusterIris-setosa
[4.3,3.0,1.1,0.1]is predicted as clusterIris-setosa
[5.8,4.0,1.2,0.2]is predicted as clusterIris-setosa
[5.7,4.4,1.5,0.4]is predicted as clusterIris-setosa
[5.4,3.9,1.3,0.4]is predicted as clusterIris-setosa
[5.1,3.5,1.4,0.3]is predicted as clusterIris-setosa
[5.7,3.8,1.7,0.3]is predicted as clusterIris-setosa
[5.1,3.8,1.5

In [59]:
# 获取聚类中心情况
results2 = kmeansmodel.clusterCenters()
for item in results2:
    print(item)

[5.006 3.418 1.464 0.244]
[6.85       3.07368421 5.74210526 2.07105263]
[5.9016129  2.7483871  4.39354839 1.43387097]


In [60]:
# 计算误差平方和,用于在k值未知的情况下选取合适的k值
kmeansmodel.computeCost(data)

78.94084142614648