In [1]:
# 导入pyspark
from pyspark.sql import SparkSession

# 连接到spark
spark = SparkSession.builder \
        .appName("pysparkML") \
        .master("local[4]") \
        .getOrCreate()

In [2]:
# 从文件系统中读取数据，并将其转化为DataFrame
sc = spark.sparkContext
rdd = sc.textFile("./resources/iris.txt")
# 下面基于spark提供的row，构造df的行
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors # 将特征转换为向量
iris = rdd.map(lambda line:line.strip().split(",")).map(lambda line:Row(features = Vectors.dense(float(line[0]),
                                                                                                 float(line[1]),
                                                                                                 float(line[2]),
                                                                                                 float(line[3])
                                                                                                        ),
                                                                         label = line[4]))
df = spark.createDataFrame(iris)
df.printSchema()
df.show(5)

root
 |-- features: vector (nullable = true)
 |-- label: string (nullable = true)

+-----------------+-----------+
|         features|      label|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
+-----------------+-----------+
only showing top 5 rows



In [3]:
# 下面开始进行标签的转化，将label进行编码
from  pyspark.ml.feature import StringIndexer,IndexToString
# 创建标签编码器
labelIndexer = StringIndexer(inputCol="label",outputCol="indexed",stringOrderType="frequencyDesc")
labelIndexerModel = labelIndexer.fit(df)
# 创建标签的解码器
labelIndexToString = IndexToString(inputCol="prediction",outputCol="predLabel",labels=labelIndexerModel.labels)

In [4]:
# 下面对特征进行编码索引
from pyspark.ml.feature import VectorIndexer
# 创建编码器
featureIndexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures")
featureIndexerModel = featureIndexer.fit(df)

In [5]:
# 划分训练集和测试集
train,test = df.randomSplit([0.7,0.3])

In [6]:
# 构建逻辑回归的分类器
from pyspark.ml.classification import LogisticRegression

logitModel = LogisticRegression(
    featuresCol="indexedFeatures",labelCol="indexed",maxIter=100,elasticNetParam=0.8
)
print(logitModel.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.8)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: indexedFeatures)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: indexed)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constr

In [7]:
# 创建pipeline
from pyspark.ml import Pipeline
logitPipeline = Pipeline(stages=[
    labelIndexerModel,featureIndexerModel,logitModel,labelIndexToString
])

In [8]:
logitPipelineModel = logitPipeline.fit(train)
pred_df = logitPipelineModel.transform(test)

In [17]:
preDF = pred_df.select("features","label","probability","predLabel").head(5) # 是一个action，以行的形式返回每一条记录;collect()可以返回所有

for item in preDF:
    print(
        str(item["label"]),":",str(item["features"]),"---->prob:",str(item["probability"]),str(item["predLabel"]),"\n"
    )

Iris-setosa : [4.4,3.2,1.3,0.2] ---->prob: [1.0,7.436686310088662e-45,4.414624110684572e-64] Iris-setosa 

Iris-setosa : [4.6,3.1,1.5,0.2] ---->prob: [1.0,2.0600073588532342e-38,4.9671672987418535e-57] Iris-setosa 

Iris-setosa : [4.7,3.2,1.6,0.2] ---->prob: [1.0,8.057271119430964e-41,2.0008567191942337e-59] Iris-setosa 

Iris-setosa : [4.8,3.1,1.6,0.2] ---->prob: [1.0,1.3858864238980332e-36,4.07182743549451e-55] Iris-setosa 

Iris-setosa : [4.8,3.4,1.6,0.2] ---->prob: [1.0,7.653793536232767e-48,5.872036036896465e-67] Iris-setosa 



In [19]:
pred_df.show(1)

+-----------------+-----------+-------+-----------------+--------------------+--------------------+----------+-----------+
|         features|      label|indexed|  indexedFeatures|       rawPrediction|         probability|prediction|  predLabel|
+-----------------+-----------+-------+-----------------+--------------------+--------------------+----------+-----------+
|[4.4,3.2,1.3,0.2]|Iris-setosa|    0.0|[4.4,3.2,1.3,0.2]|[82.4968090279537...|[1.0,7.4366863100...|       0.0|Iris-setosa|
+-----------------+-----------+-------+-----------------+--------------------+--------------------+----------+-----------+
only showing top 1 row



In [20]:
# 下面计算评价指标
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="indexed",predictionCol="prediction")
lrAccuracy = evaluator.evaluate(pred_df)
print(lrAccuracy)

0.9773492500765228
