In [1]:
# 导入pyspark
from pyspark.sql import SparkSession

# 连接到spark
spark = SparkSession.builder \
        .appName("pysparkML") \
        .master("local[4]") \
        .getOrCreate()

# 使用rdd读取数据

In [2]:
# 从文件系统中读取数据，并将其转化为DataFrame
sc = spark.sparkContext
rdd = sc.textFile("./resources/iris.txt")

In [3]:
rdd.take(5)

['5.1,3.5,1.4,0.2,Iris-setosa',
 '4.9,3.0,1.4,0.2,Iris-setosa',
 '4.7,3.2,1.3,0.2,Iris-setosa',
 '4.6,3.1,1.5,0.2,Iris-setosa',
 '5.0,3.6,1.4,0.2,Iris-setosa']

# 将rdd转化成dataframe，其中，第一列为特征，第二列为label

In [4]:
# 下面基于spark提供的row对象，构造dataframe
# dataframe的一行表示一个实例，第一列是feature，是一个row对象，第二列是他的label
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors # 将特征转换为向量
iris = rdd.map(lambda line:line.strip().split(",")) \
       .map(lambda line:Row(features = Vectors.dense(float(line[0]),
                                                                                                 float(line[1]),
                                                                                                 float(line[2]),
                                                                                                 float(line[3])
                                                                                                        ),
                                                                         label = line[4]))
df = spark.createDataFrame(iris)
df.printSchema()
df.show(5)

root
 |-- features: vector (nullable = true)
 |-- label: string (nullable = true)

+-----------------+-----------+
|         features|      label|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
+-----------------+-----------+
only showing top 5 rows



# 定义标签的编码与解码

In [5]:
# 下面开始进行标签的转化，将label进行编码
from  pyspark.ml.feature import StringIndexer,IndexToString
# 创建标签编码器
# 输入的列名称，输出的列名称
labelIndexer = StringIndexer(inputCol="label",outputCol="indexedLabel",stringOrderType="frequencyDesc")
labelIndexerModel = labelIndexer.fit(df)
# 创建标签的解码器，将解码的对应关系传给解码器
labelIndexToString = IndexToString(inputCol="prediction",outputCol="predLabel",labels=labelIndexerModel.labels)

# 特征向量化

In [6]:
# 特征的转化
# 这个问题是不需要在做的，因为已经处理成了向量了
from pyspark.ml.feature import VectorIndexer # 用来将特征向量化
featureToVector = VectorIndexer(inputCol="features",outputCol="featureVect").fit(df)

# 数据集的划分

In [7]:
# 划分训练集和测试集
train,test = df.randomSplit([0.7,0.3])

# 多分类-逻辑回归模型

- 多分类的时候，使用的逻辑回归实际上是softmax逻辑回归，针对一个实例，使用对数函数，计算其属于每一个类别的值，然后再使用softmax归一化

In [8]:
# 构建逻辑回归的分类器
from pyspark.ml.classification import LogisticRegression

# treeAggregate：不会一次性将partition的运算结果拉倒driver，而是根据depth，在分区做完局部aggregation后再拉倒driver
# elasticNetParam：回归同时应用l1和l2正则化，当其为0时表示只用l2，为1的时候只用l2
# family：是二项分布还是多项分布，默认用auto
# intercept term：表示截距项（一般，也叫偏置）
logitModel = LogisticRegression(
    featuresCol="features",labelCol="indexedLabel",maxIter=100,elasticNetParam=0.8
)
print(logitModel.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.8)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: indexedLabel)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrai

# 构建机器学习的pipeline

In [9]:
# 创建pipeline
from pyspark.ml import Pipeline
logitPipeline = Pipeline(stages=[
    labelIndexerModel,logitModel,labelIndexToString
])
logitPipelineModel = logitPipeline.fit(train)

# 在测试集上验证相关指标

In [10]:
# 获取预测结果的df
pred_df = logitPipelineModel.transform(test)

In [11]:
preDF = pred_df.select("features","label","probability","predLabel").head(5) # 是一个action，以行的形式返回每一条记录;collect()可以返回所有

for item in preDF:
    print(
        str(item["label"]),":",str(item["features"]),"---->prob:",str(item["probability"]),str(item["predLabel"]),"\n"
    )

Iris-setosa : [4.4,3.0,1.3,0.2] ---->prob: [1.0,0.0,0.0] Iris-setosa 

Iris-setosa : [4.6,3.4,1.4,0.3] ---->prob: [1.0,0.0,0.0] Iris-setosa 

Iris-setosa : [4.6,3.6,1.0,0.2] ---->prob: [1.0,0.0,0.0] Iris-setosa 

Iris-versicolor : [4.9,2.4,3.3,1.0] ---->prob: [0.0,1.0,0.0] Iris-versicolor 

Iris-setosa : [4.9,3.1,1.5,0.1] ---->prob: [1.0,0.0,0.0] Iris-setosa 



In [12]:
# 下面计算评价指标
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction")
lrAccuracy = evaluator.evaluate(pred_df)
print(lrAccuracy)

0.9631818181818183


# 模型的进一步分析，查看模型的一些参数

In [13]:
# 获取模型相关的系数
lrModel = logitPipelineModel.stages[1]
print("Coefficients: \n " + str(lrModel.coefficientMatrix) + \
"\nintercept: " + str(lrModel.interceptVector) + \
"\n numClasses: " + str(lrModel.numClasses) + \
"\n numFeatures: " + str(lrModel.numFeatures))

Coefficients: 
 DenseMatrix([[-1661.8537329 ,  1642.71499567, -1120.25780823, -2221.2591937 ],
             [  905.41805344,   -23.05624158,  -463.92616847, -1699.26454619],
             [  756.43567946, -1619.65875409,  1584.1839767 ,  3920.52373989]])
intercept: [11204.906801668069,1521.4484613916343,-12726.355263059704]
 numClasses: 3
 numFeatures: 4
