In [56]:
from __future__ import print_function
 
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession
 
if __name__ == "__main__":
    # Demonstrates the Estimator / Transformer / Param workflow of Spark ML:
    # fit one LogisticRegression model with the parameters set on the
    # estimator, fit a second one with parameters overridden through a param
    # map, then print the predictions of both models on the same test data.
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # Everything that uses the session lives inside try/finally so the
    # session is stopped even if fitting or prediction raises.
    try:
        # Training data: "label" is the class label, "features" is the
        # feature vector.
        training = spark.createDataFrame([
            (1.0, Vectors.dense([0.0, 1.1, 0.1])),
            (0.0, Vectors.dense([2.0, 1.0, -1.0])),
            (0.0, Vectors.dense([2.0, 1.3, 1.0])),
            (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

        # ------------------------------ model 1 ------------------------------
        # Create a logistic regression algorithm; the algorithm is an
        # Estimator. maxIter is the maximum number of iterations and regParam
        # is the regularization parameter.
        lr = LogisticRegression(maxIter=10, regParam=0.01)
        # Uncomment to print the parameters, documentation, and any default
        # values.
        # print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

        # Fitting the Estimator on the training data returns a model; the
        # model is a Transformer.
        model1 = lr.fit(training)

        # Uncomment the next two lines to print the parameters of model1.
        # print("Model 1 was fit using parameters: ")
        # print(model1.extractParamMap())

        # ------------------------------ model 2 ------------------------------
        # A param map is a plain dict keyed by Param objects; it overrides the
        # values set on the estimator for a single fit() call.
        paramMap = {lr.maxIter: 20}
        # Assigning the same key again replaces the previous value (20 -> 30).
        paramMap[lr.maxIter] = 30
        # Override the regularization parameter and the decision threshold.
        paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})

        # Param maps can be combined by merging dicts. This one sets the name
        # of the *probability* output column (not the prediction column).
        # NOTE(review): "probability" appears to be the default name already
        # (model1 emits that column without any override), so this entry looks
        # like a no-op that just lets the same select() below serve both
        # models -- confirm against the LogisticRegression docs.
        paramMap2 = {lr.probabilityCol: "probability"}

        # Merge into a copy so paramMap itself is left unmodified.
        paramMapCombined = paramMap.copy()
        paramMapCombined.update(paramMap2)

        # Fit again on the same training data, this time with the combined
        # overrides.
        model2 = lr.fit(training, paramMapCombined)

        # Uncomment the next two lines to print the parameters of model2.
        # print("Model 2 was fit using parameters: ")
        # print(model2.extractParamMap())

        # ----------------------------- prediction ----------------------------
        # Test data.
        test = spark.createDataFrame([
            (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
            (0.0, Vectors.dense([3.0, 2.0, -0.1])),
            (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])

        # Run both models over the same test set: model1 first, then model2.
        predictions = (model1.transform(test), model2.transform(test))

        for prediction in predictions:
            result = prediction.select(
                "features", "label", "probability", "prediction").collect()
            for row in result:
                print("features=%s, label=%s -> prob=%s, prediction=%s "
                      % (row.features, row.label, row.probability,
                         row.prediction))
            # Separator between the outputs of the two models.
            print("\n")
    finally:
        spark.stop()


features=[-1.0,1.5,1.3], label=1.0 -> prob=[0.0013759947069214356,0.9986240052930786], prediction=1.0 
features=[3.0,2.0,-0.1], label=0.0 -> prob=[0.9816604009374171,0.018339599062582944], prediction=0.0 
features=[0.0,2.2,-1.5], label=1.0 -> prob=[0.0016981475578358373,0.9983018524421641], prediction=1.0 


features=[-1.0,1.5,1.3], label=1.0 -> prob=[0.05707304171033977,0.9429269582896603], prediction=1.0 
features=[3.0,2.0,-0.1], label=0.0 -> prob=[0.9238522311704088,0.07614776882959128], prediction=0.0 
features=[0.0,2.2,-1.5], label=1.0 -> prob=[0.10972776114779119,0.8902722388522087], prediction=1.0 


