In [1]:
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [2]:
""" 步骤1： 创建SparkSession对象 """
spark = SparkSession.builder.appName('lin_reg').getOrCreate()

In [3]:
""" 步骤2： 读取数据集 """
path = r"D:\LKM\PySpark机器学习、自然语言与推荐系统\machine-learning-with-pyspark-master\chapter_4_Linear_Regression\Linear_regression_dataset.csv"
df = spark.read.csv(path, inferSchema=True, header=True)
df.show()

+-----+-----+-----+-----+-----+------+
|var_1|var_2|var_3|var_4|var_5|output|
+-----+-----+-----+-----+-----+------+
|  734|  688|   81|0.328|0.259| 0.418|
|  700|  600|   94| 0.32|0.247| 0.389|
|  712|  705|   93|0.311|0.247| 0.417|
|  734|  806|   69|0.315| 0.26| 0.415|
|  613|  759|   61|0.302| 0.24| 0.378|
|  748|  676|   85|0.318|0.255| 0.422|
|  669|  588|   97|0.315|0.251| 0.411|
|  667|  845|   68|0.324|0.251| 0.381|
|  758|  890|   64| 0.33|0.274| 0.436|
|  726|  670|   88|0.335|0.268| 0.422|
|  583|  794|   55|0.302|0.236| 0.371|
|  676|  746|   72|0.317|0.265|   0.4|
|  767|  699|   89|0.332|0.274| 0.433|
|  637|  597|   86|0.317|0.252| 0.374|
|  609|  724|   69|0.308|0.244| 0.382|
|  776|  733|   83|0.325|0.259| 0.437|
|  701|  832|   66|0.325| 0.26|  0.39|
|  650|  709|   74|0.316|0.249| 0.386|
|  804|  668|   95|0.337|0.265| 0.453|
|  713|  614|   94| 0.31|0.238| 0.404|
+-----+-----+-----+-----+-----+------+
only showing top 20 rows



In [4]:
""" 步骤3： 探究式数据分析 """
# 查看数据维度
(df.count(), len(df.columns))

(1232, 6)

In [5]:
""" 步骤3： 探究式数据分析 """
# 查看数据类型
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [7]:
# 步骤3： 探究式数据分析
df.describe().show()

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|            var_1|            var_2|             var_3|               var_4|               var_5|             output|
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|             1232|             1232|              1232|                1232|                1232|               1232|
|   mean|715.0819805194806|715.0819805194806| 80.90422077922078|  0.3263311688311693| 0.25927272727272715|0.39734172077922014|
| stddev| 91.5342940441652|93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|    min|              463|              472|                40|               0.277|               0.214|              0.301|
|    max|             1009|             1103|               116|               0.373|               0.294|     

In [9]:
# 步骤3： 探究式数据分析
# 检查变量之间的相互关系
df.select(corr('var_1', 'output')).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



In [11]:
# 查看字段名称
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [12]:
# 步骤4：特征工程化
vec_assembler = VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'], outputCol='features')

In [13]:
features_df = vec_assembler.transform(df)

In [14]:
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
features_df.select('features').show(5, False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [16]:
model_df = features_df.select('features', 'output')

In [17]:
model_df.show(5, False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
+------------------------------+------+
only showing top 5 rows



In [18]:
# 查看数据维度
(model_df.count(), len(df.columns))

(1232, 6)

In [19]:
# 步骤5：划分数据集
train_df, test_df = model_df.randomSplit([0.7, 0.3])

In [23]:
# 步骤6：构建和训练线性回归模型
lin_Reg = LinearRegression(labelCol='output')
lr_model = lin_Reg.fit(train_df)

In [24]:
# 查看回归系数
lr_model.coefficients

DenseVector([0.0003, 0.0, 0.0001, -0.6239, 0.506])

In [25]:
# 查看截距
lr_model.intercept

0.17898774119159971

In [26]:
training_predictions = lr_model.evaluate(train_df)
training_predictions.r2

0.8766093431926327

In [27]:
# 步骤7：在测试数据上评估线性回归模型
test_predictions =lr_model.evaluate(test_df)
test_predictions.r2

0.8489746142833268

In [28]:
test_predictions.meanSquaredError

0.0001579801327742761