In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
import os

In [2]:
# Create spark session
spark = SparkSession.builder.appName("ICP-14").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [3]:
# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True)\
.option("inferSchema", True).option("delimiter", ",")\
.load("car.csv")
data = data.withColumnRenamed("wheel-base", "label").select("label", "length", "width", "height")
data.show(10)

+-----+------+-----+------+
|label|length|width|height|
+-----+------+-----+------+
| 88.6| 168.8| 64.1|  48.8|
| 88.6| 168.8| 64.1|  48.8|
| 94.5| 171.2| 65.5|  52.4|
| 99.8| 176.6| 66.2|  54.3|
| 99.4| 176.6| 66.4|  54.3|
| 99.8| 177.3| 66.3|  53.1|
|105.8| 192.7| 71.4|  55.7|
|105.8| 192.7| 71.4|  55.7|
|105.8| 192.7| 71.4|  55.9|
| 99.5| 178.2| 67.9|  52.0|
+-----+------+-----+------+
only showing top 10 rows



In [4]:
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

In [5]:
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
model = lr.fit(data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
spark.stop()

Coefficients: [0.22836801258821893,0.8223218915856468,0.580595102043434]
Intercept: -26.380531957157498
numIterations: 11
objectiveHistory: [0.5, 0.38579526656819896, 0.13000842393266873, 0.12985504772567413, 0.12963704261349218, 0.12947103310674205, 0.1294164378448031, 0.1294050846483987, 0.12940508261516015, 0.1294050824628613, 0.12940508245526855]
+--------------------+
|           residuals|
+--------------------+
|  -4.611862798093398|
|  -4.611862798093398|
|  -2.501339043881387|
|-0.11328232985025011|
| -0.6777467081673763|
|  0.3413419946315486|
|  -2.878914311626758|
|  -2.878914311626758|
| -2.9950333320354474|
| -0.8412496309870932|
|  2.3922947158520174|
|  2.3922947158520174|
|  2.3922947158520174|
|  2.3922947158520174|
| -0.6335041529149237|
| -0.6335041529149237|
| -1.3908023008371515|
|  0.4019071188106693|
|   2.084135889634638|
|   2.787341183548463|
+--------------------+
only showing top 20 rows

RMSE: 2.517190
r2: 0.824407


In [6]:
from pyspark.ml.classification import LogisticRegression

mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlr_model = mlr.fit(data)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlr_model.coefficientMatrix))
print("Multinomial intercepts: " + str(mlr_model.interceptVector))

AttributeError: 'NoneType' object has no attribute '_jvm'