In [1]:
!apt-get install openjdk-11-jdk -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar -xzf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

In [2]:
import os
import findspark

# Export the JDK and Spark locations before findspark.init() is called, and only
# import pyspark afterwards.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.1-bin-hadoop3",
    }
)
findspark.init()

from pyspark.sql import SparkSession

# Reuse an existing session when there is one, otherwise start a new one.
session_builder = SparkSession.builder.appName("SparkMLlib_Demo")
spark = session_builder.getOrCreate()
print("✅ Spark version:", spark.version)

✅ Spark version: 3.5.1


In [3]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors

In [5]:
# Sample dataset: four rows, three numeric feature columns and one label column.
columns = ["feature1", "feature2", "feature3", "label"]

# Row i is (i, i + 1, i + 2, 10 * i + 30) as floats, for i = 1..4:
#   (1.0, 2.0, 3.0, 40.0) ... (4.0, 5.0, 6.0, 70.0)
data = [
    (float(i), float(i + 1), float(i + 2), float(10 * i + 30))
    for i in range(1, 5)
]

df = spark.createDataFrame(data, columns)
df.show()

+--------+--------+--------+-----+
|feature1|feature2|feature3|label|
+--------+--------+--------+-----+
|     1.0|     2.0|     3.0| 40.0|
|     2.0|     3.0|     4.0| 50.0|
|     3.0|     4.0|     5.0| 60.0|
|     4.0|     5.0|     6.0| 70.0|
+--------+--------+--------+-----+



In [8]:
# Transformer – VectorAssembler
# Combine the three feature columns into a single vector column named "features".
feature_cols = ["feature1", "feature2", "feature3"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

assembled_df = assembler.transform(df)
assembled_df.show(truncate=False)

+--------+--------+--------+-----+-------------+
|feature1|feature2|feature3|label|features     |
+--------+--------+--------+-----+-------------+
|1.0     |2.0     |3.0     |40.0 |[1.0,2.0,3.0]|
|2.0     |3.0     |4.0     |50.0 |[2.0,3.0,4.0]|
|3.0     |4.0     |5.0     |60.0 |[3.0,4.0,5.0]|
|4.0     |5.0     |6.0     |70.0 |[4.0,5.0,6.0]|
+--------+--------+--------+-----+-------------+



In [9]:
# StandardScaler: fitted on the assembled frame first; the fitted model then does
# the transforming. Scaling by standard deviation is on, mean-centering is off.
scaler = (
    StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures")
    .setWithStd(True)
    .setWithMean(False)
)

scaler_model = scaler.fit(assembled_df)

# Adds the "scaledFeatures" column alongside the existing ones.
scaled_df = scaler_model.transform(assembled_df)
scaled_df.show(truncate=False)

+--------+--------+--------+-----+-------------+--------------------------------------------------------+
|feature1|feature2|feature3|label|features     |scaledFeatures                                          |
+--------+--------+--------+-----+-------------+--------------------------------------------------------+
|1.0     |2.0     |3.0     |40.0 |[1.0,2.0,3.0]|[0.7745966692414834,1.5491933384829668,2.32379000772445]|
|2.0     |3.0     |4.0     |50.0 |[2.0,3.0,4.0]|[1.5491933384829668,2.32379000772445,3.0983866769659336]|
|3.0     |4.0     |5.0     |60.0 |[3.0,4.0,5.0]|[2.32379000772445,3.0983866769659336,3.872983346207417] |
|4.0     |5.0     |6.0     |70.0 |[4.0,5.0,6.0]|[3.0983866769659336,3.872983346207417,4.6475800154489]  |
+--------+--------+--------+-----+-------------+--------------------------------------------------------+



In [10]:
# Estimator – LinearRegression
# Hyperparameters are collected in one place and unpacked into the estimator.
lr_settings = {
    "featuresCol": "scaledFeatures",
    "labelCol": "label",
    "maxIter": 10,
    "regParam": 0.3,
    "elasticNetParam": 0.8,
}
lr = LinearRegression(**lr_settings)

# Fit on the scaled frame, then score that same frame.
lr_model = lr.fit(scaled_df)
predictions = lr_model.transform(scaled_df)

shown_cols = ["features", "label", "prediction"]
predictions.select(*shown_cols).show(truncate=False)

+-------------+-----+-----------------+
|features     |label|prediction       |
+-------------+-----+-----------------+
|[1.0,2.0,3.0]|40.0 |40.34820371874184|
|[2.0,3.0,4.0]|50.0 |50.11606790624728|
|[3.0,4.0,5.0]|60.0 |59.88393209375272|
|[4.0,5.0,6.0]|70.0 |69.65179628125814|
+-------------+-----+-----------------+



In [11]:
# Param example: report the estimator's configured hyperparameters, followed by
# the coefficients and intercept of the fitted model.
param_report = [
    ("Max Iterations:", lr.getMaxIter()),
    ("Regularization Parameter:", lr.getRegParam()),
    ("ElasticNet Param:", lr.getElasticNetParam()),
    ("Coefficients:", lr_model.coefficients),
    ("Intercept:", lr_model.intercept),
]

for caption, shown_value in param_report:
    print(caption, shown_value)

Max Iterations: 10
Regularization Parameter: 0.3
ElasticNet Param: 0.8
Coefficients: [4.203419480691576,4.203419480691576,4.203419480691645]
Intercept: 20.81247534373092
