In [1]:
# Install pyspark and findspark
!pip install --ignore-install -q pyspark
# Install findspark library
!pip install --ignore-install -q findspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# findspark locates the Spark installation and makes pyspark importable
# for this kernel; init() must run before the first pyspark import below.
import findspark
findspark.init()

In [3]:
# Mount Google Drive at /content/drive so the dataset under MyDrive can be read.
# force_remount=True remounts even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Record the interpreter version (structured form) for reproducibility.
import sys
sys.version_info

sys.version_info(major=3, minor=10, micro=12, releaselevel='final', serial=0)

In [5]:
# Full version string, including build date and compiler.
print(sys.version)

3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]


### 1. Set up spark context and SparkSession

In [6]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("PySpark-LinearRegression_Advertising")
    .getOrCreate()
)

### 2. Load the data set

In [9]:
# Read the advertising CSV: the first line is the header and column types are
# inferred from the data. The built-in CSV reader replaces the legacy
# 'com.databricks.spark.csv' format name, and the header option is given once.
df = spark.read.csv("/content/drive/MyDrive/Advertising.csv",
                    header=True,
                    inferSchema=True)

In [10]:
# Preview the first 6 rows of the loaded data.
df.show(6)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
|  8.7| 48.9|     75.0|  7.2|
+-----+-----+---------+-----+
only showing top 6 rows



In [11]:
# Check the inferred column types (all four columns should be numeric).
df.printSchema()

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



### 3. Convert the data into features

In [12]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [13]:
def transData(data, labelCol=None):
    """Turn a DataFrame of numeric columns into a ``features``/``label`` DataFrame.

    Args:
        data: Spark DataFrame holding the feature columns and the label column.
        labelCol: Name of the label column. Defaults to ``None``, which keeps
            the original behaviour of treating the last column as the label.

    Returns:
        A DataFrame with two columns: ``features`` (a dense vector built from
        every column except the label, in their original order) and ``label``.

    Raises:
        ValueError: If ``labelCol`` is given but is not a column of ``data``.
    """
    columns = data.columns
    # Work with a position rather than a name so the default path is exactly
    # "everything but the last column", even if column names are duplicated.
    label_idx = len(columns) - 1 if labelCol is None else columns.index(labelCol)

    def to_labeled_row(row):
        features = [value for i, value in enumerate(row) if i != label_idx]
        return [Vectors.dense(features), row[label_idx]]

    return data.rdd.map(to_labeled_row).toDF(['features', 'label'])

### 4. Transform the dataset to DataFrame

In [14]:
# Build the (features, label) DataFrame and preview the first 6 rows.
data = transData(df)
data.show(n=6)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
|  [8.7,48.9,75.0]|  7.2|
+-----------------+-----+
only showing top 6 rows



### 5. Convert features data format and set up training and test data sets

In [15]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
# Index categorical features: any feature with at most 4 distinct values is
# treated as categorical, the rest are left as continuous. The indexer is
# fitted on the full dataset, before the train/test split.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
trainingData, testData = data.randomSplit([0.6, 0.4], seed=218)

### 6. Fit linear regression model

In [16]:
# Elastic-net linear regression (regParam=0.3, elasticNetParam=0.8).
# featuresCol points at the VectorIndexer output so the indexer stage of the
# pipeline actually feeds the model; with the default featuresCol ("features")
# the indexed column was computed but never used. Continuous features pass
# through the indexer unchanged, so this only matters for categorical columns.
lr = LinearRegression(featuresCol="indexedFeatures",
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [17]:
# Chain the fitted feature indexer and the linear regression estimator in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, lr])

In [18]:
# Train the pipeline on the training split only. The indexer stage was already
# fitted above, so this step fits the regression stage.
model = pipeline.fit(trainingData)


In [19]:
 lrmodel= model.stages[1]

In [20]:
# Fitted coefficients, one per feature in the order of the features vector.
lrmodel.coefficients

DenseVector([0.0414, 0.1581, 0.0])

In [21]:
# Mean absolute error from the model's training summary, i.e. measured on the
# data the model was fitted on, not on the held-out test split.
lrmodel.summary.meanAbsoluteError

1.3536052449934453

### 7. Make predictions

In [22]:
# Apply the fitted pipeline to the held-out test split; adds a "prediction" column.
predictions = model.transform(testData)

In [23]:
# Show a handful of predictions next to the true label and the input features.
preview_cols = ["prediction", "label", "features"]
predictions.select(*preview_cols).show(n=5)

+------------------+-----+----------------+
|        prediction|label|        features|
+------------------+-----+----------------+
| 6.098217842293607|  3.2|  [4.1,11.6,5.7]|
|   9.0453590533938|  5.3|  [5.4,29.9,9.4]|
| 8.742649922605892|  5.7|  [8.4,27.2,2.1]|
|4.7824788680237384|  4.8|   [8.6,2.1,1.0]|
|10.412867268349295|  7.3|[11.7,36.9,45.2]|
+------------------+-----+----------------+
only showing top 5 rows



### 8. Evaluation

In [24]:
# Compare predictions against the true labels on the test split using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

Root Mean Squared Error (RMSE) on test data = 1.72421


In [25]:
# Collect labels and predictions in a single Spark job so the two columns are
# guaranteed to stay row-aligned (two separate toPandas() calls run the plan
# twice). Both names remain one-column pandas DataFrames, as before.
scored_pdf = predictions.select("label", "prediction").toPandas()
y_true = scored_pdf[["label"]]
y_pred = scored_pdf[["prediction"]]

In [26]:
import sklearn.metrics

# Coefficient of determination (R^2) of the predictions on the test split.
r2_score = sklearn.metrics.r2_score(y_true=y_true, y_pred=y_pred)
print(r2_score)

0.8989563084326142


### 9. Fit generalized linear regression model

In [27]:
# Import the GeneralizedLinearRegression estimator
from pyspark.ml.regression import GeneralizedLinearRegression

# Define the generalized linear model: Gaussian family with identity link.
# featuresCol points at the VectorIndexer output so the indexer stage of the
# pipeline actually feeds the model; with the default featuresCol ("features")
# the indexed column was computed but never used.
glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                  featuresCol="indexedFeatures",
                                  maxIter=10, regParam=0.3)

In [28]:
# Create the GLR pipeline and fit it on the training split.
# NOTE: this rebinds `pipeline` and `model`, replacing the linear-regression
# pipeline and fitted model created in the earlier cells.
pipeline = Pipeline(stages=[featureIndexer,glr])
model = pipeline.fit(trainingData)

In [29]:
# Make predictions on the test split with the GLR pipeline
# (rebinds `predictions`, replacing the linear-regression predictions).
predictions = model.transform(testData)

In [30]:
# Show the first 5 GLR predictions next to the true label and input features.
predictions.select("prediction", "label", "features").show(5)

+------------------+-----+----------------+
|        prediction|label|        features|
+------------------+-----+----------------+
| 5.818378480845925|  3.2|  [4.1,11.6,5.7]|
|  8.86945269880158|  5.3|  [5.4,29.9,9.4]|
| 8.517309580526351|  5.7|  [8.4,27.2,2.1]|
| 4.437492260704728|  4.8|   [8.6,2.1,1.0]|
|10.466526621329647|  7.3|[11.7,36.9,45.2]|
+------------------+-----+----------------+
only showing top 5 rows



In [31]:
# Evaluate the GLR predictions on the test split using RMSE.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

Root Mean Squared Error (RMSE) on test data = 1.66821


In [32]:
# Shut down the SparkSession and release its resources.
spark.stop()