Install PySpark if using Colab

In [1]:
#!pip install pyspark
# Use the Linux wget command to download the file and store it in the local folder
# !wget https://raw.githubusercontent.com/trajanov/BigDataAnalytics/master/data/netflix-subscription.csv

### Initialize PySpark Session

In [2]:
# findspark only locates a manually installed Spark distribution. It is not
# needed (and may not be installed) when PySpark itself was installed with pip,
# e.g. on Colab, so its absence is not treated as an error.
try:
    import findspark
    findspark.init()
except ImportError:
    pass
from pyspark import SparkContext
from pyspark.sql import SparkSession

# getOrCreate() reuses an already running session, so this cell can be re-run
# safely; constructing SparkContext("local") directly raises an error when a
# context already exists in the kernel.
spark = SparkSession.builder.master("local").getOrCreate()
# Expose the underlying SparkContext under the name `sc`.
sc = spark.sparkContext

In [3]:
# Use the Linux wget command to download the file and store it in the local folder
# !wget https://raw.githubusercontent.com/trajanov/BigDataAnalytics/master/data/netflix-subscription.csv

### Create a PySpark DataFrame from the netflix-subscription.csv file

In [92]:
from pyspark.ml.feature import VectorAssembler

# Create a DataFrame from the CSV file; the first line is used as the header
# and column types are inferred from the data.
# A forward slash is used as the separator so the same relative path works on
# Windows, Linux and macOS (a backslash is a path separator on Windows only).
df = spark.read.csv("data/netflix-subscription.csv", header=True, inferSchema=True)
# Show sample records
df.show(5)



+------------+---------+------------------+------------+----------+--------------------+-----------------------+----------------------+
|Country_Code|  Country|Total_Library_Size|Num_TV_Shows|Num_Movies|Cost_Per_Month_Basic|Cost_Per_Month_Standard|Cost_Per_Month_Premium|
+------------+---------+------------------+------------+----------+--------------------+-----------------------+----------------------+
|          ar|Argentina|              4760|        3154|      1606|                3.74|                    6.3|                  9.26|
|          au|Australia|              6114|        4050|      2064|                7.84|                  12.12|                 16.39|
|          at|  Austria|              5640|        3779|      1861|                9.03|                  14.67|                 20.32|
|          be|  Belgium|              4990|        3374|      1616|               10.16|                  15.24|                 20.32|
|          bo|  Bolivia|              4991|     

### Create the feature vector and labels

In [93]:
# Create features vector: the three library-size columns are packed into a
# single vector column named "features".
feature_columns = ["Total_Library_Size", "Num_TV_Shows", "Num_Movies"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# The basic-plan monthly cost is the regression target, exposed as "label".
# The derived frames get their own names instead of overwriting `df`, so the
# raw DataFrame stays intact and this cell can be re-run without failing on
# columns that an earlier run already renamed or dropped.
labeled_df = df.withColumnRenamed("Cost_Per_Month_Basic", "label")
model_df = assembler.transform(labeled_df).select("features", "label")

# Train-Test split (70% / 30%); the fixed seed keeps the split reproducible.
train, test = model_df.randomSplit(weights=[0.7, 0.3], seed=200)
train.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[2274.0,1675.0,59...| 9.03|
|[3048.0,1712.0,13...|12.88|
|[3887.0,2449.0,14...| 8.36|
|[4045.0,2638.0,14...| 9.03|
|[4361.0,2973.0,13...| 10.9|
+--------------------+-----+
only showing top 5 rows



## Linear Regression Model

In [94]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator, limited to 50 iterations.
lr = LinearRegression(maxIter=50)

# Train on the training split only.
lrModel = lr.fit(train)

# Report the learned parameters of the fitted model.
for template, value in (
    ("Coefficients: %s", lrModel.coefficients),
    ("Intercept: %s", lrModel.intercept),
):
    print(template % str(value))

Coefficients: [-7.965004316307868e-06,0.0002585904963213735,-0.001742711925249158]
Intercept: 10.859474972201737


In [95]:
# Score the held-out test split with the fitted linear regression model.
predictions = lrModel.transform(test)
# Keep only the (label, prediction) pairs, converted to plain tuples in an RDD.
label_and_prediction = predictions.select("label", "prediction")
predRDD = label_and_prediction.rdd.map(tuple)
# Preview the first five pairs.
predRDD.take(5)

[(9.03, 10.69193405548763),
 (9.03, 8.916280789046422),
 (1.97, 8.601900791149198),
 (7.99, 8.773585578709653),
 (7.99, 8.437455608233083)]

In [96]:
from pyspark.mllib.evaluation import RegressionMetrics
# Evaluate the (label, prediction) pairs of the linear regression model.
metrics = RegressionMetrics(predRDD)
for metric_name, metric_value in (
    ("MAE", metrics.meanAbsoluteError),
    ("MSE", metrics.meanSquaredError),
):
    print(metric_name, metric_value)

MAE 1.6545303554482729
MSE 6.09946883815713


## Generalized Linear Regression Model

In [168]:
from pyspark.ml.regression import GeneralizedLinearRegression

# Generalized linear regression estimator with a gaussian family,
# limited to 50 iterations.
glr = GeneralizedLinearRegression(family="gaussian", maxIter=50)

# Train on the training split only.
model = glr.fit(train)

# Report the learned parameters of the fitted generalized linear model.
for title, value in (
    ("Coefficients: ", model.coefficients),
    ("Intercept: ", model.intercept),
):
    print(title + str(value))


Coefficients: [-7.965004316307868e-06,0.0002585904963213735,-0.001742711925249158]
Intercept: 10.859474972201737


In [169]:
# Score the held-out test split with the fitted generalized linear model.
predictions = model.transform(test)
# Reduce to (label, prediction) pairs and expose them as an RDD of plain tuples.
pairs_df = predictions.select("label", "prediction")
predRDD = pairs_df.rdd.map(tuple)
# Preview the first five pairs.
predRDD.take(5)

[(9.03, 10.69193405548763),
 (9.03, 8.916280789046422),
 (1.97, 8.601900791149198),
 (7.99, 8.773585578709653),
 (7.99, 8.437455608233083)]

In [170]:
from pyspark.mllib.evaluation import RegressionMetrics
# Evaluate the (label, prediction) pairs of the generalized linear model.
metrics = RegressionMetrics(predRDD)
mae = metrics.meanAbsoluteError
mse = metrics.meanSquaredError
print("MAE", mae)
print("MSE", mse)

MAE 1.6545303554482729
MSE 6.09946883815713


# Q10: Improve the model
Only a simple improvement is implemented: a non-linear model (a decision tree regressor) that slightly improves the performance.

In [183]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import DecisionTreeRegressor


# Non-linear model actually used: a single decision tree of depth at most 5,
# with continuous features discretized into at most 5 bins.
tree_regressor = DecisionTreeRegressor(maxDepth=5, maxBins=5)
# Alternative that was tried but is not used:
#   GBTRegressor(maxIter=4, maxDepth=5, maxBins=5)
# Train on the training split only.
model = tree_regressor.fit(train)

In [184]:
# Score the held-out test split with the fitted tree model.
predictions = model.transform(test)
# Collect (label, prediction) pairs as an RDD of plain tuples.
scored_columns = ["label", "prediction"]
predRDD = predictions.select(*scored_columns).rdd.map(tuple)
# Preview the first five pairs.
predRDD.take(5)

[(9.03, 10.305714285714284),
 (9.03, 10.305714285714284),
 (1.97, 8.08),
 (7.99, 8.064285714285715),
 (7.99, 8.064285714285715)]

In [185]:
from pyspark.mllib.evaluation import RegressionMetrics
# Evaluate the (label, prediction) pairs of the tree model.
metrics = RegressionMetrics(predRDD)
error_report = [
    ("MAE", metrics.meanAbsoluteError),
    ("MSE", metrics.meanSquaredError),
]
for error_name, error_value in error_report:
    print(error_name, error_value)


MAE 1.5173424036281176
MSE 5.527559933484507
