In [None]:
!pip install pyspark
!pip install findspark
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [None]:
# Reuse the running SparkContext when there is one: a bare SparkContext()
# raises ValueError if this cell is executed again in the same kernel.
sc = SparkContext.getOrCreate()
spark = (
    SparkSession.builder
    .appName("Saving and Loading a SparkML Model")
    .getOrCreate()
)
# Only report errors in Spark's console logging.
sc.setLogLevel("ERROR")

In [None]:
# Infant growth chart: every row is [height in cm, weight in kg].
mydata = [
    [46, 2.5],
    [51, 3.4],
    [54, 4.4],
    [57, 5.1],
    [60, 5.6],
    [61, 6.1],
    [63, 6.4],
]
# Column names, in the same order as the values inside each row.
columns = ["height", "weight"]
mydf = spark.createDataFrame(data=mydata, schema=columns)
mydf.show()

In [None]:
# Pack the single predictor column ("height") into a "features" vector column.
assembler = VectorAssembler(inputCols=["height"], outputCol="features")

# Training frame: the assembled feature vector next to the "weight" label.
data = (
    assembler
    .transform(mydf)
    .select("features", "weight")
)
data.show()

In [None]:
# Create a LR model: regress the "weight" label on the "features" vector,
# with at most 100 iterations and a regularisation parameter of 0.1.
lr = LinearRegression(
    featuresCol='features',
    labelCol='weight',
    maxIter=100,
    regParam=0.1,
)
# Fit the model
lrModel = lr.fit(data)
# A plain save() fails when the target path already exists, which breaks a
# re-run of this cell; replace the previously saved copy instead.
lrModel.write().overwrite().save('infantheight2.model')

In [None]:
# You need LinearRegressionModel to load the model
from pyspark.ml.regression import LinearRegressionModel

In [None]:
# Read the fitted regression back from the path it was saved under.
model = LinearRegressionModel.load("infantheight2.model")

In [None]:
# Wrap a scalar in a one-row DataFrame and show the model's prediction for it.
def predict(weight):
    """Show the predicted infant weight (kg) for a single height (cm).

    Args:
        weight: Despite its name, this is the model *input*, i.e. the
            infant's height in cm. The model was fitted with ``height`` as
            the feature and ``weight`` as the label. The parameter name is
            kept unchanged so existing calls keep working.

    Returns:
        None. A one-row table with the ``prediction`` column is printed.
    """
    # The model only reads the "features" vector, so a one-row, one-column
    # frame holding the input value is all that is needed (no dummy label).
    assembler = VectorAssembler(inputCols=["height"], outputCol="features")
    input_df = spark.createDataFrame([[weight]], ["height"])
    features_df = assembler.transform(input_df).select("features")
    model.transform(features_df).select("prediction").show()

In [None]:
# 70 is the input height in cm; the table shown is the predicted weight in kg.
predict(70)

### E-commerce analytics and modelling with Spark

In [None]:
# Install spark
!pip install pyspark
!pip install findspark
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
# Start session
# Reuse the running SparkContext when there is one: a bare SparkContext()
# raises ValueError if a context is already active in this kernel.
sc = SparkContext.getOrCreate()
spark = (
    SparkSession.builder
    .appName("Saving and Loading a SparkML Model")
    .getOrCreate()
)

# Only report errors in Spark's console logging.
sc.setLogLevel("ERROR")

In [None]:
%%! wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

In [None]:
# Read searchterms.csv into a Spark DataFrame, treating its first row as the header.
df = (
    spark.read
    .option("header", "true")
    .csv("searchterms.csv")
)
df.show()

In [None]:
# Shape of df, printed as (number of rows, number of columns).
print(f"({df.count()}, {len(df.columns)})")

In [None]:
# Preview the first five rows.
df.show(n=5)

In [None]:
# List each column's name together with its Spark data type.
df.dtypes

In [None]:
# Number of rows whose search term is exactly `gaming laptop`.
df.filter(df["searchterm"] == "gaming laptop").count()

In [None]:
# Five most frequently used search terms, most searched first.
(
    df.groupBy("searchterm")
    .count()
    .orderBy("count", ascending=False)
    .show(5)
)

In [None]:
# The pretrained sales forecasting model is available at the below url
%%!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
%%!tar –xvzf model.tar.gz

In [None]:
# Restore the sales forecast model from 'sales_prediction.model'.
from pyspark.ml.regression import LinearRegressionModel

model = LinearRegressionModel.load("sales_prediction.model")

In [None]:
# Using the sales forecast model, predict the sales for the year of 2023.
def predict(year):
    """Show the sales forecast model's prediction for a single year.

    Args:
        year: The year to forecast, e.g. ``2023``. It is the only value
            assembled into the model's ``features`` vector.

    Returns:
        None. A one-row table with the ``prediction`` column is printed.
    """
    # Imported here so this cell does not rely on an import made in an
    # earlier, separate section; this section's import cells do not bring
    # VectorAssembler into scope.
    from pyspark.ml.feature import VectorAssembler

    # The model only reads the "features" vector, so a one-row, one-column
    # frame holding the year is all that is needed (no dummy sales column).
    assembler = VectorAssembler(inputCols=["year"], outputCol="features")
    input_df = spark.createDataFrame([[year]], ["year"])
    features_df = assembler.transform(input_df).select("features")
    model.transform(features_df).select("prediction").show()

predict(2023)