In [1]:
## Pyspark ML for Delights Confectionaries

In [2]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Delights').getOrCreate()

In [22]:
## Read The dataset
df = spark.read.csv('delight_confect.csv',header=True,inferSchema=True)

In [23]:
df.show()

+---+-----+-------------+-------+-----+---------+
|_c0| city|         Item|country|month|    sales|
+---+-----+-------------+-------+-----+---------+
|  0|Abuja|  Butter Cake|Nigeria|    1| 723266.0|
|  1|Abuja|  Butter Cake|Nigeria|    2| 630520.0|
|  2|Abuja|  Butter Cake|Nigeria|    3| 692946.0|
|  3|Abuja|  Butter Cake|Nigeria|    4| 714788.0|
|  4|Abuja|  Butter Cake|Nigeria|    5| 720920.0|
|  5|Abuja|  Butter Cake|Nigeria|    6| 553146.0|
|  6|Abuja|  Butter Cake|Nigeria|    7| 742040.0|
|  7|Abuja|  Butter Cake|Nigeria|    8| 944060.0|
|  8|Abuja|  Butter Cake|Nigeria|    9| 867996.0|
|  9|Abuja|  Butter Cake|Nigeria|   10| 831078.0|
| 10|Abuja|  Butter Cake|Nigeria|   11| 747806.0|
| 11|Abuja|  Butter Cake|Nigeria|   12| 790578.0|
| 12|Abuja|Club Sandwich|Nigeria|    7|    560.0|
| 13|Abuja|Club Sandwich|Nigeria|    8|      0.0|
| 14|Abuja|Club Sandwich|Nigeria|    9|      0.0|
| 15|Abuja|Club Sandwich|Nigeria|   10|    460.0|
| 16|Abuja|Club Sandwich|Nigeria|   11|   2908.0|


In [24]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- country: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- sales: double (nullable = true)



In [25]:
df.columns

['_c0', 'city', 'Item', 'country', 'month', 'sales']

In [26]:
### Handling Categorical Features
from pyspark.ml.feature import StringIndexer

In [27]:
 indexer=StringIndexer(inputCols=['city', 'Item', 'country', 'month'],outputCols=['city_', 'Item_', 'country_', 'month_'])
df_r=indexer.fit(df).transform(df)
df_r.show()

+---+-----+-------------+-------+-----+---------+-----+-----+--------+------+
|_c0| city|         Item|country|month|    sales|city_|Item_|country_|month_|
+---+-----+-------------+-------+-----+---------+-----+-----+--------+------+
|  0|Abuja|  Butter Cake|Nigeria|    1| 723266.0|  1.0|  1.0|     0.0|   9.0|
|  1|Abuja|  Butter Cake|Nigeria|    2| 630520.0|  1.0|  1.0|     0.0|  10.0|
|  2|Abuja|  Butter Cake|Nigeria|    3| 692946.0|  1.0|  1.0|     0.0|   6.0|
|  3|Abuja|  Butter Cake|Nigeria|    4| 714788.0|  1.0|  1.0|     0.0|  11.0|
|  4|Abuja|  Butter Cake|Nigeria|    5| 720920.0|  1.0|  1.0|     0.0|   7.0|
|  5|Abuja|  Butter Cake|Nigeria|    6| 553146.0|  1.0|  1.0|     0.0|   8.0|
|  6|Abuja|  Butter Cake|Nigeria|    7| 742040.0|  1.0|  1.0|     0.0|   3.0|
|  7|Abuja|  Butter Cake|Nigeria|    8| 944060.0|  1.0|  1.0|     0.0|   4.0|
|  8|Abuja|  Butter Cake|Nigeria|    9| 867996.0|  1.0|  1.0|     0.0|   5.0|
|  9|Abuja|  Butter Cake|Nigeria|   10| 831078.0|  1.0|  1.0|   

In [29]:
df_r.columns

['_c0',
 'city',
 'Item',
 'country',
 'month',
 'sales',
 'city_',
 'Item_',
 'country_',
 'month_']

In [30]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=['city_','Item_','country_','month_'],
                                 outputCol="Independent Features")
output=featureassembler.transform(df_r)

In [31]:
 output.select('Independent Features').show()

+--------------------+
|Independent Features|
+--------------------+
|   [1.0,1.0,0.0,9.0]|
|  [1.0,1.0,0.0,10.0]|
|   [1.0,1.0,0.0,6.0]|
|  [1.0,1.0,0.0,11.0]|
|   [1.0,1.0,0.0,7.0]|
|   [1.0,1.0,0.0,8.0]|
|   [1.0,1.0,0.0,3.0]|
|   [1.0,1.0,0.0,4.0]|
|   [1.0,1.0,0.0,5.0]|
|   [1.0,1.0,0.0,0.0]|
|   [1.0,1.0,0.0,1.0]|
|   [1.0,1.0,0.0,2.0]|
|   [1.0,4.0,0.0,3.0]|
|   [1.0,4.0,0.0,4.0]|
|   [1.0,4.0,0.0,5.0]|
|   [1.0,4.0,0.0,0.0]|
|   [1.0,4.0,0.0,1.0]|
|   [1.0,4.0,0.0,2.0]|
|   [1.0,2.0,0.0,9.0]|
|  [1.0,2.0,0.0,10.0]|
+--------------------+
only showing top 20 rows



In [32]:
data=output.select("Independent Features","sales")

In [33]:
data.show()

+--------------------+---------+
|Independent Features|    sales|
+--------------------+---------+
|   [1.0,1.0,0.0,9.0]| 723266.0|
|  [1.0,1.0,0.0,10.0]| 630520.0|
|   [1.0,1.0,0.0,6.0]| 692946.0|
|  [1.0,1.0,0.0,11.0]| 714788.0|
|   [1.0,1.0,0.0,7.0]| 720920.0|
|   [1.0,1.0,0.0,8.0]| 553146.0|
|   [1.0,1.0,0.0,3.0]| 742040.0|
|   [1.0,1.0,0.0,4.0]| 944060.0|
|   [1.0,1.0,0.0,5.0]| 867996.0|
|   [1.0,1.0,0.0,0.0]| 831078.0|
|   [1.0,1.0,0.0,1.0]| 747806.0|
|   [1.0,1.0,0.0,2.0]| 790578.0|
|   [1.0,4.0,0.0,3.0]|    560.0|
|   [1.0,4.0,0.0,4.0]|      0.0|
|   [1.0,4.0,0.0,5.0]|      0.0|
|   [1.0,4.0,0.0,0.0]|    460.0|
|   [1.0,4.0,0.0,1.0]|   2908.0|
|   [1.0,4.0,0.0,2.0]|   2536.0|
|   [1.0,2.0,0.0,9.0]|1546638.0|
|  [1.0,2.0,0.0,10.0]|1255042.0|
+--------------------+---------+
only showing top 20 rows



In [34]:
from pyspark.ml.regression import LinearRegression
##train test split and train model
train_data,test_data=data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='Independent Features', labelCol='sales')
regressor=regressor.fit(train_data)

In [35]:
regressor.coefficients

DenseVector([-2093506.8185, -10290531.0055, 22302426.7245, -13627.2784])

In [36]:
regressor.intercept

35975073.09042397

In [38]:
# checking performance of model
pred_results=regressor.evaluate(test_data)

In [39]:
## Final comparison
pred_results.predictions.show()



+--------------------+-------------------+--------------------+
|Independent Features|              sales|          prediction|
+--------------------+-------------------+--------------------+
|           (4,[],[])|        8.7177846E7| 3.597507309042397E7|
|       (4,[0],[7.0])|        2.6426778E7| 2.132052536058117E7|
|       (4,[1],[1.0])|        1.4002136E7|2.5684542084882386E7|
|       (4,[1],[3.0])|          5121848.0|   5103480.073799215|
|       (4,[3],[6.0])|        7.3480204E7| 3.589330941979257E7|
|       (4,[3],[9.0])|6.514119560000001E7|3.5852427584476866E7|
|  [0.0,1.0,0.0,10.0]|        1.6738818E7|2.5548269300496712E7|
|   [0.0,2.0,0.0,2.0]|        3.0316182E7|1.5366756522463664E7|
|   [0.0,2.0,0.0,8.0]|        3.1336188E7|1.5284992851832263E7|
|  [0.0,2.0,0.0,10.0]|        2.7335268E7|1.5257738294955127E7|
|  [0.0,2.0,0.0,11.0]|        2.8626172E7|1.5244111016516563E7|
|   [0.0,3.0,0.0,2.0]|          3512558.0|   5076225.516922079|
|   [0.0,3.0,0.0,6.0]|          2809872.

In [40]:
### PErformance Metrics
pred_results.r2,pred_results.meanAbsoluteError,pred_results.meanSquaredError

(0.351039423233261, 14897523.069947043, 686565640724516.6)