In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that is already
# running in this kernel, under the application name 'DiwaliSales'.
spark = (
    SparkSession.builder
    .appName('DiwaliSales')
    .getOrCreate()
)

In [3]:
# Bare expression: show the active SparkSession as this cell's output.
spark

In [4]:
# Load the sales CSV. The first row supplies the column names and Spark infers
# each column's type from the data.
#
# The previously recorded preview contained a U+FFFD replacement character in
# the `State` column (between "Andhra" and "Pradesh"), which means the file has
# bytes that are not valid UTF-8 and were being decoded lossily with the
# default encoding. Decode explicitly instead.
# NOTE(review): assumes the file is Latin-1 / Windows-1252 encoded — confirm
# against the data source. With this encoding the offending byte presumably
# becomes a non-breaking space, so normalise `State` before grouping on it.
df_pyspark = spark.read.csv(
    'data.csv',
    header=True,
    inferSchema=True,
    encoding='ISO-8859-1',
)
df_pyspark.show(3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+-------+------+--------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|         State|    Zone|Occupation|Product_Category|Orders| Amount|Status|unnamed1|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+-------+------+--------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1|23952.0|  NULL|    NULL|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3|23934.0|  NULL|    NULL|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3|23924.0|  NULL|    NULL|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------

In [5]:
# Discard the two columns that show only NULL in the previewed rows, then
# remove every row that still has a missing value in any remaining column.
df_pyspark = (
    df_pyspark
    .drop("Status", "unnamed1")
    .dropna(how='any')
)

In [6]:
# Print each column's name and inferred type, to see which columns are numeric
# and can be used directly as model features.
df_pyspark.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Cust_name: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age Group: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Orders: integer (nullable = true)
 |-- Amount: double (nullable = true)



In [7]:
# Our goal is to predict the amount spent on Diwali from the features Zone, Age, Gender, Marital_Status and Orders,
# but for now we will leave out the string-typed columns (Zone, Gender).
# So, first of all, we group the remaining numeric features into a single vector column:

# [Age, Orders, Marital_Status] ----> New Feature ----> Independent Feature

In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack the three numeric predictor columns into one vector column named
# 'Independent', which is what the regression stage consumes as features.
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'distutils'". Presumably the kernel's
# Python no longer ships distutils while the installed PySpark still imports
# it — confirm, then fix the environment (e.g. upgrade PySpark or install
# setuptools) so the notebook survives Restart & Run All.
assembler = VectorAssembler(
    inputCols=['Age', 'Orders', 'Marital_Status'],
    outputCol='Independent',
)

ModuleNotFoundError: No module named 'distutils'

In [None]:
# Append the assembled 'Independent' feature vector to every row of the
# cleaned frame.
output = assembler.transform(dataset=df_pyspark)

In [None]:
# Preview the first three rows to check the new vector column.
output.show(n=3)

+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+-------+--------------+
|User_ID|Cust_name|Product_ID|Gender|Age Group|Age|Marital_Status|         State|    Zone|Occupation|Product_Category|Orders| Amount|   Independent|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+----------------+------+-------+--------------+
|1002903|Sanskriti| P00125942|     F|    26-35| 28|             0|   Maharashtra| Western|Healthcare|            Auto|     1|23952.0|[28.0,1.0,0.0]|
|1000732|   Kartik| P00110942|     F|    26-35| 35|             1|Andhra�Pradesh|Southern|      Govt|            Auto|     3|23934.0|[35.0,3.0,1.0]|
|1001990|    Bindu| P00118542|     F|    26-35| 35|             1| Uttar Pradesh| Central|Automobile|            Auto|     3|23924.0|[35.0,3.0,1.0]|
+-------+---------+----------+------+---------+---+--------------+--------------+--------+----------+-----

In [None]:
# Training needs only the label ('Amount') and the assembled feature vector
# ('Independent'), so keep just those two columns.
finalized_data = output.select(['Amount', 'Independent'])
finalized_data.show(n=3)

+-------+--------------+
| Amount|   Independent|
+-------+--------------+
|23952.0|[28.0,1.0,0.0]|
|23934.0|[35.0,3.0,1.0]|
|23924.0|[35.0,3.0,1.0]|
+-------+--------------+
only showing top 3 rows



In [None]:
from pyspark.ml.regression import LinearRegression

# Hold out roughly 20% of the rows for evaluation. A fixed seed keeps the
# split — and therefore the fitted coefficients and error metrics shown
# below — repeatable for the same input data instead of changing on every run.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)

# The unfitted estimator gets its own name so that `regressor` only ever
# refers to one kind of object (the fitted model).
lr_estimator = LinearRegression(featuresCol='Independent', labelCol='Amount')

# `regressor` is the fitted model; later cells read its coefficients,
# intercept and evaluation summary.
regressor = lr_estimator.fit(train_data)

In [None]:
# Fitted weights, one per assembled feature, in the assembler's inputCols
# order: Age, Orders, Marital_Status.
regressor.coefficients

DenseVector([13.0648, -50.6775, -190.7537])

In [None]:
# Fitted intercept term of the linear model.
regressor.intercept

9210.199368600932

In [None]:
# Score the held-out split; the returned summary carries both the per-row
# predictions and the aggregate error metrics used in the cells below.
pred_results = regressor.evaluate(dataset=test_data)

In [None]:
# Print the first 20 held-out rows with the actual amount next to the
# model's prediction.
pred_results.predictions.show(n=20)

+------+--------------+-----------------+
|Amount|   Independent|       prediction|
+------+--------------+-----------------+
| 206.0|[37.0,3.0,0.0]|9541.563293891495|
| 370.0|[19.0,4.0,1.0]|9064.966324454372|
| 407.0|[33.0,1.0,0.0]|9590.659186643155|
| 579.0|[30.0,2.0,1.0]|9310.033716117994|
| 582.0|[32.0,4.0,0.0]| 9425.56198135675|
| 686.0|[50.0,2.0,0.0]|9762.082739179286|
| 738.0|[28.0,2.0,1.0]|9283.904182929731|
| 744.0|[28.0,2.0,1.0]|9283.904182929731|
| 750.0|[44.0,1.0,1.0]|9543.617927999916|
| 760.0|[19.0,4.0,1.0]|9064.966324454372|
| 760.0|[29.0,1.0,1.0]|9347.646429087954|
| 766.0|[23.0,4.0,0.0]|9307.979082009573|
| 770.0|[27.0,4.0,0.0]|9360.238148386095|
| 771.0|[25.0,4.0,1.0]|9143.354924019157|
| 772.0|[55.0,1.0,0.0]|9878.084051714033|
| 785.0|[21.0,3.0,0.0]|9332.527028385402|
| 883.0|[29.0,1.0,0.0]|9538.400120266633|
| 883.0|[29.0,3.0,0.0]|9437.045161138449|
| 942.0|[55.0,2.0,0.0]|9827.406572149941|
| 951.0|[20.0,1.0,1.0]|9230.063529740777|
+------+--------------+-----------

In [None]:
# Aggregate test-set errors, displayed as a (MAE, MSE) pair.
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(4287.383886815544, 27358423.2776139)