In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session registered under the app name 'Regression'.
spark = (
    SparkSession.builder
    .appName('Regression')
    .getOrCreate()
)

In [2]:
# Bare expression: evaluates the session object so the notebook echoes it as this cell's output.
spark

In [11]:
# Load the tips dataset: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    'tips.csv',
    header=True,
    inferSchema=True,
)

In [12]:
# Preview the first three rows of the raw data.
df.show(n=3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [13]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



We need to convert the string (categorical) features into numeric values.

In [14]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
# Label-encode the categorical columns (sex, smoker, day, time) into numeric
# index columns named '<column>_en'.
categorical_cols = ['sex', 'smoker', 'day', 'time']
encoded_cols = [f'{name}_en' for name in categorical_cols]
# Make the cell safe to re-run: drop encoded columns left over from a previous
# execution, otherwise transform() fails because the output columns already
# exist. drop() ignores names that are not present, so the first run is unchanged.
df = df.drop(*encoded_cols)
# d is the fitted StringIndexerModel; transform() appends the encoded columns.
d = StringIndexer(inputCols=categorical_cols, outputCols=encoded_cols).fit(df)
df = d.transform(df)
df.show(3)

+----------+----+------+------+---+------+----+------+---------+------+-------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_en|smoker_en|day_en|time_en|
+----------+----+------+------+---+------+----+------+---------+------+-------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|   1.0|      0.0|   1.0|    0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   0.0|      0.0|   1.0|    0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|   0.0|      0.0|   1.0|    0.0|
+----------+----+------+------+---+------+----+------+---------+------+-------+
only showing top 3 rows



In [22]:
# Sum every numeric column separately for each smoker_en group.
# The *_en columns are category indices, so their sums are index-weighted row
# counts rather than meaningful totals.
(
    df.groupBy('smoker_en')
    .sum()
    .show()
)

+---------+------------------+-----------------+---------+-----------+--------------+-----------+------------+
|smoker_en|   sum(total_bill)|         sum(tip)|sum(size)|sum(sex_en)|sum(smoker_en)|sum(day_en)|sum(time_en)|
+---------+------------------+-----------------+---------+-----------+--------------+-----------+------------+
|      0.0| 2897.430000000001|451.7700000000001|      403|       54.0|           0.0|      159.0|        45.0|
|      1.0|1930.3400000000001|           279.81|      224|       33.0|          93.0|       98.0|        23.0|
+---------+------------------+-----------------+---------+-----------+--------------+-----------+------------+



In [23]:
# Count, per column, how many rows hold NaN or NULL across the whole DataFrame.
from pyspark.sql.functions import col,isnan, when, count
# One aggregate per column: when() yields a non-null value only for rows that
# are missing, and count() counts just those non-null values.
missing_count_exprs = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(missing_count_exprs).show()

+----------+---+---+------+---+----+----+------+---------+------+-------+
|total_bill|tip|sex|smoker|day|time|size|sex_en|smoker_en|day_en|time_en|
+----------+---+---+------+---+----+----+------+---------+------+-------+
|         0|  0|  0|     0|  0|   0|   0|     0|        0|     0|      0|
+----------+---+---+------+---+----+----+------+---------+------+-------+



In [25]:
# List every column name, including the encoded '*_en' columns added earlier.
print(df.columns)

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size', 'sex_en', 'smoker_en', 'day_en', 'time_en']


In [26]:
# Columns that are assembled into the model's feature vector: the two numeric
# inputs plus the four label-encoded categorical columns.
required_columns = [
    'tip',
    'size',
    'sex_en',
    'smoker_en',
    'day_en',
    'time_en',
]

In [29]:
from pyspark.ml.feature import VectorAssembler
# Combine the selected columns into one vector column, 'IndependentFeatures'.
vector_assembler = VectorAssembler(
    inputCols=required_columns,
    outputCol='IndependentFeatures',
)

In [30]:
# Append the assembled 'IndependentFeatures' vector column; the result is bound to a new name, new_df.
new_df = vector_assembler.transform(df)

In [31]:
# Preview three rows to confirm the feature vector column was added.
new_df.show(n=3)

+----------+----+------+------+---+------+----+------+---------+------+-------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_en|smoker_en|day_en|time_en| IndependentFeatures|
+----------+----+------+------+---+------+----+------+---------+------+-------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|   1.0|      0.0|   1.0|    0.0|[1.01,2.0,1.0,0.0...|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   0.0|      0.0|   1.0|    0.0|[1.66,3.0,0.0,0.0...|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|   0.0|      0.0|   1.0|    0.0|[3.5,3.0,0.0,0.0,...|
+----------+----+------+------+---+------+----+------+---------+------+-------+--------------------+
only showing top 3 rows



In [35]:
# Keep only what the model needs: the feature vector and the target column.
final_data = new_df.select(
    'IndependentFeatures',
    'total_bill',
)

In [36]:
# Preview three rows of the modelling dataset.
final_data.show(n=3)

+--------------------+----------+
| IndependentFeatures|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|[1.66,3.0,0.0,0.0...|     10.34|
|[3.5,3.0,0.0,0.0,...|     21.01|
+--------------------+----------+
only showing top 3 rows



In [37]:
# 70/30 train/test split. A fixed seed makes the split repeatable for the same
# input data, so row counts, coefficients and metrics downstream can be
# reproduced after a kernel restart instead of changing on every run.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [38]:
# Number of rows that landed in the training split.
train.count()

173

In [39]:
# Number of rows that landed in the test split.
test.count()

71

In [40]:
from pyspark.ml.regression import LinearRegression
regressor = LinearRegression(featuresCol='IndependentFeatures', labelCol='total_bill')

In [41]:
regressor = regressor.fit(train)

In [42]:
# Fitted weights, one per entry of the feature vector, in the order the columns were given to the VectorAssembler.
regressor.coefficients

DenseVector([3.3279, 3.1231, -1.1942, 1.7721, -0.7524, -0.3378])

In [43]:
# Fitted intercept (constant term) of the linear model.
regressor.intercept

2.4896079646728873

In [45]:
# Model evaluation on the held-out test split.
# Despite its name, y_pred is the summary object returned by evaluate(), not a
# column of predictions: later cells read its .predictions DataFrame and its
# r2 / meanAbsoluteError / meanSquaredError metrics.
y_pred = regressor.evaluate(test)

In [48]:
# Show the actual total_bill next to the model's prediction for test rows.
y_pred.predictions.show()

+--------------------+----------+------------------+
| IndependentFeatures|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.97,2.0])|     12.02|15.291752697972319|
| (6,[0,1],[2.0,2.0])|     12.69|15.391589595278099|
|(6,[0,1],[2.72,2.0])|     13.28|17.787675130616822|
| (6,[0,1],[3.0,4.0])|     20.45|24.965674649023974|
|(6,[0,1],[3.15,3.0])|     20.08| 22.34176489710961|
|(6,[0,1],[3.39,2.0])|     11.61| 20.01736583711258|
|(6,[0,1],[4.08,2.0])|     17.92|22.313614475145524|
|(6,[0,1],[6.73,4.0])|     48.27|  37.3787288807093|
|[1.0,1.0,1.0,1.0,...|      3.07|  9.51852213414649|
|[1.0,2.0,0.0,1.0,...|      12.6| 13.83578593529759|
|[1.25,2.0,1.0,0.0...|      8.51| 9.858826575624178|
|[1.32,2.0,0.0,0.0...|      9.68|12.376192136556137|
|[1.36,3.0,1.0,0.0...|     18.64| 13.34798943752197|
|[1.5,2.0,0.0,0.0,...|     12.46| 11.47035794747559|
|[1.5,2.0,1.0,0.0,...|     10.65|10.690800719839013|
|[1.57,2.0,0.0,0.0...|     15.42| 13.208166280

#### Comparison between actual and predicted values using regression performance metrics:
1. R² (coefficient of determination)
2. Mean Absolute Error
3. Mean Squared Error

In [51]:
# Test-set metrics as a tuple: (R², mean absolute error, mean squared error).
(
    y_pred.r2,
    y_pred.meanAbsoluteError,
    y_pred.meanSquaredError,
)

(0.40301216030614784, 4.572273167951223, 43.153212131911154)