In [1]:
# Bootstrap Spark for this notebook.
# findspark.init() must run before pyspark is imported.
import findspark
findspark.init()
import pyspark
from pyspark.sql.session import SparkSession

# getOrCreate() reuses an already-running session, so re-running this cell no
# longer fails with "Cannot run multiple SparkContexts at once" the way a
# second direct SparkContext(...) construction does.
spark = SparkSession.builder.appName("LINEARREG").getOrCreate()
# Keep the SparkContext handle available under its original name.
sc = spark.sparkContext

# Linear Regression using PySpark

In [2]:
# Alternative way to create the SparkSession (left disabled: `spark` already exists from the first cell)
#from pyspark.sql import SparkSession
#spark=SparkSession.builder.appName('lin_reg').getOrCreate()

In [3]:
#import Linear Regression from spark's MLlib
from pyspark.ml.regression import LinearRegression

In [4]:
# Read the CSV into a DataFrame: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Report the dataset shape as a (rows, columns) tuple.
df_row_count = df.count()
df_column_count = len(df.columns)
print((df_row_count, df_column_count))

(1232, 6)


In [6]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [7]:
# Peek at the first three rows (returned as a list of Row objects).
df.head(3)

[Row(var_1=734, var_2=688, var_3=81, var_4=0.328, var_5=0.259, output=0.418),
 Row(var_1=700, var_2=600, var_3=94, var_4=0.32, var_5=0.247, output=0.389),
 Row(var_1=712, var_2=705, var_3=93, var_4=0.311, var_5=0.247, output=0.417)]

In [8]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# printed without truncating the long decimal values.
summary_stats = df.describe()
summary_stats.show(n=5, truncate=False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|min    |463              |472              |40                |0.277               |0.214               |0.301              |
|max    |1009             |1103             |116               |0.373               |0.294               |0.491

In [9]:
#import corr function from pyspark functions
from pyspark.sql.functions import corr

In [10]:
# Correlation between the first predictor and the target.
var_1_vs_output = corr('var_1', 'output')
df.select(var_1_vs_output).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



# What is the correlation between output and each of the other variables?

In [11]:
# Correlation of every predictor with the target, shown side by side in one row.
predictor_names = ['var_1', 'var_2', 'var_3', 'var_4', 'var_5']
output_correlations = [corr(name, 'output') for name in predictor_names]
df.select(*output_correlations).show()

+-------------------+-------------------+-------------------+-------------------+-------------------+
|corr(var_1, output)|corr(var_2, output)|corr(var_3, output)|corr(var_4, output)|corr(var_5, output)|
+-------------------+-------------------+-------------------+-------------------+-------------------+
| 0.9187399607627283|0.43652698913681093| 0.4014958408311139| 0.7909100204842113| 0.7904806260381185|
+-------------------+-------------------+-------------------+-------------------+-------------------+



In [12]:
#for i in range(1,6):  # var_1 .. var_5 (range(1,5) would skip var_5)
#    df.select(corr('var_' + str(i),'output')).show()

In [13]:
# Correlation between two predictors (var_1 and var_4) rather than with the target.
df.select(corr('var_1','var_4')).show()

+------------------+
|corr(var_1, var_4)|
+------------------+
|0.9004922105597886|
+------------------+



In [14]:
# Correlation between two predictors (var_1 and var_5) rather than with the target.
df.select(corr('var_1','var_5')).show()

+------------------+
|corr(var_1, var_5)|
+------------------+
|0.8269996418857769|
+------------------+



In [15]:
#import vectorassembler to create dense vectors
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [16]:
# List the column names available for building the input vector.
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [17]:
# Configure an assembler that packs the five predictor columns into a single
# vector column named 'NonscaledFeatures'.
# (Variable name kept as-is: the next cell refers to `vec_assmebler`.)
assembler_inputs = ['var_1', 'var_2', 'var_3', 'var_4', 'var_5']
vec_assmebler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol='NonscaledFeatures',
)

In [18]:
# Run the assembler over df; the result keeps the original columns and adds
# the 'NonscaledFeatures' vector column.
features_df=vec_assmebler.transform(df)

In [19]:
# Confirm that the vector column was added to the schema.
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- NonscaledFeatures: vector (nullable = true)



In [20]:
# Show the first five assembled vectors in full (no truncation).
nonscaled_only = features_df.select('NonscaledFeatures')
nonscaled_only.show(n=5, truncate=False)

+------------------------------+
|NonscaledFeatures             |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [21]:
# Scale each feature to unit standard deviation without centering
# (withStd=True, withMean=False); output goes to the 'features' column.
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="NonscaledFeatures", outputCol="features",
                        withStd=True, withMean=False)
# NOTE(review): the scaler is fitted on every row, before the train/test split
# made later in the notebook, so test-set statistics influence the scaled
# features. Consider fitting on the training rows only.
scalerModel = scaler.fit(features_df)

In [22]:
# Apply the fitted scaler; the result has the scaled 'features' column next to
# the raw 'NonscaledFeatures' vector.
scaledData = scalerModel.transform(features_df)
scaledData.show()

+-----+-----+-----+-----+-----+------+--------------------+--------------------+
|var_1|var_2|var_3|var_4|var_5|output|   NonscaledFeatures|            features|
+-----+-----+-----+-----+-----+------+--------------------+--------------------+
|  734|  688|   81|0.328|0.259| 0.418|[734.0,688.0,81.0...|[8.01885247124805...|
|  700|  600|   94| 0.32|0.247| 0.389|[700.0,600.0,94.0...|[7.64740698892866...|
|  712|  705|   93|0.311|0.247| 0.417|[712.0,705.0,93.0...|[7.77850539445315...|
|  734|  806|   69|0.315| 0.26| 0.415|[734.0,806.0,69.0...|[8.01885247124805...|
|  613|  759|   61|0.302| 0.24| 0.378|[613.0,759.0,61.0...|[6.69694354887609...|
|  748|  676|   85|0.318|0.255| 0.422|[748.0,676.0,85.0...|[8.17180061102662...|
|  669|  588|   97|0.315|0.251| 0.411|[669.0,588.0,97.0...|[7.30873610799039...|
|  667|  845|   68|0.324|0.251| 0.381|[667.0,845.0,68.0...|[7.28688637373630...|
|  758|  890|   64| 0.33|0.274| 0.436|[758.0,890.0,64.0...|[8.28104928229703...|
|  726|  670|   88|0.335|0.2

In [24]:
# Keep only what the regression needs: the scaled feature vector and the target.
model_columns = ['features', 'output']
model_df = scaledData.select(*model_columns)

In [25]:
# Show the first five modelling rows with the feature vectors printed in full.
model_df.show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------+------+
|features                                                                                     |output|
+---------------------------------------------------------------------------------------------+------+
|[8.018852471248053,7.3914965401417625,7.06921077206201,21.848063282324997,20.066274600440245]|0.418 |
|[7.64740698892866,6.446072564077118,8.20377546387443,21.31518369007317,19.136563035941084]   |0.389 |
|[7.778505394453152,7.574135262790614,8.116501256811937,20.71569414878986,19.136563035941084] |0.417 |
|[8.018852471248053,8.659224144410262,6.021920287312082,20.982133944915773,20.143750564148508]|0.415 |
|[6.696943548876098,8.154281793557555,5.323726630812131,20.11620460750655,18.594231289983238] |0.378 |
+---------------------------------------------------------------------------------------------+------+
only showing top 5 rows



In [26]:
# Shape of the modelling DataFrame as a (rows, columns) tuple.
model_row_count = model_df.count()
model_column_count = len(model_df.columns)
print((model_row_count, model_column_count))

(1232, 2)


### Split Data - Train & Test sets


In [27]:
# Split the data roughly 70/30 into training and test sets.
# A fixed seed makes the split -- and every metric computed from it below --
# repeatable across kernel restarts; without it each run trains and evaluates
# on different rows.
SPLIT_SEED = 42
train_df, test_df = model_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [28]:
# Shape of the training set as (rows, columns).
print((train_df.count(), len(train_df.columns)))

(851, 2)


In [29]:
# Shape of the test set as (rows, columns).
print((test_df.count(), len(test_df.columns)))

(381, 2)


In [30]:
# Summary statistics for the training set; only 'output' is summarised, the
# vector column does not appear in the result.
train_df.describe().show()

+-------+--------------------+
|summary|              output|
+-------+--------------------+
|  count|                 851|
|   mean|  0.3961856639247937|
| stddev|0.032625119609424685|
|    min|               0.301|
|    max|               0.491|
+-------+--------------------+



In [31]:
# Summary statistics for the test set; only 'output' is summarised, the
# vector column does not appear in the result.
test_df.describe().show()

+-------+-------------------+
|summary|             output|
+-------+-------------------+
|  count|                381|
|   mean|0.39992388451443645|
| stddev|0.03456172177770165|
|    min|              0.315|
|    max|              0.479|
+-------+-------------------+



## Build Linear Regression Model 

In [32]:
# Configure the estimator: inputs come from the 'features' column (the
# library default, spelled out here) and the target is 'output'.
lin_Reg = LinearRegression(featuresCol='features', labelCol='output')

In [33]:
# Fit the linear regression on the training set; lr_model is the fitted model
# used for all evaluation below.
lr_model=lin_Reg.fit(train_df)

In [34]:
# Fitted intercept term.
lr_model.intercept

0.17252628276692097

In [35]:
# Fitted coefficients, one per entry of the scaled feature vector.
print(lr_model.coefficients)

[0.030340508003037347,0.005493657386689944,0.0026560805565733763,-0.009324831701019921,0.006451964449495263]


In [36]:
# Evaluate the model on the training set. Despite its name, the result is a
# summary object: later cells read .predictions, .meanSquaredError and .r2 from it.
training_predictions=lr_model.evaluate(train_df)

In [37]:
# First ten training rows with the model's prediction next to the true output.
training_predictions.predictions.show(10)

+--------------------+------+-------------------+
|            features|output|         prediction|
+--------------------+------+-------------------+
|[5.05821347981995...| 0.311| 0.3102010406888639|
|[5.06913834694699...| 0.301| 0.3135922054717988|
|[5.30948542374189...| 0.332|0.31824186803126553|
|[5.40780922788526...| 0.315| 0.3264159981053271|
|[5.44058382926639...| 0.318| 0.3221484286321684|
|[5.44058382926639...| 0.325| 0.3314837470221417|
|[5.60445683617200...| 0.339|0.33077926960937765|
|[5.61538170329904...| 0.339|0.32936058997258044|
|[5.63723143755312...| 0.327| 0.3282246930271663|
|[5.67000603893424...| 0.332|0.32760722565742983|
+--------------------+------+-------------------+
only showing top 10 rows



In [38]:
# Mean squared error on the training set.
training_predictions.meanSquaredError

0.00014231847863577967

In [39]:
# Coefficient of determination (R^2) on the training set.
training_predictions.r2

0.8661347967198038

In [40]:
# Evaluate the model on the held-out test set; later cells read .residuals,
# .predictions and the error metrics from the returned summary.
test_results=lr_model.evaluate(test_df)

In [41]:
# First ten test-set residuals (true output minus prediction).
test_results.residuals.show(10)

+--------------------+
|           residuals|
+--------------------+
| 0.00981391447338642|
|0.007560695180705512|
|-1.17340519692199...|
|-0.00399570237925...|
|-0.01354600782541...|
|-0.00634309704327...|
|1.374454088231558E-4|
|-0.01051139769135151|
|0.001761136922648...|
|0.011147367648627005|
+--------------------+
only showing top 10 rows



In [42]:
# First ten test rows with the model's prediction next to the true output.
test_results.predictions.show(10)

+--------------------+------+-------------------+
|            features|output|         prediction|
+--------------------+------+-------------------+
|[5.11283781545516...| 0.329| 0.3191860855266136|
|[5.13468754970924...| 0.319| 0.3114393048192945|
|[5.16746215109036...| 0.315| 0.3151173405196922|
|[5.40780922788526...| 0.327| 0.3309957023792581|
|[5.47335843064751...| 0.315|0.32854600782541454|
|[5.57168223479088...| 0.317|0.32334309704327974|
|[5.58260710191792...| 0.329|0.32886255459117686|
|[5.70278064031537...| 0.317| 0.3275113976913515|
|[5.72463037456945...| 0.336| 0.3342388630773514|
|[5.81202931158578...| 0.351|  0.339852632351373|
+--------------------+------+-------------------+
only showing top 10 rows



In [43]:
# Coefficient of determination (R^2) on the test set.
test_results.r2

0.8740614444615777

In [44]:
# Mean squared error on the test set.
test_results.meanSquaredError

0.0001500403499317034

In [45]:
# Root mean squared error on the test set (same units as 'output').
test_results.rootMeanSquaredError

0.012249095882215283