In [1]:
#importing pyspark
# findspark.init() is called before `import pyspark`; presumably it makes the
# local Spark installation importable -- confirm for your environment.
import findspark
findspark.init()
import pyspark

In [2]:
# start (or reuse) the SparkSession shared by the rest of the notebook
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('Linear_Regression')
    .getOrCreate()
)

In [3]:
#importing required libraries
from pyspark.sql.functions import corr
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StandardScaler

In [4]:
# read the fish measurements CSV; the first row is the header and column
# types are inferred from the data
fish_df = spark.read.csv(
    'Fish.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# inspect the column names and the types inferred while reading the CSV
fish_df.printSchema()

root
 |-- Species: string (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Length1: double (nullable = true)
 |-- Length2: double (nullable = true)
 |-- Length3: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Width: double (nullable = true)



In [6]:
# total number of records (rows) in the loaded dataset
fish_df.count()

159

In [7]:
# statistical summary of every column, printed without truncating cell values
summary_df = fish_df.describe()
summary_df.show(n=5, truncate=False)

+-------+---------+------------------+-----------------+------------------+------------------+-----------------+------------------+
|summary|Species  |Weight            |Length1          |Length2           |Length3           |Height           |Width             |
+-------+---------+------------------+-----------------+------------------+------------------+-----------------+------------------+
|count  |159      |159               |159              |159               |159               |159              |159               |
|mean   |null     |398.3264150943396 |26.24716981132075|28.415723270440253|31.227044025157248|8.970993710691822|4.417485534591194 |
|stddev |null     |357.97831655089306|9.996441210553128|10.716328098884247|11.61024583269096 |4.286207619968869|1.6858038699921665|
|min    |Bream    |0.0               |7.5              |8.4               |8.8               |1.7284           |1.0476            |
|max    |Whitefish|1650.0            |59.0             |63.4              |6

In [8]:
# correlation of each candidate feature with the target column (Weight);
# feature_list is reused later when assembling the feature vector
feature_list = ['Length1', 'Length2', 'Length3', 'Height', 'Width']
for feature_name in feature_list:
    weight_corr = corr(feature_name, 'Weight')
    fish_df.select(weight_corr).show()

+---------------------+
|corr(Length1, Weight)|
+---------------------+
|   0.9157117160312038|
+---------------------+

+---------------------+
|corr(Length2, Weight)|
+---------------------+
|   0.9186177013642219|
+---------------------+

+---------------------+
|corr(Length3, Weight)|
+---------------------+
|   0.9230435593620122|
+---------------------+

+--------------------+
|corr(Height, Weight)|
+--------------------+
|  0.7243453291993319|
+--------------------+

+-------------------+
|corr(Width, Weight)|
+-------------------+
| 0.8865066052433449|
+-------------------+



In [9]:
# configure the vector assembler (nothing is fitted in this cell): it packs
# the columns named in feature_list into a single 'Features' vector column
vec_assembler = VectorAssembler(
    inputCols=feature_list,
    outputCol='Features',
)

In [10]:
# apply the assembler to fish_df; the result carries the assembled
# 'Features' vector column that is selected in the next cell
features_df=vec_assembler.transform(fish_df)

In [11]:
# keep only the assembled feature vector and the target column (Weight)
model_df=features_df.select('Features','Weight')

In [12]:
# standardise the assembled feature vector
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split further down, so test rows contribute to the scaling
# statistics.
# NOTE(review): the input and output column names differ only by case
# ("Features" vs "features") -- confirm this is intentional.
Scalerizer = StandardScaler(inputCol="Features", outputCol="features")
scaler_model = Scalerizer.fit(model_df)
scaled_df = scaler_model.transform(model_df)

In [13]:
# preview the first five scaled rows
scaled_df.show(n=5)

+--------------------+------+
|            features|Weight|
+--------------------+------+
|[2.32082593308386...| 242.0|
|[2.40085441353503...| 290.0|
|[2.39085085347864...| 340.0|
|[2.63093629483214...| 363.0|
|[2.65094341494493...| 430.0|
+--------------------+------+
only showing top 5 rows



In [14]:
# modelling DataFrame (not a model yet): scaled feature vector plus the target.
# Note that this rebinds model_df, which previously held the unscaled features.
model_df=scaled_df.select('features','Weight')

In [15]:
# preview five rows with the full, untruncated feature vectors
model_df.show(n=5, truncate=False)

+-----------------------------------------------------------------------------------------------+------+
|features                                                                                       |Weight|
+-----------------------------------------------------------------------------------------------+------+
|[2.3208259330838685,2.370214850238168,2.5839246155778217,2.6876906163690855,2.3846190363880706]|242.0 |
|[2.4008544135350363,2.454198840994639,2.6872816002009348,2.911664834399843,2.5540337619583275] |290.0 |
|[2.3908508534786406,2.4728619500516325,2.6786685181490086,2.8878209124386522,2.785673994224383]|340.0 |
|[2.630936294832144,2.7061508132640504,2.8853824873952343,2.969991453678686,2.6429527653301115] |363.0 |
|[2.650943414944936,2.7061508132640504,2.928447897654865,2.90326580122369,3.0454313763224765]   |430.0 |
+-----------------------------------------------------------------------------------------------+------+
only showing top 5 rows



In [16]:
#splitting the data into an 80/20 ratio for training and testing set
# (the weights passed to randomSplit are 0.8 and 0.2; the fixed seed keeps the
# split, and every metric computed from it, repeatable across re-runs)
train_df,test_df=model_df.randomSplit([0.8,0.2],seed=42)

In [19]:
# configure the linear regression estimator (fitted in the next cell):
# the scaled 'features' vector is the input, 'Weight' is the label
lin_Reg = LinearRegression(
    featuresCol='features',
    labelCol='Weight',
)

In [20]:
# fit the linear regression on the training split only; lr_model is the
# fitted model used for every evaluation and prediction below
lr_model=lin_Reg.fit(train_df)

In [21]:
# intercept (constant term) of the fitted model equation
print('Intercept:',lr_model.intercept)

Intercept: -442.17771085310136


In [22]:
# coefficients of the fitted model, one per entry of the scaled feature vector
# (the vector was assembled from feature_list: Length1, Length2, Length3,
# Height, Width); "weight" here means model weight, not the Weight target
print('Coefficients:',lr_model.coefficients)

Coefficients: [905.7236404215063,-279.1486415994291,-461.33130585332617,156.04290952007082,43.10143979226833]


In [23]:
# evaluate the fitted model on the training split; despite its name,
# train_pred is used below as a metrics summary (r2, meanSquaredError),
# not as a table of predictions
train_pred=lr_model.evaluate(train_df)

In [24]:
# R-squared on the training split, rounded to 4 decimal places
print('R-Squared value for the training set:',round(train_pred.r2,4))

R-Squared value for the training set: 0.8838


In [25]:
# mean squared error on the training split, rounded to 4 decimal places
print('MSE value for the training set:',round(train_pred.meanSquaredError,4))

MSE value for the training set: 13127.3991


In [26]:
# first five residuals taken from the fitted model's own summary
lr_model.summary.residuals.show(5)

+------------------+
|         residuals|
+------------------+
|224.13275744998782|
|  200.583725700335|
| 176.6190092042163|
|157.71584903843956|
| 158.4730799293897|
+------------------+
only showing top 5 rows



In [27]:
# evaluate the fitted model on the held-out test split; despite its name,
# test_pred is used below as a metrics summary (r2, meanSquaredError),
# not as a table of predictions
test_pred=lr_model.evaluate(test_df)

In [28]:
# R-squared on the test split, rounded to 4 decimal places
print('R-Squared value for the test set:',round(test_pred.r2,4))

R-Squared value for the test set: 0.866


In [29]:
# mean squared error on the test split, rounded to 4 decimal places
print('MSE value for the test set:',round(test_pred.meanSquaredError,4))

MSE value for the test set: 24703.513


In [30]:
# predicted and actual columns side by side for the test split;
# the model's 'prediction' column is renamed to 'EstimatedWeight'
predictions = (
    lr_model.transform(test_df)
    .withColumnRenamed('prediction', 'EstimatedWeight')
)
actual_vs_estimated = predictions.select('Weight', 'EstimatedWeight')
actual_vs_estimated.show(5)

+------+-------------------+
|Weight|    EstimatedWeight|
+------+-------------------+
|   7.0| -171.8284778477048|
|  40.0|-16.929718631889898|
|  60.0|  56.94003095985602|
|  70.0|  34.20513180666143|
|  69.0|  37.12306669488072|
+------+-------------------+
only showing top 5 rows

