In [2]:
# Adding pyspark link to system environment to read the library
import os
import sys
os.environ['SPARK_HOME']= r'D:\Keerthesh\Python_Folder\Module\spark-3.0.0-bin-hadoop2.7'

In [3]:
from __future__ import print_function
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [4]:
# Creating a session on spark to execute the code inside this session
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("LinearRegWithSpark")\
        .getOrCreate()

In [5]:
# Reading the CSV file from the folder
dataset = spark.read.csv("Admission_Prediction.csv",header=True)

In [6]:
dataset.show()

+---------+-----------+-----------------+----+----+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating| SOP| LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+----+----+----+--------+---------------+
|   337.00|     118.00|                4|4.50|4.50|9.65|    1.00|           0.92|
|   324.00|     107.00|                4|4.00|4.50|8.87|    1.00|           0.76|
|     null|     104.00|                3|3.00|3.50|8.00|    1.00|           0.72|
|   322.00|     110.00|                3|3.50|2.50|8.67|    1.00|           0.80|
|   314.00|     103.00|                2|2.00|3.00|8.21|    0.00|           0.65|
|   330.00|     115.00|                5|4.50|3.00|9.34|    1.00|           0.90|
|   321.00|     109.00|             null|3.00|4.00|8.20|    1.00|           0.75|
|   308.00|     101.00|                2|3.00|4.00|7.90|    0.00|           0.68|
|   302.00|     102.00|                1|2.00|1.50|8.00|    0.00|           0.50|
|   323.00|     

In [7]:
# Printing the schema of the data structure
dataset.printSchema()

root
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR: string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit: string (nullable = true)



In [8]:
# Converting all the values in the columns to float from string
from pyspark.sql.functions import col
new_data = dataset.select(*(col(c).cast("float").alias(c) for c in dataset.columns))

In [9]:
new_data.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR: float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit: float (nullable = true)



In [10]:
from pyspark.sql.functions import col, count, isnan, when

In [11]:
#let's check for missing values

In [12]:
#checking for null ir nan type values in our columns
new_data.select([count(when(col(c).isNull(), c)).alias(c) for c in new_data.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|       15|         10|               15|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [13]:
from pyspark.ml.feature import Imputer

In [14]:
# Imputing the null values within the columns with average values of non null value columns
imputer = Imputer(inputCols=["GRE Score", "TOEFL Score","University Rating"], 
                  outputCols=["GRE Score", "TOEFL Score","University Rating"])
model = imputer.fit(new_data)

imputed_data = model.transform(new_data)

In [15]:
#checking for null ir nan type values in our columns
imputed_data.select([count(when(col(c).isNull(), c)).alias(c) for c in imputed_data.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|        0|          0|                0|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



In [16]:
# Dropping the column called as 'Chance of Admit'
features = imputed_data.drop('Chance of Admit')

In [19]:
features.show()

+---------+-----------+-----------------+---+---+----+--------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|
+---------+-----------+-----------------+---+---+----+--------+
|    337.0|      118.0|              4.0|4.5|4.5|9.65|     1.0|
|    324.0|      107.0|              4.0|4.0|4.5|8.87|     1.0|
|316.55878|      104.0|              3.0|3.0|3.5| 8.0|     1.0|
|    322.0|      110.0|              3.0|3.5|2.5|8.67|     1.0|
|    314.0|      103.0|              2.0|2.0|3.0|8.21|     0.0|
|    330.0|      115.0|              5.0|4.5|3.0|9.34|     1.0|
|    321.0|      109.0|        3.1216495|3.0|4.0| 8.2|     1.0|
|    308.0|      101.0|              2.0|3.0|4.0| 7.9|     0.0|
|    302.0|      102.0|              1.0|2.0|1.5| 8.0|     0.0|
|    323.0|      108.0|              3.0|3.5|3.0| 8.6|     0.0|
|    325.0|      106.0|              3.0|3.5|4.0| 8.4|     1.0|
|    327.0|      111.0|              4.0|4.0|4.5| 9.0|     1.0|
|316.55878|      112.0|              4.0

In [20]:
#let's assemble our features together using vectorAssembler
assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol="features")

In [21]:
# Assemble all of the feature vectors into one column called features
# This is required as spark needs the data to be in vector
output = assembler.transform(imputed_data)

In [22]:
output.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+--------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|            features|
+---------+-----------+-----------------+---+---+----+--------+---------------+--------------------+
|    337.0|      118.0|              4.0|4.5|4.5|9.65|     1.0|           0.92|[337.0,118.0,4.0,...|
|    324.0|      107.0|              4.0|4.0|4.5|8.87|     1.0|           0.76|[324.0,107.0,4.0,...|
|316.55878|      104.0|              3.0|3.0|3.5| 8.0|     1.0|           0.72|[316.558776855468...|
|    322.0|      110.0|              3.0|3.5|2.5|8.67|     1.0|            0.8|[322.0,110.0,3.0,...|
|    314.0|      103.0|              2.0|2.0|3.0|8.21|     0.0|           0.65|[314.0,103.0,2.0,...|
|    330.0|      115.0|              5.0|4.5|3.0|9.34|     1.0|            0.9|[330.0,115.0,5.0,...|
|    321.0|      109.0|        3.1216495|3.0|4.0| 8.2|     1.0|           0.75|[321.0,109.0

In [23]:
# Selecting only features and Chance of Admit columns from the output dataframe
output= output.select("features", "Chance of Admit")

In [25]:
output.show()

+--------------------+---------------+
|            features|Chance of Admit|
+--------------------+---------------+
|[337.0,118.0,4.0,...|           0.92|
|[324.0,107.0,4.0,...|           0.76|
|[316.558776855468...|           0.72|
|[322.0,110.0,3.0,...|            0.8|
|[314.0,103.0,2.0,...|           0.65|
|[330.0,115.0,5.0,...|            0.9|
|[321.0,109.0,3.12...|           0.75|
|[308.0,101.0,2.0,...|           0.68|
|[302.0,102.0,1.0,...|            0.5|
|[323.0,108.0,3.0,...|           0.45|
|[325.0,106.0,3.0,...|           0.52|
|[327.0,111.0,4.0,...|           0.84|
|[316.558776855468...|           0.78|
|[307.0,109.0,3.0,...|           0.62|
|[311.0,104.0,3.0,...|           0.61|
|[314.0,105.0,3.0,...|           0.54|
|[317.0,107.0,3.0,...|           0.66|
|[319.0,106.0,3.0,...|           0.65|
|[318.0,110.0,3.0,...|           0.63|
|[303.0,102.0,3.0,...|           0.62|
+--------------------+---------------+
only showing top 20 rows



In [26]:
# Splitting the data into train and test with 70:30 split
train_df,test_df = output.randomSplit([0.7, 0.3])

In [27]:
train_df.show()
test_df.show()

+--------------------+---------------+
|            features|Chance of Admit|
+--------------------+---------------+
|[290.0,100.0,1.0,...|           0.47|
|[293.0,97.0,2.0,2...|           0.64|
|[294.0,93.0,1.0,1...|           0.46|
|[294.0,95.0,1.0,1...|           0.49|
|[295.0,93.0,1.0,2...|           0.46|
|[295.0,96.0,2.0,1...|           0.47|
|[295.0,99.0,2.0,2...|           0.57|
|[295.0,101.0,2.0,...|           0.69|
|[296.0,95.0,2.0,3...|           0.44|
|[296.0,97.0,2.0,1...|           0.49|
|[296.0,99.0,2.0,3...|           0.47|
|[297.0,96.0,2.0,2...|           0.43|
|[297.0,96.0,2.0,2...|           0.34|
|[297.0,98.0,2.0,2...|           0.59|
|[297.0,100.0,1.0,...|           0.52|
|[297.0,101.0,3.0,...|           0.57|
|[298.0,92.0,1.0,2...|           0.51|
|[298.0,97.0,3.121...|           0.45|
|[298.0,99.0,1.0,1...|           0.53|
|[298.0,100.0,3.0,...|           0.58|
+--------------------+---------------+
only showing top 20 rows

+--------------------+---------------+

In [28]:
# Defining the linear regression model for our operations
lin_reg = LinearRegression(featuresCol = 'features', labelCol='Chance of Admit')
# Fitting the model to training data
linear_model = lin_reg.fit(train_df)

In [29]:
print("Coefficients: " + str(linear_model.coefficients))
print("Intercept: " + str(linear_model.intercept))

Coefficients: [0.0024239810550611193,0.0020768529388602882,0.002473888480576707,0.008259129138856396,0.015289196658832941,0.10888762566840014,0.028165459402548915]
Intercept: -1.3052979320092042


In [30]:
# getting the summary of the training data after transforming the data
trainSummary = linear_model.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("r2: %f" % trainSummary.r2)

RMSE: 0.057636
r2: 0.825256


In [31]:
# predicting the values dereived from the model to test data

predictions = linear_model.transform(test_df)
predictions.select("prediction","Chance of Admit","features").show()

+-------------------+---------------+--------------------+
|         prediction|Chance of Admit|            features|
+-------------------+---------------+--------------------+
| 0.4905877750870906|           0.45|[290.0,104.0,4.0,...|
|0.48159020692912224|           0.37|[295.0,99.0,1.0,2...|
| 0.5559950977325527|           0.61|[296.0,99.0,2.0,2...|
| 0.5272088548595062|            0.6|[296.0,101.0,1.0,...|
| 0.5588303621755133|           0.54|[297.0,99.0,4.0,3...|
| 0.5209621246393878|           0.44|[298.0,98.0,2.0,1...|
| 0.5787994989415157|           0.34|[298.0,98.0,2.0,4...|
| 0.5305821855026938|           0.54|[298.0,101.0,2.0,...|
| 0.5916666379812403|           0.53|[298.0,101.0,4.0,...|
| 0.6625037040968029|           0.69|[298.0,105.0,3.0,...|
|  0.557235833970458|           0.38|[299.0,97.0,3.0,5...|
| 0.5950781445397622|           0.62|[300.0,95.0,2.0,3...|
| 0.5559219186281801|           0.61|[300.0,98.0,1.0,2...|
| 0.5486507412491004|           0.58|[300.0,99.0,1.0,1..

In [32]:
# evaluating the model based on the test data and prediction data results
from pyspark.ml.evaluation import RegressionEvaluator
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Chance of Admit",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.804776


In [33]:
# ending the spark session
spark.stop()