# TASK 1 : Install Dependencies & Run a SparkSession


In [2]:
#install pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 65 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 55.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=95c2f8e10d10dbe4901a1a926773381a0ee25122979997dd5afd49f7c97d1c37
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [3]:
# Start a Spark session named 'spark', or reuse the active one if it exists.
from pyspark.sql import SparkSession
session_builder = SparkSession.builder.appName('spark')
spark = session_builder.getOrCreate()

# TASK 2 : Clone & Explore dataset

In [4]:
#clone the dataset
! git clone https://github.com/education454/admission_dataset

Cloning into 'admission_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), done.


In [5]:
#check the presence of dataset
# List the cloned repository directory to confirm the CSV file is there.
! ls admission_dataset

Admission_Predict_Ver1.1.csv


In [6]:
#create a spark dataframe
# Build the path from the current working directory (where the clone cell
# placed the repo) instead of hard-coding Colab's /content, so the notebook
# also runs outside Colab. header=True takes column names from the first row;
# inferSchema=True lets Spark infer the column types.
import os
csv_path = os.path.abspath(os.path.join('admission_dataset', 'Admission_Predict_Ver1.1.csv'))
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [7]:
#display dataframe
# Print the leading rows of the freshly loaded DataFrame as a text table.
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [8]:
# Dataset dimensions: row count (triggers a Spark job) and number of columns.
rows, length = df.count(), len(df.columns)
print(rows, length)

500 9


In [9]:
#print schema
# Shows each column's name, type and nullability; the types were inferred
# by the CSV reader because the file was loaded with inferSchema=True.
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [10]:
#get the summary statistics
# describe() builds a small DataFrame of per-column statistics
# (count, mean, stddev, ...); show() prints it.
df.describe().show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# TASK 3 : Data Cleaning

In [11]:
#drop the unnecessary column
# 'Serial No' is a running row number (1, 2, 3, ... in the displayed rows),
# not an applicant attribute, so it is removed before analysis.
# NOTE: `df` is rebound here; every later cell sees the frame without this column.
df = df.drop('Serial No')

In [12]:
#display the dataframe
# Re-display to confirm that 'Serial No' is no longer among the columns.
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [13]:
#check for null values
# Count the nulls of every column in a single aggregation instead of running
# one filter + count Spark job per column. when() yields 1 for null cells and
# null otherwise, and count() only counts the non-null results.
from pyspark.sql import functions as F
null_counts = df.select(
    [F.count(F.when(df[c].isNull(), 1)).alias(c) for c in df.columns]
).first()
for c in df.columns:
  print(c + ':', null_counts[c])

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


# TASK 4 : Correlation Analysis & Feature Selection

In [14]:
# correlation analysis
# Correlation of every column with the target column, used to pick the
# strongest predictors. The target itself is included in the loop and
# trivially yields 1.0.
# Fixes the message typo: the target is "chance of admit", not "admin".
for i in df.columns:
  cor = df.stat.corr('Chance of Admit', i)
  print('Correlation to chance of admit col for {} is {}'.format(i, cor))

Correlation to chance of admin col for GRE Score is 0.8103506354632598
Correlation to chance of admin col for TOEFL Score is 0.7922276143050823
Correlation to chance of admin col for University Rating is 0.6901323687886892
Correlation to chance of admin col for SOP is 0.6841365241316723
Correlation to chance of admin col for LOR is 0.6453645135280112
Correlation to chance of admin col for CGPA is 0.882412574904574
Correlation to chance of admin col for Research is 0.5458710294711379
Correlation to chance of admin col for Chance of Admit is 1.0


In [15]:
# Feature selection: keep the three columns with the highest correlation to
# 'Chance of Admit' in the analysis above and pack them into one vector column.
from pyspark.ml.feature import VectorAssembler
feature_cols = ['GRE Score', 'TOEFL Score', 'CGPA']
assembler = VectorAssembler(
    outputCol='features',
    inputCols=feature_cols,
)

In [16]:
#apply the assembler and display the resulting dataframe
# transform() keeps the existing columns and appends the 'features' vector
# column built from the assembler's input columns.
output_data = assembler.transform(df)
output_data.show()


+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# TASK 5 : Build the Linear Regression Model

In [17]:
# Import the estimator and reduce the frame to the two columns the model
# needs: the assembled feature vector and the label.
from pyspark.ml.regression import LinearRegression
model_columns = ['features', 'Chance of Admit']
final_data = output_data.select(model_columns)
final_data.show()

+------------------+---------------+
|          features|Chance of Admit|
+------------------+---------------+
|[337.0,118.0,9.65]|           0.92|
|[324.0,107.0,8.87]|           0.76|
| [316.0,104.0,8.0]|           0.72|
|[322.0,110.0,8.67]|            0.8|
|[314.0,103.0,8.21]|           0.65|
|[330.0,115.0,9.34]|            0.9|
| [321.0,109.0,8.2]|           0.75|
| [308.0,101.0,7.9]|           0.68|
| [302.0,102.0,8.0]|            0.5|
| [323.0,108.0,8.6]|           0.45|
| [325.0,106.0,8.4]|           0.52|
| [327.0,111.0,9.0]|           0.84|
| [328.0,112.0,9.1]|           0.78|
| [307.0,109.0,8.0]|           0.62|
| [311.0,104.0,8.2]|           0.61|
| [314.0,105.0,8.3]|           0.54|
| [317.0,107.0,8.7]|           0.66|
| [319.0,106.0,8.0]|           0.65|
| [318.0,110.0,8.8]|           0.63|
| [303.0,102.0,8.5]|           0.62|
+------------------+---------------+
only showing top 20 rows



In [29]:
#print schema of final data
# Confirms the frame holds only the 'features' vector and the label column.
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [30]:
#split the dataset into training and testing set
# 70% / 30% split. The seed is fixed so the partition, and every coefficient
# and metric derived from it, is the same on each run; without a seed
# randomSplit draws a different split every time the cell executes.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)
train.show()
test.show()

+------------------+---------------+
|          features|Chance of Admit|
+------------------+---------------+
|[290.0,100.0,7.56]|           0.47|
|  [293.0,97.0,7.8]|           0.64|
| [294.0,95.0,7.64]|           0.49|
|  [295.0,93.0,7.2]|           0.46|
| [295.0,99.0,7.65]|           0.57|
|[295.0,101.0,7.86]|           0.69|
|  [296.0,97.0,7.8]|           0.49|
| [296.0,99.0,8.03]|           0.61|
|[296.0,101.0,7.68]|            0.6|
| [297.0,96.0,7.43]|           0.34|
| [297.0,98.0,7.67]|           0.59|
| [297.0,99.0,7.81]|           0.54|
| [297.0,100.0,7.9]|           0.52|
|[297.0,101.0,7.67]|           0.57|
| [298.0,92.0,7.88]|           0.51|
| [298.0,99.0,7.46]|           0.53|
|[298.0,101.0,7.86]|           0.54|
|[298.0,105.0,8.54]|           0.69|
| [299.0,94.0,7.34]|           0.42|
| [299.0,96.0,7.86]|           0.54|
+------------------+---------------+
only showing top 20 rows

+------------------+---------------+
|          features|Chance of Admit|
+-----------

In [31]:
# Configure the linear-regression estimator, then fit it on the training split.
model = LinearRegression(
    labelCol='Chance of Admit',
    featuresCol='features',
)
lr_model = model.fit(train)

In [33]:
# Fitted parameters: one coefficient per entry of the feature vector,
# followed by the intercept, each printed on its own line.
coefficients, intercept = lr_model.coefficients, lr_model.intercept
print(coefficients, intercept, sep='\n')

[0.0022319207761229954,0.0031857090023803014,0.14253597369181636]
-1.5454477861723417


In [38]:
#get summary of the model
# lr_model was fitted on `train`, so the metrics read from this summary
# describe the fit on the training split, not on held-out data.
summary = lr_model.summary

In [39]:
# Root-mean-squared error and R^2 taken from the model summary,
# each printed on its own line.
rmse, r2 = summary.rootMeanSquaredError, summary.r2
print(rmse, r2, sep='\n')

0.061401180702098546
0.8039233459763617


# TASK 6 : Evaluate & Save the Model

In [41]:
#transform on the test data
# Scores the held-out split; the result keeps the 'features' and label
# columns and adds a 'prediction' column.
predictions = lr_model.transform(test)

In [42]:
#display the predictions
# Shows the actual 'Chance of Admit' next to the model's 'prediction'.
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|[290.0,104.0,7.46]|           0.45| 0.4964413388918283|
| [294.0,93.0,7.36]|           0.46|0.45607262560095574|
| [295.0,96.0,7.34]|           0.47|0.46501095391038305|
| [295.0,99.0,7.57]|           0.37| 0.5073513548666417|
| [296.0,95.0,7.54]|           0.44|0.49256436042248897|
| [296.0,99.0,7.28]|           0.47| 0.4682478432721382|
| [297.0,96.0,7.89]|           0.43| 0.5478695809931278|
| [298.0,97.0,7.21]|           0.45| 0.4563627486611965|
|  [298.0,98.0,7.5]|           0.44| 0.5008838900342032|
| [298.0,98.0,8.03]|           0.34| 0.5764279560908658|
|  [298.0,99.0,7.6]|           0.46| 0.5183231964057651|
|[298.0,100.0,7.95]|           0.58| 0.5713964962002809|
|[298.0,101.0,7.69]|           0.53| 0.5375228520427893|
| [299.0,97.0,7.66]|           0.38| 0.5227358575986365|
|[299.0,102.0,8.62]|           

In [46]:
# Evaluate on the held-out predictions: R^2 between the label column and
# the model's 'prediction' column.
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    metricName='r2',
    labelCol='Chance of Admit',
    predictionCol='prediction',
)
test_r2 = evaluator.evaluate(predictions)
print(test_r2)

0.8026638734552771


In [67]:
#save the model
# Go through the writer with overwrite() so the cell can be re-run:
# a plain save('abc') raises once the 'abc' directory already exists.
lr_model.write().overwrite().save('abc')

In [68]:
#load the model
# Reads the fitted model back from the 'abc' directory written by the save cell.
# NOTE: this rebinds `model`, which earlier held the unfitted LinearRegression
# estimator; from here on `model` is the loaded LinearRegressionModel.
from pyspark.ml.regression import LinearRegressionModel
model = LinearRegressionModel.load('abc')

In [70]:
#score the test set with the reloaded model
# `test` is rebound to the scored frame, so on a second run it already carries
# a 'prediction' column and transform() would fail because its output column
# exists. Dropping it first (a no-op when the column is absent) keeps the
# cell re-runnable while leaving `test` with the same columns as before.
test = model.transform(test.drop('prediction'))
test.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
|[290.0,104.0,7.46]|           0.45| 0.4964413388918283|
| [294.0,93.0,7.36]|           0.46|0.45607262560095574|
| [295.0,96.0,7.34]|           0.47|0.46501095391038305|
| [295.0,99.0,7.57]|           0.37| 0.5073513548666417|
| [296.0,95.0,7.54]|           0.44|0.49256436042248897|
| [296.0,99.0,7.28]|           0.47| 0.4682478432721382|
| [297.0,96.0,7.89]|           0.43| 0.5478695809931278|
| [298.0,97.0,7.21]|           0.45| 0.4563627486611965|
|  [298.0,98.0,7.5]|           0.44| 0.5008838900342032|
| [298.0,98.0,8.03]|           0.34| 0.5764279560908658|
|  [298.0,99.0,7.6]|           0.46| 0.5183231964057651|
|[298.0,100.0,7.95]|           0.58| 0.5713964962002809|
|[298.0,101.0,7.69]|           0.53| 0.5375228520427893|
| [299.0,97.0,7.66]|           0.38| 0.5227358575986365|
|[299.0,102.0,8.62]|           