# Installing Dependencies & Running a SparkSession


In [None]:
#install pyspark
!pip install pyspark

In [3]:
# start a SparkSession named "spark", or reuse the one that is already running
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("spark")
spark = session_builder.getOrCreate()

# Cloning the Dataset

In [None]:
# fetch the repository that holds the admissions CSV; the next cell reads it from ./admission_dataset
! git clone https://github.com/education454/admission_dataset

In [5]:
# load the admissions CSV into a Spark DataFrame:
# the first row supplies the column names and column types are inferred from the data
csv_path = "admission_dataset/Admission_Predict_Ver1.1.csv"
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [7]:
# preview the first rows of the freshly loaded DataFrame
df.show()

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [9]:
# report the dataset shape as a (rows, columns) tuple
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

(500, 9)


In [11]:
# print the inferred schema: column name, data type and nullability of every column
df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [13]:
# summary statistics (count, mean, stddev, ...) for every column
# the debug setting is raised first; presumably to keep Spark from truncating wide field lists
spark.conf.set("spark.sql.debug.maxToStringFields", "1000")
summary_stats = df.describe()
summary_stats.show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# Data Cleaning

In [15]:
# drop the unnecessary column: 'Serial No' is just a running row number (1, 2, 3, ...)
# NOTE(review): presumably it carries no predictive signal — confirm before modeling
df = df.drop('Serial No')

In [17]:
# confirm that 'Serial No' is no longer part of the DataFrame
df.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [19]:
# count, column by column, the rows that hold a null value
for column_name in df.columns:
    null_rows = df.filter(df[column_name].isNull())
    print(column_name + ":", null_rows.count())

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


# Correlation Analysis & Feature Selection

In [21]:
# correlation of every column with the target
# (the target itself is included in the loop, hence the trivial 1.0 on the last line)
target = 'Chance of Admit'
for column_name in df.columns:
    corr_value = df.stat.corr(target, column_name)
    print('correlation to chance of admit for {} is {}'.format(column_name, corr_value))

correlation to chance of admit for GRE Score is 0.8103506354632598
correlation to chance of admit for TOEFL Score is 0.7922276143050823
correlation to chance of admit for University Rating is 0.6901323687886892
correlation to chance of admit for SOP is 0.6841365241316723
correlation to chance of admit for LOR is 0.6453645135280112
correlation to chance of admit for CGPA is 0.882412574904574
correlation to chance of admit for Research is 0.5458710294711379
correlation to chance of admit for Chance of Admit is 1.0


In [23]:
# feature selection: keep the three columns with the highest correlation to the target
# and pack them into a single vector column named 'features'
from pyspark.ml.feature import VectorAssembler

feature_columns = ['GRE Score', 'TOEFL Score', 'CGPA']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [25]:
# run the assembler: appends the 'features' vector column to the DataFrame, then display the result
output_data = assembler.transform(df)
output_data.show()

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# Building the Linear Regression Model

In [27]:
# import the estimator and reduce the data to what the model needs:
# the assembled feature vector and the label column
from pyspark.ml.regression import LinearRegression

model_columns = ['features', 'Chance of Admit']
final_data = output_data.select(model_columns)

In [29]:
# print the schema of the final data to verify it holds only 'features' and the label column
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [31]:
# split the dataset into training (70%) and testing (30%) sets
# a fixed seed keeps the split, and therefore every metric computed below, reproducible
# across "Restart & Run All"; without it each run evaluates on different rows
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [37]:
#build & train the model
# elasticNetParam was dropped on purpose: it only sets the L1/L2 mix of the penalty, and with
# regParam at its default of 0.0 there is no penalty at all, so the value had no effect on the fit.
# To actually regularize, pass regParam > 0 (optionally together with elasticNetParam).
models = LinearRegression(featuresCol='features', labelCol='Chance of Admit')
model = models.fit(train)

24/05/20 17:53:47 WARN Instrumentation: [4cf62285] regParam is zero, which might cause numerical instability and overfitting.


In [39]:
# show the fitted weights (one per feature, in assembler order) and the intercept
fitted_terms = (
    ('coefficients:', model.coefficients),
    ('intercept:', model.intercept),
)
for term_label, term_value in fitted_terms:
    print(term_label, term_value)

coefficients: [0.003000233445834736,0.003012319721086756,0.129702834368776]
intercept: -1.6609176147461988


In [41]:
# get the summary object of the fitted model; it exposes fit metrics such as RMSE and R2
# NOTE(review): these metrics presumably describe the training data only — confirm
summary = model.summary

In [43]:
# report RMSE and R2 taken from the model's fit summary
rmse = summary.rootMeanSquaredError
r2_score = summary.r2
print('RMSE', rmse)
print('R2 SCORE', r2_score)

RMSE 0.059934699535716666
R2 SCORE 0.8111182790981277


# Evaluating the Model

In [46]:
# apply the fitted model to the held-out test data; transform() appends a 'prediction' column
predictions = model.transform(test)

In [48]:
# display the predictions side by side with the actual 'Chance of Admit' values
predictions.show()

+------------------+---------------+-------------------+
|          features|Chance of Admit|         prediction|
+------------------+---------------+-------------------+
| [295.0,99.0,7.65]|           0.57| 0.5145975870837738|
|  [296.0,97.0,7.8]|           0.49| 0.5310286062427512|
| [296.0,99.0,8.03]|           0.61| 0.5668848975897434|
| [297.0,98.0,7.67]|           0.59| 0.5201797909417318|
|  [298.0,99.0,7.6]|           0.46| 0.5171131457028393|
|[298.0,101.0,7.86]|           0.54| 0.5568605220808944|
| [299.0,94.0,7.34]|           0.42|0.47132904360735806|
|[299.0,100.0,7.89]|           0.59| 0.5607395208367052|
|[299.0,100.0,8.02]|           0.63| 0.5776008893046463|
|  [300.0,97.0,8.1]|           0.65| 0.5819403903367228|
|[300.0,100.0,8.66]|           0.64|  0.663610936746498|
|[300.0,101.0,7.88]|           0.59|  0.565455045659939|
|[300.0,104.0,8.16]|           0.71|  0.610808798446457|
| [301.0,97.0,7.88]|           0.44| 0.5564060002214268|
| [301.0,98.0,8.03]|           

In [50]:
# evaluate the model: R2 of the predictions against the true labels of the test set
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='Chance of Admit',
    metricName='r2',
)
test_r2 = evaluator.evaluate(predictions)
print('r2 on the test data', test_r2)

r2 on the test data 0.7869375370863908


In [None]:
#save the model
# overwrite() lets this cell be re-run: a plain model.save('model') raises once the
# 'model' directory exists from a previous run
model.write().overwrite().save('model')