In [0]:
pip install findspark

In [0]:
from __future__ import print_function

# findspark.init() must run before the first pyspark import: it locates the
# Spark installation and makes the pyspark package importable.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import StandardScaler, VectorAssembler, VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Obtain the SparkSession explicitly: getOrCreate() returns the already
# running session when the environment provides one, and starts a new one
# otherwise, so the notebook also works on a fresh kernel where `spark`
# is not predefined.
spark = SparkSession.builder.getOrCreate()

# Load the admissions CSV, using its first row as the column names.
dataset = spark.read.csv("/FileStore/tables/Admission_Prediction.csv", header=True)

In [0]:
# Print a preview of the rows that were just loaded.
dataset.show()

In [0]:
# Print the column names and types of the raw DataFrame.
dataset.printSchema()

In [0]:
from pyspark.sql.functions import col

# Cast every column to float while keeping the original column names.
new_data = dataset.select(
    [col(name).cast("float").alias(name) for name in dataset.columns]
)

In [0]:
# Print the schema again to confirm the float cast.
new_data.printSchema()

In [0]:
from pyspark.sql.functions import col, count, isnan, when

In [0]:
# Count missing entries per column. A missing value in a float column can be
# either a SQL NULL or a floating-point NaN, so both are tested.
new_data.select(
    [
        count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
        for c in new_data.columns
    ]
).show()

In [0]:
from pyspark.ml.feature import Imputer

In [0]:
# Columns whose missing values are filled in; each column is overwritten
# in place because the output names equal the input names.
impute_cols = ["GRE Score", "TOEFL Score", "University Rating"]
imputer = Imputer(inputCols=impute_cols, outputCols=impute_cols)

# Use a dedicated name for the fitted imputer: `model` is assigned to the
# trained regressor later in the notebook, and sharing the name makes
# out-of-order cell execution pick up the wrong object.
imputer_model = imputer.fit(new_data)

imputed_data = imputer_model.transform(new_data)

In [0]:
# Re-count missing entries per column after imputation. Both SQL NULL and
# floating-point NaN are treated as missing.
imputed_data.select(
    [
        count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
        for c in imputed_data.columns
    ]
).show()

In [0]:
# DataFrame without the label column; its column list is what gets
# assembled into the feature vector.
features = imputed_data.drop('Chance of Admit')

In [0]:
# Combine every non-label column into a single vector column named "features".
assembler = VectorAssembler(inputCols=features.columns, outputCol="features")

In [0]:
# Run the assembler on the imputed data (label column included) so each row
# carries the "features" vector alongside its original columns.
output = assembler.transform(imputed_data)

In [0]:
# Fit a VectorIndexer on the assembled vectors; maxCategories=4 is the
# threshold it uses when deciding which features to treat as categorical.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(output)

In [0]:
featureIndexer = featureIndexer.transform(output)

In [0]:
new_indexed_data = featureIndexer.select("indexedFeatures", "Chance of Admit")

In [0]:
# 70/30 train/test split. The fixed seed makes the split, and therefore the
# reported metric, reproducible across runs.
training, test = new_indexed_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Print a preview of the training split.
training.show()

In [0]:
# Gradient-boosted tree regressor reading the indexed feature vector and
# predicting the "Chance of Admit" label, limited to 10 iterations.
gbt = GBTRegressor(
    featuresCol="indexedFeatures",
    labelCol="Chance of Admit",
    maxIter=10,
)

In [0]:
# Train the regressor on the training split. The feature vectors were already
# indexed in an earlier cell, so fitting here does not run the indexer.
model = gbt.fit(training)

In [0]:
# Make predictions on the held-out test split.
predictions = model.transform(test)

In [0]:
# Print a preview of the scored test rows.
predictions.show()

In [0]:
# Compare the "prediction" column with the true label using root mean
# squared error, then report the value.
evaluator = RegressionEvaluator(
    labelCol="Chance of Admit",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [0]:
# Shut down the Spark session now that the analysis is finished.
# NOTE(review): if `spark` is a session shared with other notebooks on a
# managed cluster, stopping it here may affect them — confirm this is intended.
spark.stop()