In [80]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression,RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.mllib.linalg

In [81]:
spark = SparkSession.builder.appName("pyspark ml algo").getOrCreate()

In [82]:
df = spark.read.csv("/home/jawad/Desktop/spark/Admission_Predict.csv")

In [83]:
type(df)


pyspark.sql.dataframe.DataFrame

In [84]:
df.show()

+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|       _c0|      _c1|        _c2|              _c3|_c4| _c5| _c6|     _c7|             _c8|
+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|Serial No.|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
|         1|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|         2|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|         3|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|         4|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|         5|      314|        103|                2|  2|   3|8.21|       0|            0.65|
|         6|      330|        115|                5|4.5|   3|9.34|       1|             0.9|
|         7|      321|        109|                3|  3|   4| 8.2|    

In [85]:
df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string]

In [86]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)



In [87]:
from pyspark.sql.functions import col
for c in df.columns:
    print((c))

_c0
_c1
_c2
_c3
_c4
_c5
_c6
_c7
_c8


In [88]:
df.select(*(col(c) for c in df.columns)).show()

+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|       _c0|      _c1|        _c2|              _c3|_c4| _c5| _c6|     _c7|             _c8|
+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|Serial No.|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
|         1|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|         2|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|         3|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|         4|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|         5|      314|        103|                2|  2|   3|8.21|       0|            0.65|
|         6|      330|        115|                5|4.5|   3|9.34|       1|             0.9|
|         7|      321|        109|                3|  3|   4| 8.2|    

In [89]:
df.select(*(col(c).cast("float") for c in df.columns)).show()


+----+-----+-----+----+----+----+----+----+----+
| _c0|  _c1|  _c2| _c3| _c4| _c5| _c6| _c7| _c8|
+----+-----+-----+----+----+----+----+----+----+
|null| null| null|null|null|null|null|null|null|
| 1.0|337.0|118.0| 4.0| 4.5| 4.5|9.65| 1.0|0.92|
| 2.0|324.0|107.0| 4.0| 4.0| 4.5|8.87| 1.0|0.76|
| 3.0|316.0|104.0| 3.0| 3.0| 3.5| 8.0| 1.0|0.72|
| 4.0|322.0|110.0| 3.0| 3.5| 2.5|8.67| 1.0| 0.8|
| 5.0|314.0|103.0| 2.0| 2.0| 3.0|8.21| 0.0|0.65|
| 6.0|330.0|115.0| 5.0| 4.5| 3.0|9.34| 1.0| 0.9|
| 7.0|321.0|109.0| 3.0| 3.0| 4.0| 8.2| 1.0|0.75|
| 8.0|308.0|101.0| 2.0| 3.0| 4.0| 7.9| 0.0|0.68|
| 9.0|302.0|102.0| 1.0| 2.0| 1.5| 8.0| 0.0| 0.5|
|10.0|323.0|108.0| 3.0| 3.5| 3.0| 8.6| 0.0|0.45|
|11.0|325.0|106.0| 3.0| 3.5| 4.0| 8.4| 1.0|0.52|
|12.0|327.0|111.0| 4.0| 4.0| 4.5| 9.0| 1.0|0.84|
|13.0|328.0|112.0| 4.0| 4.0| 4.5| 9.1| 1.0|0.78|
|14.0|307.0|109.0| 3.0| 4.0| 3.0| 8.0| 1.0|0.62|
|15.0|311.0|104.0| 3.0| 3.5| 2.0| 8.2| 1.0|0.61|
|16.0|314.0|105.0| 3.0| 3.5| 2.5| 8.3| 0.0|0.54|
|17.0|317.0|107.0| 3

In [90]:
df.select(*(col(c).cast("float").alias(c) for c in df.columns)).show()


+----+-----+-----+----+----+----+----+----+----+
| _c0|  _c1|  _c2| _c3| _c4| _c5| _c6| _c7| _c8|
+----+-----+-----+----+----+----+----+----+----+
|null| null| null|null|null|null|null|null|null|
| 1.0|337.0|118.0| 4.0| 4.5| 4.5|9.65| 1.0|0.92|
| 2.0|324.0|107.0| 4.0| 4.0| 4.5|8.87| 1.0|0.76|
| 3.0|316.0|104.0| 3.0| 3.0| 3.5| 8.0| 1.0|0.72|
| 4.0|322.0|110.0| 3.0| 3.5| 2.5|8.67| 1.0| 0.8|
| 5.0|314.0|103.0| 2.0| 2.0| 3.0|8.21| 0.0|0.65|
| 6.0|330.0|115.0| 5.0| 4.5| 3.0|9.34| 1.0| 0.9|
| 7.0|321.0|109.0| 3.0| 3.0| 4.0| 8.2| 1.0|0.75|
| 8.0|308.0|101.0| 2.0| 3.0| 4.0| 7.9| 0.0|0.68|
| 9.0|302.0|102.0| 1.0| 2.0| 1.5| 8.0| 0.0| 0.5|
|10.0|323.0|108.0| 3.0| 3.5| 3.0| 8.6| 0.0|0.45|
|11.0|325.0|106.0| 3.0| 3.5| 4.0| 8.4| 1.0|0.52|
|12.0|327.0|111.0| 4.0| 4.0| 4.5| 9.0| 1.0|0.84|
|13.0|328.0|112.0| 4.0| 4.0| 4.5| 9.1| 1.0|0.78|
|14.0|307.0|109.0| 3.0| 4.0| 3.0| 8.0| 1.0|0.62|
|15.0|311.0|104.0| 3.0| 3.5| 2.0| 8.2| 1.0|0.61|
|16.0|314.0|105.0| 3.0| 3.5| 2.5| 8.3| 0.0|0.54|
|17.0|317.0|107.0| 3

In [91]:
newDf = df.select(*(col(c).cast("float") for c in df.columns))

In [92]:
newDf.printSchema()

root
 |-- _c0: float (nullable = true)
 |-- _c1: float (nullable = true)
 |-- _c2: float (nullable = true)
 |-- _c3: float (nullable = true)
 |-- _c4: float (nullable = true)
 |-- _c5: float (nullable = true)
 |-- _c6: float (nullable = true)
 |-- _c7: float (nullable = true)
 |-- _c8: float (nullable = true)



In [93]:
from pyspark.sql.functions import col,count,isnan,when

In [94]:
newDf.select([count(when(col(c).isNull(),c)).alias(c) for c in newDf.columns]).show()

+---+---+---+---+---+---+---+---+---+
|_c0|_c1|_c2|_c3|_c4|_c5|_c6|_c7|_c8|
+---+---+---+---+---+---+---+---+---+
|  1|  1|  1|  1|  1|  1|  1|  1|  1|
+---+---+---+---+---+---+---+---+---+



In [101]:
from pyspark.ml.feature import Imputer

In [102]:
imputer = Imputer(inputCols=["_c0","_c1","_c2","_c3","_c4","_c5","_c6","_c7","_c8"],
                 outputCols=["_c0","_c1","_c2","_c3","_c4","_c5","_c6","_c7","_c8"])
model = imputer.fit(newDf)
imputedData = model.transform(newDf)

In [103]:
imputedData.show()

+-----+--------+------+------+---+------+--------+------+-------+
|  _c0|     _c1|   _c2|   _c3|_c4|   _c5|     _c6|   _c7|    _c8|
+-----+--------+------+------+---+------+--------+------+-------+
|200.5|316.8075|107.41|3.0875|3.4|3.4525|8.598925|0.5475|0.72435|
|  1.0|   337.0| 118.0|   4.0|4.5|   4.5|    9.65|   1.0|   0.92|
|  2.0|   324.0| 107.0|   4.0|4.0|   4.5|    8.87|   1.0|   0.76|
|  3.0|   316.0| 104.0|   3.0|3.0|   3.5|     8.0|   1.0|   0.72|
|  4.0|   322.0| 110.0|   3.0|3.5|   2.5|    8.67|   1.0|    0.8|
|  5.0|   314.0| 103.0|   2.0|2.0|   3.0|    8.21|   0.0|   0.65|
|  6.0|   330.0| 115.0|   5.0|4.5|   3.0|    9.34|   1.0|    0.9|
|  7.0|   321.0| 109.0|   3.0|3.0|   4.0|     8.2|   1.0|   0.75|
|  8.0|   308.0| 101.0|   2.0|3.0|   4.0|     7.9|   0.0|   0.68|
|  9.0|   302.0| 102.0|   1.0|2.0|   1.5|     8.0|   0.0|    0.5|
| 10.0|   323.0| 108.0|   3.0|3.5|   3.0|     8.6|   0.0|   0.45|
| 11.0|   325.0| 106.0|   3.0|3.5|   4.0|     8.4|   1.0|   0.52|
| 12.0|   

In [104]:
imputedData.select([count(when(col(c).isNull(),c)).alias(c) for c in newDf.columns]).show()

+---+---+---+---+---+---+---+---+---+
|_c0|_c1|_c2|_c3|_c4|_c5|_c6|_c7|_c8|
+---+---+---+---+---+---+---+---+---+
|  0|  0|  0|  0|  0|  0|  0|  0|  0|
+---+---+---+---+---+---+---+---+---+



In [105]:
features = imputedData.drop("_c8")

In [108]:
features.printSchema

<bound method DataFrame.printSchema of DataFrame[_c0: float, _c1: float, _c2: float, _c3: float, _c4: float, _c5: float, _c6: float, _c7: float]>

In [109]:
assambler = VectorAssembler(inputCols=features.columns,outputCol="features")

In [111]:
output = assambler.transform(imputedData)

In [114]:
output.show()

+-----+--------+------+------+---+------+--------+------+-------+--------------------+
|  _c0|     _c1|   _c2|   _c3|_c4|   _c5|     _c6|   _c7|    _c8|            features|
+-----+--------+------+------+---+------+--------+------+-------+--------------------+
|200.5|316.8075|107.41|3.0875|3.4|3.4525|8.598925|0.5475|0.72435|[200.5,316.807495...|
|  1.0|   337.0| 118.0|   4.0|4.5|   4.5|    9.65|   1.0|   0.92|[1.0,337.0,118.0,...|
|  2.0|   324.0| 107.0|   4.0|4.0|   4.5|    8.87|   1.0|   0.76|[2.0,324.0,107.0,...|
|  3.0|   316.0| 104.0|   3.0|3.0|   3.5|     8.0|   1.0|   0.72|[3.0,316.0,104.0,...|
|  4.0|   322.0| 110.0|   3.0|3.5|   2.5|    8.67|   1.0|    0.8|[4.0,322.0,110.0,...|
|  5.0|   314.0| 103.0|   2.0|2.0|   3.0|    8.21|   0.0|   0.65|[5.0,314.0,103.0,...|
|  6.0|   330.0| 115.0|   5.0|4.5|   3.0|    9.34|   1.0|    0.9|[6.0,330.0,115.0,...|
|  7.0|   321.0| 109.0|   3.0|3.0|   4.0|     8.2|   1.0|   0.75|[7.0,321.0,109.0,...|
|  8.0|   308.0| 101.0|   2.0|3.0|   4.0|  

In [116]:
output.select("features").toPandas().values

array([[DenseVector([200.5, 316.8075, 107.41, 3.0875, 3.4, 3.4525, 8.5989, 0.5475])],
       [DenseVector([1.0, 337.0, 118.0, 4.0, 4.5, 4.5, 9.65, 1.0])],
       [DenseVector([2.0, 324.0, 107.0, 4.0, 4.0, 4.5, 8.87, 1.0])],
       [DenseVector([3.0, 316.0, 104.0, 3.0, 3.0, 3.5, 8.0, 1.0])],
       [DenseVector([4.0, 322.0, 110.0, 3.0, 3.5, 2.5, 8.67, 1.0])],
       [DenseVector([5.0, 314.0, 103.0, 2.0, 2.0, 3.0, 8.21, 0.0])],
       [DenseVector([6.0, 330.0, 115.0, 5.0, 4.5, 3.0, 9.34, 1.0])],
       [DenseVector([7.0, 321.0, 109.0, 3.0, 3.0, 4.0, 8.2, 1.0])],
       [DenseVector([8.0, 308.0, 101.0, 2.0, 3.0, 4.0, 7.9, 0.0])],
       [DenseVector([9.0, 302.0, 102.0, 1.0, 2.0, 1.5, 8.0, 0.0])],
       [DenseVector([10.0, 323.0, 108.0, 3.0, 3.5, 3.0, 8.6, 0.0])],
       [DenseVector([11.0, 325.0, 106.0, 3.0, 3.5, 4.0, 8.4, 1.0])],
       [DenseVector([12.0, 327.0, 111.0, 4.0, 4.0, 4.5, 9.0, 1.0])],
       [DenseVector([13.0, 328.0, 112.0, 4.0, 4.0, 4.5, 9.1, 1.0])],
       [DenseVector([

In [117]:
data = output.select("features","_c8")

In [122]:
data.show()

+--------------------+-------+
|            features|    _c8|
+--------------------+-------+
|[200.5,316.807495...|0.72435|
|[1.0,337.0,118.0,...|   0.92|
|[2.0,324.0,107.0,...|   0.76|
|[3.0,316.0,104.0,...|   0.72|
|[4.0,322.0,110.0,...|    0.8|
|[5.0,314.0,103.0,...|   0.65|
|[6.0,330.0,115.0,...|    0.9|
|[7.0,321.0,109.0,...|   0.75|
|[8.0,308.0,101.0,...|   0.68|
|[9.0,302.0,102.0,...|    0.5|
|[10.0,323.0,108.0...|   0.45|
|[11.0,325.0,106.0...|   0.52|
|[12.0,327.0,111.0...|   0.84|
|[13.0,328.0,112.0...|   0.78|
|[14.0,307.0,109.0...|   0.62|
|[15.0,311.0,104.0...|   0.61|
|[16.0,314.0,105.0...|   0.54|
|[17.0,317.0,107.0...|   0.66|
|[18.0,319.0,106.0...|   0.65|
|[19.0,318.0,110.0...|   0.63|
+--------------------+-------+
only showing top 20 rows



In [124]:
data.toPandas().values

array([[DenseVector([200.5, 316.8075, 107.41, 3.0875, 3.4, 3.4525, 8.5989, 0.5475]),
        0.7243499755859375],
       [DenseVector([1.0, 337.0, 118.0, 4.0, 4.5, 4.5, 9.65, 1.0]),
        0.9200000166893005],
       [DenseVector([2.0, 324.0, 107.0, 4.0, 4.0, 4.5, 8.87, 1.0]),
        0.7599999904632568],
       [DenseVector([3.0, 316.0, 104.0, 3.0, 3.0, 3.5, 8.0, 1.0]),
        0.7200000286102295],
       [DenseVector([4.0, 322.0, 110.0, 3.0, 3.5, 2.5, 8.67, 1.0]),
        0.800000011920929],
       [DenseVector([5.0, 314.0, 103.0, 2.0, 2.0, 3.0, 8.21, 0.0]),
        0.6499999761581421],
       [DenseVector([6.0, 330.0, 115.0, 5.0, 4.5, 3.0, 9.34, 1.0]),
        0.8999999761581421],
       [DenseVector([7.0, 321.0, 109.0, 3.0, 3.0, 4.0, 8.2, 1.0]), 0.75],
       [DenseVector([8.0, 308.0, 101.0, 2.0, 3.0, 4.0, 7.9, 0.0]),
        0.6800000071525574],
       [DenseVector([9.0, 302.0, 102.0, 1.0, 2.0, 1.5, 8.0, 0.0]), 0.5],
       [DenseVector([10.0, 323.0, 108.0, 3.0, 3.5, 3.0, 8.6, 0.

In [125]:
train_df,test_df = data.randomSplit([0.7,0.3])

In [127]:
train_df.show()
test_df.show()

+--------------------+----+
|            features| _c8|
+--------------------+----+
|[1.0,337.0,118.0,...|0.92|
|[3.0,316.0,104.0,...|0.72|
|[4.0,322.0,110.0,...| 0.8|
|[5.0,314.0,103.0,...|0.65|
|[7.0,321.0,109.0,...|0.75|
|[8.0,308.0,101.0,...|0.68|
|[10.0,323.0,108.0...|0.45|
|[11.0,325.0,106.0...|0.52|
|[12.0,327.0,111.0...|0.84|
|[15.0,311.0,104.0...|0.61|
|[16.0,314.0,105.0...|0.54|
|[17.0,317.0,107.0...|0.66|
|[18.0,319.0,106.0...|0.65|
|[19.0,318.0,110.0...|0.63|
|[21.0,312.0,107.0...|0.64|
|[22.0,325.0,114.0...| 0.7|
|[24.0,334.0,119.0...|0.95|
|[25.0,336.0,119.0...|0.97|
|[26.0,340.0,120.0...|0.94|
|[28.0,298.0,98.0,...|0.44|
+--------------------+----+
only showing top 20 rows

+--------------------+----+
|            features| _c8|
+--------------------+----+
|[2.0,324.0,107.0,...|0.76|
|[6.0,330.0,115.0,...| 0.9|
|[9.0,302.0,102.0,...| 0.5|
|[13.0,328.0,112.0...|0.78|
|[14.0,307.0,109.0...|0.62|
|[20.0,303.0,102.0...|0.62|
|[23.0,328.0,116.0...|0.94|
|[27.0,322.0,109.0...|

In [129]:
lr = LinearRegression(featuresCol="features", labelCol="_c8")
linearModel = lr.fit(train_df)

In [130]:
print("coefficents:", linearModel.coefficients)
print("intercepts:", linearModel.intercept)

coefficents: [0.0001740053019743413,0.0021213304157050627,0.004616837675371345,0.006676074599082496,0.0024034064025222845,0.022573893093470004,0.0909736180277132,0.02073740457639928]
intercepts: -1.378385917677439


In [135]:
summary = linearModel.summary
print("RMSE",summary.rootMeanSquaredError)
print("r2" ,summary.r2)

RMSE 0.05867934918301974
r2 0.8271913465513027


In [139]:
predictions = linearModel.transform(test_df)
predictions.select("prediction","_c8","features").show()

+------------------+----+--------------------+
|        prediction| _c8|            features|
+------------------+----+--------------------+
|0.7688486078778294|0.76|[2.0,324.0,107.0,...|
|0.8359818759086688| 0.9|[6.0,330.0,115.0,...|
|0.5078720297371739| 0.5|[9.0,302.0,102.0,...|
|0.8232561535003835|0.78|[13.0,328.0,112.0...|
|0.6244237782729847|0.62|[14.0,307.0,109.0...|
|0.6082123259306067|0.62|[20.0,303.0,102.0...|
|0.9002193972773564|0.94|[23.0,328.0,116.0...|
|0.7363881095779996|0.76|[27.0,322.0,109.0...|
|0.5571484188970042|0.65|[31.0,300.0,97.0,...|
|0.9150927978043109|0.94|[35.0,331.0,112.0...|
|0.6346583863161068|0.46|[41.0,308.0,110.0...|
|0.8530813472620842|0.91|[45.0,326.0,113.0...|
|0.7861031203502449|0.82|[49.0,321.0,110.0...|
| 0.650352575307757|0.76|[51.0,313.0,98.0,...|
|0.6857836450038579|0.72|[54.0,324.0,112.0...|
|0.5932986330420849|0.47|[62.0,307.0,101.0...|
|0.6796717457613732|0.56|[64.0,315.0,107.0...|
|0.7325433223424815|0.52|[65.0,325.0,111.0...|
|0.7652261633

In [140]:
!dir


pySpark_initial\ .ipynb
