In [0]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [0]:
table_all = spark.read.csv("/FileStore/tables/diabetes_data-1.csv", inferSchema=True, header=True, sep=",")
table_all.show(14)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [0]:
vector_results = VectorAssembler(inputCols=[("Pregnancies"),("Glucose"),("BloodPressure"),("SkinThickness"),("Insulin"),("BMI"),("DiabetesPedigreeFunction"),("Age")],outputCol="result")
table = vector_results.transform(table_all)
table.show(14)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|              result|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|[6.0,148.0,72.0,3...|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|[1.0,85.0,66.0,29...|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|[8.0,183.0,64.0,0...|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|[1.0,89.0,66.0,23...|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|[0.0,137.0,40.0,3...|
|          5|    116|           

In [0]:
tableTrain, tableTest = table.randomSplit([0.8,0.2])
Random_Forest_Regressor = RandomForestRegressor(featuresCol="result",labelCol="Outcome")
model = Random_Forest_Regressor.fit(tableTest)

In [0]:
prediction = model.transform(tableTest)
prediction.show(14)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|              result|          prediction|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+
|          0|     67|           76|            0|      0|45.3|                   0.194| 46|      0|[0.0,67.0,76.0,0....|  0.3195289610612191|
|          0|     73|            0|            0|      0|21.1|                   0.342| 25|      0|(8,[1,5,6,7],[73....|  0.1768767507002801|
|          0|     94|            0|            0|      0| 0.0|                   0.256| 25|      0|(8,[1,6,7],[94.0,...| 0.14354341736694678|
|          0|     99|            0|            0|      0|25.0|                   0.253| 22|      0|(8,[1,5,6,7],[99....| 0.11666666666666665|
|     

In [0]:
evaluate = RegressionEvaluator(predictionCol="prediction",labelCol="Outcome",metricName="rmse")
print(evaluate.evaluate(prediction))

0.24065948085875882
