In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=7d07406a6f047de0f0d907b67cceb2157c0abd23b5f57bbe77f5a2b98bfe7130
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [8]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [9]:
spark = SparkSession.builder.appName("Insurance_RF").getOrCreate()

In [10]:
df = spark.read.csv("insurance.csv", header=True, inferSchema=True)

## <span style="line-height:1.7">**insurance.csv**를 로드하여 <b><font color='blue'>Random forest regressor</font></b>을 생성하시오.</span>
> - **X**(feature)는 **age, sex, bmi, children** 4개의 column으로 하고, **charges**를 **y**로 한다.
>
> - **학습(train)** 데이터와 **테스트(test)** 데이터를 **6:4** 비율로 나눈다.
>  
> - 학습 데이터를 **3개 그룹**으로 분리하여 **cross validation**을 수행하되, <font color='blue'>**minInstancesPerNode**</font> parameter는 <font color='red'>**1**</font>, <font color='red'>**100**</font>, <font color='blue'>**featureSubsetStrategy**</font> parameter는 <font color='red'>**onethird**</font>, <font color='red'>**sqrt**</font>가 적용된 regressor 중 <font color='red'>**R2 score**</font>가 가장 높은 best model을 찾으시오. <br>(**best model의 minInstancesPerNode와 featureSubsetStrategy**를 찾으시오.)
>  
> - **best model**의 <font color='red'><b>training data</b>에 대한 <b>R2</b></font>, <font color='red'><b>test data</b>에 대한 <b>R2</b></font>를 찾으시오.

In [30]:
rf = RandomForestRegressor(featuresCol='features', labelCol='charges')

cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=ParamGridBuilder()
                    .addGrid(rf.minInstancesPerNode, [1, 100])
                    .addGrid(rf.featureSubsetStrategy, ['onethird', 'sqrt'])
                    .build(),
                    evaluator=RegressionEvaluator(labelCol='charges'),
                    numFolds=3)

cvModel = cv.fit(train)

best_model = cvModel.bestModel

train_predictions = best_model.transform(train)
test_predictions = best_model.transform(test)

evaluator = RegressionEvaluator(labelCol='charges', predictionCol='prediction', metricName='r2')
train_r2 = evaluator.evaluate(train_predictions)
test_r2 = evaluator.evaluate(test_predictions)

print('Best model parameters:')
print(f'minInstancesPerNode: {best_model.getMinInstancesPerNode()}')
print(f'featureSubsetStrategy: {best_model.getFeatureSubsetStrategy()}')
print('Training R2:', train_r2)
print('Test R2:', test_r2)


Best model parameters:
minInstancesPerNode: 100
featureSubsetStrategy: onethird
Training R2: 0.13071607145491482
Test R2: 0.10732446373910398
