# Data Preparation

In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

# Start (or reuse) a Spark session running locally with the "local[*]" master.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

# Handle to the active SparkContext.
sc = SparkContext.getOrCreate()

In [3]:
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer



In [4]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load the insurance CSV as a Spark DataFrame.
# NOTE(review): the recorded run of this cell failed with "Path does not exist";
# presumably Google Drive was not mounted at /content/drive — confirm before re-running.
# inferSchema=True asks Spark to detect numeric column types; with only header=True
# every column is read as a string.
ins = spark.read.csv(
    "/content/drive/MyDrive/Sample_Data/insurance.csv",
    header=True,
    inferSchema=True,
)

AnalysisException: Path does not exist: file:/content/drive/MyDrive/Sample_Data/insurance.csv

In [None]:
# Read the same insurance CSV into a pandas DataFrame; the encoding cells below work on `df`.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Sample_Data/insurance.csv',
)

In [None]:
# Display the raw frame as loaded from the CSV.
df

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

Encoding for `sex`: female = 0, male = 1

In [None]:
# Encode `sex` as an integer flag: "female" -> 0, every other value -> 1.
# The dtype guard makes the cell safe to re-run: once the column is numeric, comparing
# it with "female" matches nothing and would silently recode every row to 1.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = np.where(df['sex'] == "female", 0, 1)

In [None]:
# Encode `smoker` as an integer flag: 'no' -> 0, every other value -> 1.
# The dtype guard makes the cell safe to re-run: once the column is numeric, comparing
# it with 'no' matches nothing and would silently recode every row to 1.
if not pd.api.types.is_numeric_dtype(df['smoker']):
    df['smoker'] = np.where(df['smoker'] == 'no', 0, 1)

In [None]:
# Display the frame after the `sex` and `smoker` columns have been recoded.
df

In [None]:
# Integer code assigned to each region name.
REGION_CODES = {'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3}

# Recode `region` in one step and cast to a real integer dtype. Assigning the codes
# row-group by row-group through `.loc` leaves an `object` column, and any region name
# missing from the mapping would silently stay a string. Here an unmapped name becomes
# NaN and makes the integer cast fail immediately instead.
# The dtype guard keeps the cell re-runnable once the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['region']):
    df['region'] = df['region'].map(REGION_CODES).astype('int64')
df

In [None]:
# Copy `expenses` into a `label` column; the estimators below are configured with labelCol='label'.
df['label'] = df['expenses']

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Hand the encoded pandas frame over to Spark and preview the result.
insDF = spark.createDataFrame(data=df)
insDF.show()

In [None]:
# Predictor columns that are packed into the single `features` vector column.
feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
vectorAssembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)

In [None]:
# Append the assembled `features` column to the Spark frame and preview it.
vinsDF = vectorAssembler.transform(dataset=insDF)
vinsDF.show()

In [None]:
#indexer = StringIndexer(inputCol='expenses', outputCol='label')
#ivinsDF = indexer.fit(vinsDF).transform(vinsDF)
#ivinsDF.show()

In [None]:
# 70/30 random split with a fixed seed of 1; `splits` is kept alongside the named halves.
splits = vinsDF.randomSplit(weights=[0.7, 0.3], seed=1)
trainDF, testDF = splits[0], splits[1]

In [None]:
# Preview rows of the training split.
trainDF.show()

In [None]:
# Preview rows of the test split.
testDF.show()

In [None]:
testDF.select('features').where('expenses == 1737.38').take(1)

In [None]:
testDF.select('features').where('expenses == 2331.52').take(1)

In [None]:
# Number of rows in the assembled frame (train and test together).
vinsDF.count()

# Linear Regression

In [None]:
lr = LinearRegression(featuresCol='features', labelCol='label')

In [None]:
lrModel = lr.fit(trainDF)

In [None]:
lrPrediction = lrModel.transform(testDF)
lrPrediction.show()

In [None]:
evaluator = MulticlassClassificationEvaluator()

In [None]:
evaluator.setPredictionCol("prediction")

In [None]:
dfAccuracy = evaluator.evaluate(lrPrediction)

In [None]:
dfAccuracy

In [None]:
from pyspark.ml.feature import ChiSqSelector

In [None]:
from pyspark.ml.feature import UnivariateFeatureSelector

# Keep the top 90% of features ranked by a univariate test against `label`.
# `label` is a continuous amount, so the selector is told the label is continuous.
# A chi-squared selector would treat every distinct expense value as a separate category.
# NOTE(review): all features are declared "continuous" although sex/smoker/region are
# integer codes — confirm this is acceptable for the ranking.
selector = UnivariateFeatureSelector(
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="label",
    selectionMode="percentile",
)
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(0.9)

# Always start from the untouched halves in `splits` rather than from trainDF/testDF,
# which this cell overwrites. Re-running the cell then yields the same result instead
# of selecting again from already-reduced features.
model = selector.fit(splits[0])
result = model.transform(splits[0])
trainDF = result.select('label', 'selectedFeatures').withColumnRenamed('selectedFeatures', 'features')
new_test = model.transform(splits[1])
testDF = new_test.select('label', 'selectedFeatures').withColumnRenamed('selectedFeatures', 'features')

In [None]:
# Show the pandas frame ordered by ascending expenses (the frame itself is not modified).
df.sort_values('expenses', ascending=True)

In [None]:
# Fitted weight for each entry of the `features` vector the model was trained on.
lrModel.coefficients

In [None]:
# RMSE reported by the model's own summary.
# NOTE(review): presumably computed on the training split, not on testDF — confirm.
lrModel.summary.rootMeanSquaredError

# Decision Tree

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [None]:
dt = DecisionTreeRegressor(featuresCol='features', labelCol='label')

In [None]:
dtModel = dt.fit(trainDF)

In [None]:
dtPredictions = dtModel.transform(testDF)

In [None]:
dtPredictions.show()

In [None]:
dtEvaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName='rmse')

In [None]:
rmse = dtEvaluator.evaluate(dtPredictions)
rmse

# Random Forest

In [None]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [None]:
# Random-forest regressor that reads the `features` vector and predicts `label`.
# The seed is pinned (same value as the train/test split) so the randomly built forest,
# and therefore its RMSE, is reproducible from one kernel session to the next.
rf = RandomForestRegressor(featuresCol='features', labelCol='label', seed=1)

In [None]:
rfModel = rf.fit(trainDF)

In [None]:
rfPrediction = rfModel.transform(testDF)

In [None]:
rfpdf = rfPrediction.toPandas()

In [None]:
rfpdf['prediction']

In [None]:
rfPrediction.show()

In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual expenses for the random forest on the test split.
# The x-axis carries `label` (the actual expenses), so it is labelled as such.
fig, ax = plt.subplots()
ax.scatter(rfpdf['label'], rfpdf['prediction'])
ax.set_xlabel('Actual expenses')
ax.set_ylabel('Predicted expenses')
ax.set_title('Random forest: predicted vs. actual expenses');

In [None]:
dtEvaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName='rmse')

In [None]:
rmse = dtEvaluator.evaluate(rfPrediction)

In [None]:
rmse