In [1]:
# Bootstrap step: findspark.init() runs before any pyspark import in this notebook.
# Presumably it locates the local Spark installation and makes pyspark importable
# (see the findspark docs) — keep this cell first.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session on a local[4] master.
# NOTE(review): with a local master, spark.executor.memory may have no effect;
# spark.driver.memory is probably the relevant setting — confirm.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("MultipleLinearReg")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [3]:
# Load the advertising CSV: first row is the header, fields are comma-separated,
# and column types are inferred by Spark.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = spark.read.csv(
    "C:/Users/htcso/OneDrive/Masaüstü/pythonProject/datasets/Adversiting.csv",
    header=True,
    sep=",",
    inferSchema=True,
)

In [4]:
# Preview the first five rows. Limiting on the Spark side means only those rows
# are collected to the driver, instead of converting the whole DataFrame to
# pandas and then discarding everything but the head.
df.limit(5).toPandas()

Unnamed: 0,ID,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [5]:
# Keep ID plus the TV/Radio/Newspaper columns and expose Sales under the name Label.
# NOTE(review): this rebinds `df`; re-running the cell without reloading the CSV
# would presumably fail because the "Sales" column is no longer present.
df = df.selectExpr("ID", "TV","Radio", "Newspaper", "Sales as Label")
# Preview five rows; limit before converting so only those rows reach the driver.
df.limit(5).toPandas()

Unnamed: 0,ID,TV,Radio,Newspaper,Label
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [6]:
# Numeric predictor columns that are assembled into the feature vector.
num_cols = [
    "TV",
    "Radio",
    "Newspaper",
]
# NOTE(review): `label` does not appear to be referenced by any visible cell
# (the regression sets its label column to the literal "Label") — confirm before relying on it.
label = ["label"]

In [7]:
# Summary statistics per column. describe() yields the five rows shown
# (count, mean, stddev, min, max), so the pandas frame is displayed as-is.
df.describe().toPandas()

Unnamed: 0,summary,ID,TV,Radio,Newspaper,Label
0,count,200.0,200.0,200.0,200.0,200.0
1,mean,100.5,147.0425,23.264000000000024,30.553999999999995,14.022500000000004
2,stddev,57.87918451395112,85.85423631490805,14.846809176168728,21.77862083852283,5.217456565710477
3,min,1.0,0.7,0.0,0.3,1.6
4,max,200.0,296.4,49.6,114.0,27.0


In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack the predictor columns listed in `num_cols` into a single "features" vector column.
vector_assembler = VectorAssembler(inputCols=num_cols, outputCol="features")

In [9]:
from pyspark.ml.regression import LinearRegression

# Unfitted linear regression estimator: reads the assembled "features" vector
# and regresses against the "Label" column.
lr_obj = LinearRegression(featuresCol="features", labelCol="Label")

In [10]:
from pyspark.ml import Pipeline

# Two-stage pipeline: feature assembly first, then the regression estimator.
pipeline_obj = Pipeline(stages=[vector_assembler, lr_obj])

In [11]:
# 80/20 train/test split; the fixed seed keeps the partitioning repeatable across runs.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=142)

In [12]:
# Fit both pipeline stages on the training split only; test_df is held back for scoring.
pipeline_model = pipeline_obj.fit(train_df)

In [13]:
# Score the held-out split; the fitted pipeline appends the `features` and
# `prediction` columns to the input columns.
result_df = pipeline_model.transform(test_df)
# Preview five scored rows; limit before converting so only those rows are
# collected to the driver rather than the full result.
result_df.limit(5).toPandas()

Unnamed: 0,ID,TV,Radio,Newspaper,Label,features,prediction
0,4,151.5,41.3,58.5,18.5,"[151.5, 41.3, 58.5]",17.57651
1,9,8.6,2.1,1.0,4.8,"[8.6, 2.1, 1.0]",3.786783
2,11,66.1,5.8,24.2,8.6,"[66.1, 5.8, 24.2]",7.028963
3,15,204.1,32.9,46.0,19.0,"[204.1, 32.9, 46.0]",18.428555
4,25,62.3,12.6,18.3,9.7,"[62.3, 12.6, 18.3]",8.178136


In [14]:
# The pipeline has two stages (assembler, regression), so the last stage of the
# fitted pipeline is the fitted linear regression model.
lr_model = pipeline_model.stages[-1]

In [15]:
# Fitted slope per feature, in the order the columns were assembled (TV, Radio, Newspaper).
lr_model.coefficients

DenseVector([0.0457, 0.191, -0.0041])

In [16]:
# Fitted intercept term of the regression.
lr_model.intercept

2.996373266653382

In [17]:
# R² from the model's summary.
# NOTE(review): `summary` presumably describes the data the model was fit on
# (train_df), not the held-out test_df — confirm before quoting it as test performance.
lr_model.summary.r2

0.9066490596556787

In [18]:
# RMSE from the model's summary, in the units of the Label column.
# NOTE(review): presumably a training-set figure rather than a test_df figure — confirm.
lr_model.summary.rootMeanSquaredError

1.5657192354052747

In [19]:
# p-values from the model's summary. Four values are returned for three features;
# NOTE(review): the extra last entry is presumably the intercept's p-value — confirm
# against the Spark docs. The third entry (Newspaper) is the only large one.
lr_model.summary.pValues

[0.0, 0.0, 0.5139804175664002, 6.661338147750939e-16]

In [20]:
# Reduced predictor set: Newspaper is dropped, leaving TV and Radio.
num_cols = [
    "TV",
    "Radio",
]
# NOTE(review): `label` does not appear to be referenced by any visible cell
# (the regression sets its label column to the literal "Label") — confirm before relying on it.
label = ["label"]

In [21]:
# Rebuild the assembler so "features" is assembled from the current `num_cols`.
vector_assembler = VectorAssembler(inputCols=num_cols, outputCol="features")

In [22]:
# Fresh, unfitted regression estimator reading "features" and predicting "Label".
lr_obj = LinearRegression(featuresCol="features", labelCol="Label")

In [23]:
# Re-create the two-stage pipeline from the current assembler and estimator.
pipeline_obj = Pipeline(stages=[vector_assembler, lr_obj])

In [24]:
# Same 80/20 split with the same seed, so this fit uses the same partitioning of `df`.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=142)

In [25]:
# Refit the pipeline on the training split; this rebinds `pipeline_model`.
pipeline_model = pipeline_obj.fit(train_df)

In [26]:
# The fitted regression is the final stage of the two-stage fitted pipeline.
lr_model = pipeline_model.stages[-1]

In [27]:
# R² from the summary of the current `lr_model`.
# NOTE(review): presumably computed on the data the model was fit on (train_df) — confirm.
lr_model.summary.r2

0.9063946363969204

In [28]:
# Fitted slope per feature, in assembly order (TV, Radio).
lr_model.coefficients

DenseVector([0.0457, 0.1888])

In [29]:
# Fitted intercept term of the current `lr_model`.
lr_model.intercept

2.926454603092079

In [30]:
# RMSE from the summary of the current `lr_model`, in the units of the Label column.
# NOTE(review): presumably a training-set figure rather than a test_df figure — confirm.
lr_model.summary.rootMeanSquaredError

1.5678514278266518

In [31]:
# p-values from the model's summary. Three values are returned for two features;
# NOTE(review): the extra last entry is presumably the intercept's p-value — confirm
# against the Spark docs.
lr_model.summary.pValues

[0.0, 0.0, 0.0]

In [32]:
import pandas as pd

# One-row pandas frame holding the TV and Radio values to score ad hoc.
d = dict(TV=[100.0], Radio=[10.0])
pd_df = pd.DataFrame(d)
pd_df.head()

Unnamed: 0,TV,Radio
0,100.0,10.0


In [33]:
# Convert the one-row pandas frame into a Spark DataFrame so it can be scored,
# then print it to confirm the TV and Radio columns came through.
predict_df = spark.createDataFrame(pd_df)
predict_df.show()

+-----+-----+
|   TV|Radio|
+-----+-----+
|100.0| 10.0|
+-----+-----+



In [35]:
# Score the ad-hoc row through the fitted pipeline, so the feature assembly used
# is the one fitted together with the regression rather than whatever the global
# `vector_assembler` currently holds. The output keeps the same columns
# (inputs, features, prediction).
pipeline_model.transform(predict_df).show()

+-----+-----+------------+-----------------+
|   TV|Radio|    features|       prediction|
+-----+-----+------------+-----------------+
|100.0| 10.0|[100.0,10.0]|9.386286831310258|
+-----+-----+------------+-----------------+



In [36]:
# Shut down the Spark session at the end of the notebook; cells above need a new
# session (re-run from the top) after this point.
spark.stop()