In [1]:
# Locate the local Spark installation before any `pyspark` import in the cells below.
# NOTE(review): relies on SPARK_HOME (or a discoverable install) being present — confirm.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Local-mode session using 4 worker threads.
# With a `local[*]` master the executors live inside the driver JVM, so
# `spark.executor.memory` does not size the heap; `spark.driver.memory` does.
# The setting only takes effect when this call starts the JVM (fresh kernel);
# `getOrCreate()` on an already-running session keeps the existing heap size.
spark = (
    SparkSession.builder
    .appName("SimpleLinearReg")
    .master("local[4]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [3]:
import os

# Location of the advertising CSV. The default is the original author's path;
# set the ADVERTISING_CSV environment variable to run the notebook on another
# machine without editing this cell.
DATA_PATH = os.environ.get(
    "ADVERTISING_CSV",
    "C:/Users/htcso/OneDrive/Masaüstü/pythonProject/datasets/Adversiting.csv",
)

# Comma-separated file with a header row; column types are inferred by Spark.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load(DATA_PATH)
)

In [4]:
# Preview the first rows. Limit on the Spark side so only 5 rows are collected
# to the driver instead of converting the whole DataFrame to pandas first.
df.limit(5).toPandas()

Unnamed: 0,ID,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [5]:
# Collapse the three per-channel spends into a single total-spend column,
# rename the target column to "label", and drop the per-channel columns.
# NOTE: this rebinds `df`; re-running this cell without re-running the load
# cell fails because TV/Radio/Newspaper no longer exist on `df`.
df = (
    df.withColumn("advertisement", df.TV + df.Radio + df.Newspaper)
    .withColumnRenamed("Sales", "label")
    .drop("TV", "Radio", "Newspaper")
)

# Preview: limit in Spark so only 5 rows are collected to the driver.
df.limit(5).toPandas()

Unnamed: 0,ID,label,advertisement
0,1,22.1,337.1
1,2,10.4,128.9
2,3,9.3,132.4
3,4,18.5,251.3
4,5,12.9,250.0


In [6]:
# Summary statistics for the target and the single feature, shown as a pandas table.
(
    df
    .describe("label", "advertisement")
    .toPandas()
    .head()
)

Unnamed: 0,summary,label,advertisement
0,count,200.0,200.0
1,mean,14.022500000000004,200.86049999999992
2,stddev,5.217456565710477,92.9851805869837
3,min,1.6,11.7
4,max,27.0,433.6


In [7]:
from pyspark.ml.feature import VectorAssembler

# Packs the scalar `advertisement` column into the vector column `features`.
vector_assembler = VectorAssembler(
    inputCols=["advertisement"],
    outputCol="features",
)

In [8]:
from pyspark.ml.regression import LinearRegression

# Unfitted estimator: regress `label` on the assembled `features` vector.
linear_reg_obj = LinearRegression(
    labelCol="label",
    featuresCol="features",
)

In [9]:
from pyspark.ml import Pipeline

# Two-stage pipeline: assemble the feature vector, then fit the regression.
pipeline_obj = Pipeline(
    stages=[
        vector_assembler,
        linear_reg_obj,
    ]
)

In [10]:
# 80/20 train/test split with a fixed seed.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [11]:
# Fit both pipeline stages on the training split only.
pipeline_model = pipeline_obj.fit(train_df)

In [12]:
# Score the held-out split; adds the `features` and `prediction` columns.
result_df = pipeline_model.transform(test_df)

# Preview: limit in Spark so only 5 scored rows are collected to the driver.
result_df.limit(5).toPandas()

Unnamed: 0,ID,label,advertisement,features,prediction
0,3,9.3,132.4,[132.39999999999998],10.680252
1,7,11.8,113.8,[113.8],9.763225
2,9,4.8,11.7,[11.7],4.729432
3,14,9.7,112.3,[112.3],9.689271
4,20,14.6,190.3,[190.3],13.534871


In [13]:
# Inspect the fitted stages: the assembler followed by the fitted regression model.
pipeline_model.stages

[VectorAssembler_803225039187,
 LinearRegressionModel: uid=LinearRegression_98438675719e, numFeatures=1]

In [14]:
# The fitted LinearRegressionModel is the final stage of the two-stage pipeline.
lr_model = pipeline_model.stages[-1]

In [15]:
# Fitted slope(s); one entry because the model has a single feature (`advertisement`).
lr_model.coefficients

DenseVector([0.0493])

In [16]:
# Fitted intercept of the regression line.
lr_model.intercept

4.1525924428297

In [17]:
# R² from the model's summary. The model was fit on `train_df`, so this is a
# training-set metric, not a score on `test_df`.
lr_model.summary.r2

0.7598154344381861

In [18]:
# p-values of the fitted terms.
# NOTE(review): per the Spark docs the last element is the intercept when
# fitIntercept is true (the default), so the first value belongs to
# `advertisement` — confirm against the installed Spark version.
lr_model.summary.pValues

[0.0, 2.4424906541753444e-15]

In [19]:
# Root mean squared error from the model's summary (same units as `label`);
# computed on the data the model was fit on, not on `test_df`.
lr_model.summary.rootMeanSquaredError

2.5847874097485826

In [20]:
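# Fitted model: label ≈ 4.1525924428297 + 0.0493 * advertisement
# (0.0493 is the rounded coefficient as displayed by DenseVector; use lr_model.coefficients[0] for the full value)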

In [21]:
# One-row DataFrame holding the advertisement spend to score.
# Built directly with createDataFrame; the earlier RDD -> map -> toDF round trip
# produced the same single-column frame with an extra Python-side hop.
df_predict = spark.createDataFrame([(100.0,)], ["advertisement"])

In [22]:
# Print the single-row input frame to check its column name and value.
df_predict.show()

+-------------+
|advertisement|
+-------------+
|        100.0|
+-------------+



In [23]:
# Reuse the same assembler as the pipeline so the new row gets a `features` vector column.
df_pred_vec = vector_assembler.transform(df_predict)

In [24]:
# Score the assembled single-row frame with the fitted regression model and
# display the result as a pandas table.
(
    lr_model
    .transform(df_pred_vec)
    .toPandas()
    .head()
)

Unnamed: 0,advertisement,features,prediction
0,100.0,[100.0],9.082849


In [25]:
# Shut down the session; any DataFrame created above is unusable after this call.
spark.stop()