In [22]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [23]:
# Spark configuration for this notebook: a local master with 10 threads and
# the application name "regressao_linear_multipla".
# NOTE(review): with a local master the executor runs inside the driver
# process, so "spark.executor.memory" likely has no effect here -- confirm
# whether spark.driver.memory was the intended setting.
conf = (
    SparkConf()
    .setMaster("local[10]")
    .setAppName("regressao_linear_multipla")
    .set("spark.executor.memory", "14g")
)

# Reuse an already-running session if there is one, otherwise start a new one.
# NOTE(review): Hive support is enabled, but none of the cells visible in this
# notebook query Hive -- confirm it is actually required.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [67]:
# Explicit schema for the input CSV so Spark does not have to infer types.
# All three columns are declared nullable.
file_schema = (
    StructType()
    .add("semanas", IntegerType(), True)
    .add("clientes", IntegerType(), True)
    .add("vendas", DoubleType(), True)
)

In [74]:
# Load the CSV: the first line is treated as a header row and the columns are
# typed according to `file_schema` instead of being inferred.
# NOTE(review): 'data.csv' is a relative path -- presumably resolved against
# the kernel's working directory; confirm where the file is expected to live.
dataframe = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(file_schema)
    .load('data.csv')
)

In [75]:
# Collect the loaded Spark DataFrame to the driver as a pandas DataFrame so the
# notebook renders it as a table (fine for a small file; avoid on large data).
dataframe.toPandas()

Unnamed: 0,semanas,clientes,vendas
0,1,907,11.2
1,2,926,11.05
2,3,506,6.84
3,4,741,9.21
4,5,789,9.42
5,6,889,10.08
6,7,874,9.45
7,8,510,6.73
8,9,529,7.24
9,10,420,6.12


In [91]:
# MULTIPLA
feature_assembler = VectorAssembler(inputCols=['semanas','clientes'],outputCol='independent_features')

# SIMPLES
#feature_assembler = VectorAssembler(inputCols=['clientes'],outputCol='independent_features')

output = feature_assembler.transform(dataframe)
output.toPandas()

Unnamed: 0,semanas,clientes,vendas,independent_features
0,1,907,11.2,"[1.0, 907.0]"
1,2,926,11.05,"[2.0, 926.0]"
2,3,506,6.84,"[3.0, 506.0]"
3,4,741,9.21,"[4.0, 741.0]"
4,5,789,9.42,"[5.0, 789.0]"
5,6,889,10.08,"[6.0, 889.0]"
6,7,874,9.45,"[7.0, 874.0]"
7,8,510,6.73,"[8.0, 510.0]"
8,9,529,7.24,"[9.0, 529.0]"
9,10,420,6.12,"[10.0, 420.0]"


In [92]:
# Keep only what the model needs: the assembled feature vector and the
# target column 'vendas'.
dataframe_final = output.select(
    'independent_features',
    'vendas'
)

In [93]:
# Display the modelling table (feature vector + target) as a pandas DataFrame.
dataframe_final.toPandas()

Unnamed: 0,independent_features,vendas
0,"[1.0, 907.0]",11.2
1,"[2.0, 926.0]",11.05
2,"[3.0, 506.0]",6.84
3,"[4.0, 741.0]",9.21
4,"[5.0, 789.0]",9.42
5,"[6.0, 889.0]",10.08
6,"[7.0, 874.0]",9.45
7,"[8.0, 510.0]",6.73
8,"[9.0, 529.0]",7.24
9,"[10.0, 420.0]",6.12


In [94]:
# Randomly split the modelling table into a training set (weight 0.75) and a
# test set (weight 0.25).
# A fixed seed is passed so the same rows land in each split on every run;
# without it the split -- and therefore the fitted coefficients and all metrics
# reported below -- changes each time the notebook is executed.
train_data, test_data = dataframe_final.randomSplit([0.75, 0.25], seed=42)

In [100]:
# Linear regression of 'vendas' on the assembled feature vector; every other
# estimator parameter is left at its default.
regressor = LinearRegression(
    labelCol='vendas',
    featuresCol='independent_features'
)

# Fit on the training split only, so the test split stays unseen.
model = regressor.fit(train_data)

In [114]:
# Report the fitted parameters: the coefficient vector and the intercept.
print("Coefficients: %s" % (model.coefficients,))
print("Intercept: %s" % (model.intercept,))

Coefficients: [-0.018828605289097407,0.008592062200864042]
Intercept: 2.73967903593


In [106]:
# Summary object attached to the fitted model; the cells below read its
# iteration count, objective history, residuals, RMSE and r2.
training_summary = model.summary

In [115]:
# Optimiser diagnostics taken from the training summary: the number of
# iterations run and the objective value recorded for each of them.
print("numIterations: %d" % (training_summary.totalIterations,))
print("objectiveHistory: %s" % (training_summary.objectiveHistory,))

numIterations: 1
objectiveHistory: [0.0]


In [116]:
# Display the residuals exposed by the training summary as a pandas DataFrame.
training_summary.residuals.toPandas()

Unnamed: 0,residuals
0,0.686149
1,0.391729
2,-0.190777
3,0.178917
4,-0.004673
5,-0.185051
6,-0.667341
7,-0.241002
8,-0.040059
9,-0.736575


In [117]:
# Goodness-of-fit figures from the training summary. These come from
# `model.summary`, i.e. they describe the fit itself, not the held-out test split.
print("RMSE: %f" % (training_summary.rootMeanSquaredError,))
print("r2: %f" % (training_summary.r2,))

RMSE: 0.437445
r2: 0.917529


In [118]:
# Evaluate the fitted model on the held-out test split; the result's
# `predictions` DataFrame is displayed in the next cell.
pred_result = model.evaluate(test_data)

In [119]:
# Show the test rows alongside the model's 'prediction' column as a pandas
# DataFrame, for a side-by-side comparison with the actual 'vendas' values.
pred_result.predictions.toPandas()

Unnamed: 0,independent_features,vendas,prediction
0,"[9.0, 529.0]",7.24,7.115422
1,"[13.0, 924.0]",9.46,10.433973
2,"[19.0, 1010.0]",11.77,11.059918
