In [0]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [0]:
# Load the car dataset: a semicolon-delimited CSV with a header row, letting
# Spark infer the column types instead of declaring a schema.
# NOTE(review): the previewed values look inconsistently scaled (e.g. Consumo
# 21 vs 228, Peso 262 vs 2875) -- presumably decimal separators were lost in
# the source file; verify before trusting any model metrics.
Carros_temp = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("/FileStore/tables/Carros.csv")
)
# Preview the first five rows.
Carros_temp.show(n=5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [0]:
# Keep only the four columns this notebook works with.
Carros = Carros_temp.select(
    ["Consumo", "Cilindros", "Cilindradas", "HP"]
)
# Preview the first five rows of the reduced frame.
Carros.show(n=5)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
+-------+---------+-----------+---+
only showing top 5 rows



In [0]:
# Feature vectorization: pack the predictor columns into a single vector
# column named "Características".
veccaracteristicas = VectorAssembler(
    inputCols=["Consumo", "Cilindros", "Cilindradas"],
    outputCol="Características",
)
# This cell rebinds `Carros` to its own output, so on a second run the frame
# already carries the vector column and VectorAssembler raises because the
# output column exists. Dropping it first (a no-op when the column is absent)
# keeps the cell safe to re-run.
Carros = veccaracteristicas.transform(Carros.drop("Características"))
Carros.show(5)

+-------+---------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas| HP|  Características|
+-------+---------+-----------+---+-----------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|
|     21|        6|        160|110| [21.0,6.0,160.0]|
|    228|        4|        108| 93|[228.0,4.0,108.0]|
|    214|        6|        258|110|[214.0,6.0,258.0]|
|    187|        8|        360|175|[187.0,8.0,360.0]|
+-------+---------+-----------+---+-----------------+
only showing top 5 rows



In [0]:
# Train/test split: roughly 70% of the rows for training, 30% for testing.
# A fixed seed makes the split (and therefore the row counts, the fitted model
# and the reported RMSE) reproducible across runs; without it every execution
# draws a different partition.
CarrosTreino, CarrosTeste = Carros.randomSplit([0.7, 0.3], seed=42)
# Report how many rows landed in each partition.
print(CarrosTreino.count())
print(CarrosTeste.count())

18
14


In [0]:
# Model: linear regression that reads the assembled feature vector and uses
# HP as the label; no other hyperparameters are passed, so Spark's defaults apply.
reglin = LinearRegression(
    labelCol="HP",
    featuresCol="Características",
)
# Fit on the training partition only.
modelo = reglin.fit(CarrosTreino)

In [0]:
# Predict: score the test rows with the fitted model. The result keeps the
# input columns and gains a "prediction" column.
previsao = modelo.transform(CarrosTeste)
# Preview the first five scored rows.
previsao.show(n=5)

+-------+---------+-----------+---+-----------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|  Características|        prediction|
+-------+---------+-----------+---+-----------------+------------------+
|     15|        8|        301|335| [15.0,8.0,301.0]|191.42657084954985|
|     21|        6|        160|110| [21.0,6.0,160.0]|128.26993562009395|
|    104|        8|        460|215|[104.0,8.0,460.0]|193.79983665454407|
|    104|        8|        472|205|[104.0,8.0,472.0]|193.91421936417865|
|    133|        8|        350|245|[133.0,8.0,350.0]|193.03080210275576|
+-------+---------+-----------+---+-----------------+------------------+
only showing top 5 rows



In [0]:
# Evaluate performance: root-mean-squared error between the "prediction"
# column and the HP label on the scored test rows.
avaliar = RegressionEvaluator(
    metricName="rmse",
    labelCol="HP",
    predictionCol="prediction",
)
rmse = avaliar.evaluate(previsao)
print(rmse)

48.90471729185873
