In [8]:
from pathlib import Path
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline


# Folder holding the downloaded course datasets, resolved under the current
# user's home directory. Path.home() is a classmethod, so it is called on the
# class itself rather than on a throwaway Path() instance.
BASE_DATA_DIR = Path.home() / "Documents/PySparkCurso/download"

# Spark session in local mode, used by every cell below. getOrCreate() hands
# back the already-running session when this cell is executed again.
spark: SparkSession = (
    SparkSession.builder
    .master("local")
    .appName("Ml with spark")
    .getOrCreate()
)

In [4]:
# Load the raw cars dataset: semicolon-delimited, first row is the header,
# column types inferred by Spark.
# NOTE(review): the preview shows Peso values of mixed magnitude (262 vs 2875)
# and Consumo values such as 228, which suggests decimal separators may have
# been lost in the file - confirm against the raw CSV before trusting a model.
cars_temp: DataFrame = spark.read.options(
    inferSchema=True,
    header=True,
    sep=";",
).csv(str(BASE_DATA_DIR / "Carros.csv"))
cars_temp.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [5]:
# Narrow the raw frame to the four columns the regression uses: HP (the label)
# plus the three columns that are assembled into the feature vector.
cars = cars_temp.select(
    "Consumo",
    "Cilindros",
    "Cilindradas",
    "HP",
)
cars.show(5)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
+-------+---------+-----------+---+
only showing top 5 rows



In [6]:
# Pack the predictor columns into a single vector column named
# "caracteristicas"; HP is deliberately left out because it is the label.
vec_carac = VectorAssembler(
    # Plain column-name strings. Wrapping each name in parentheses does not
    # build a tuple and only makes the list harder to read.
    inputCols=["Consumo", "Cilindros", "Cilindradas"],
    outputCol="caracteristicas",
)

# Append the assembled vector to the selected columns; this frame is what the
# standalone regression below is fitted on.
vec_car_traine = vec_carac.transform(cars)
vec_car_traine.show(5)

+-------+---------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas| HP|  caracteristicas|
+-------+---------+-----------+---+-----------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|
|     21|        6|        160|110| [21.0,6.0,160.0]|
|    228|        4|        108| 93|[228.0,4.0,108.0]|
|    214|        6|        258|110|[214.0,6.0,258.0]|
|    187|        8|        360|175|[187.0,8.0,360.0]|
+-------+---------+-----------+---+-----------------+
only showing top 5 rows



In [7]:
# Linear regression that predicts HP from the assembled feature vector.
# regParam is not set, so it stays at zero and Spark logs a warning at fit
# time about possible numerical instability and overfitting.
reglin = LinearRegression(
    featuresCol="caracteristicas",
    labelCol="HP",
)

# Fit directly on the pre-assembled frame (no pipeline involved here).
model = reglin.fit(vec_car_traine)

24/04/15 15:56:32 WARN Instrumentation: [41a8e95d] regParam is zero, which might cause numerical instability and overfitting.


In [9]:
# Chain the assembler and the regressor so that both steps run from the raw
# columns with a single fit/transform call.
pipeline = Pipeline(
    stages=[
        vec_carac,  # builds "caracteristicas" from the raw columns
        reglin,  # fits HP on "caracteristicas"
    ]
)

# Fitting on `cars` (not the pre-assembled frame): the pipeline assembles the
# feature vector itself.
pipeline_model = pipeline.fit(cars)

24/04/15 15:58:09 WARN Instrumentation: [e9227fc9] regParam is zero, which might cause numerical instability and overfitting.


In [10]:
# Score the same rows the pipeline was fitted on, so these are in-sample
# predictions. The result carries the assembled vector and a "prediction"
# column alongside the original columns.
prev = pipeline_model.transform(cars)

# 20 rows is the default that a bare show() prints; spelled out for clarity.
prev.show(n=20)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|162.32154816816646|
|     21|        6|        160|110|  [21.0,6.0,160.0]|162.32154816816646|
|    228|        4|        108| 93| [228.0,4.0,108.0]| 82.51715587712931|
|    214|        6|        258|110| [214.0,6.0,258.0]|141.86680518718754|
|    187|        8|        360|175| [187.0,8.0,360.0]|202.93528239714834|
|    181|        6|        225|105| [181.0,6.0,225.0]| 145.4980634611832|
|    143|        8|        360|245| [143.0,8.0,360.0]|   207.41448530972|
|    244|        4|       1467| 62|[244.0,4.0,1467.0]| 69.69282676584851|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]| 71.80767356085781|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|132.42483285541724|
|    178|        6|       1676|123|[17