In [11]:
from pathlib import Path
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler


BASE_DATA_DIR = Path().home() / "Documents/PySparkCurso/download"

spark: SparkSession = (
    SparkSession.builder.master("local").appName("Ml with spark").getOrCreate()
)

In [3]:
cars_temp: DataFrame = spark.read.csv(
    str(BASE_DATA_DIR / "Carros.csv"), inferSchema=True, header=True, sep=";"
)

cars_temp.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [4]:
cars: DataFrame = cars_temp.select("Consumo", "Cilindros", "Cilindradas", "HP")
cars.show(5)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
+-------+---------+-----------+---+
only showing top 5 rows



In [5]:
vec_carac = VectorAssembler(
    inputCols=[("Consumo"), ("Cilindros"), ("Cilindradas")],
    outputCol="characteristics",
)

In [6]:
cars = vec_carac.transform(cars)
cars.show(5)

+-------+---------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas| HP|  characteristics|
+-------+---------+-----------+---+-----------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|
|     21|        6|        160|110| [21.0,6.0,160.0]|
|    228|        4|        108| 93|[228.0,4.0,108.0]|
|    214|        6|        258|110|[214.0,6.0,258.0]|
|    187|        8|        360|175|[187.0,8.0,360.0]|
+-------+---------+-----------+---+-----------------+
only showing top 5 rows



In [7]:
cars_traine, car_test = cars.randomSplit([0.7, 0.3])

print(cars_traine.count(), car_test.count())

22 10


### Modelo Regressão linear

In [8]:
reglin = LinearRegression(featuresCol="characteristics", labelCol="HP")
model = reglin.fit(cars_traine)

24/04/15 14:20:26 WARN Instrumentation: [cd13b401] regParam is zero, which might cause numerical instability and overfitting.


In [9]:
prev: DataFrame = model.transform(car_test)
prev.show(5)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   characteristics|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    133|        8|        350|245| [133.0,8.0,350.0]|215.40613079110423|
|    152|        8|        304|150| [152.0,8.0,304.0]| 213.8041142265771|
|    152|        8|       2758|180|[152.0,8.0,2758.0]|186.72919714566717|
|    155|        8|        318|150| [155.0,8.0,318.0]|213.31656838415577|
|    158|        8|        351|264| [158.0,8.0,351.0]|212.61939604436802|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows



In [10]:
aval = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="HP",
    metricName="rmse",
)

rmse = aval.evaluate(prev)
print(rmse)

41.26235225342907


### RandomForestRegression

In [12]:
rfr = RandomForestRegressor(featuresCol="characteristics", labelCol="HP")
model_2 = rfr.fit(cars_traine)

24/04/15 14:25:42 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 22 (= number of training instances)


In [13]:
prev_2: DataFrame = model_2.transform(car_test)
prev_2.show(5)



+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   characteristics|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    133|        8|        350|245| [133.0,8.0,350.0]|207.91118326118325|
|    152|        8|        304|150| [152.0,8.0,304.0]| 214.3695165945166|
|    152|        8|       2758|180|[152.0,8.0,2758.0]| 195.8945165945166|
|    155|        8|        318|150| [155.0,8.0,318.0]| 214.3695165945166|
|    158|        8|        351|264| [158.0,8.0,351.0]|192.23618326118327|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows



In [14]:
rmse_2 = aval.evaluate(prev_2)

print(rmse_2, rmse)

45.82873897758248 41.26235225342907
