In [15]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature    import VectorAssembler
from pyspark.sql.session   import SparkSession
from pyspark.sql           import functions as f

In [7]:
# Create the Spark session named 'ML' (or reuse one that is already running)
# and display it as the cell's output.
spark = (
    SparkSession.builder
    .appName('ML')
    .getOrCreate()
)
spark

In [47]:
# Reader options for the semicolon-delimited CSV: the first line is the header
# and Spark infers each column's type from the data.
# NOTE(review): the first rows displayed further down show Consumo values of 21
# and 228 (and Peso values of 262 and 2875) side by side, which looks like
# decimal separators were lost in the source file -- confirm against the raw data.
params = {
    'delimiter': ';',
    'header': 'True',
    'inferschema': 'True',
}
carros = (
    spark.read
    .format('csv')
    .options(**params)
    .load('datasets/Carros.csv')
)

In [48]:
# Print column names and inferred types to check the result of schema inference.
carros.printSchema()

root
 |-- Consumo: integer (nullable = true)
 |-- Cilindros: integer (nullable = true)
 |-- Cilindradas: integer (nullable = true)
 |-- RelEixoTraseiro: integer (nullable = true)
 |-- Peso: integer (nullable = true)
 |-- Tempo: integer (nullable = true)
 |-- TipoMotor: integer (nullable = true)
 |-- Transmissao: integer (nullable = true)
 |-- Marchas: integer (nullable = true)
 |-- Carburadors: integer (nullable = true)
 |-- HP: integer (nullable = true)



In [49]:
# Summary statistics (count, mean, ...) for every column of the loaded frame.
carros.describe().show()

+-------+-----------------+------------------+-----------------+------------------+------------------+-----------------+------------------+-------------------+------------------+------------------+-----------------+
|summary|          Consumo|         Cilindros|      Cilindradas|   RelEixoTraseiro|              Peso|            Tempo|         TipoMotor|        Transmissao|           Marchas|       Carburadors|               HP|
+-------+-----------------+------------------+-----------------+------------------+------------------+-----------------+------------------+-------------------+------------------+------------------+-----------------+
|  count|               32|                32|               32|                32|                32|               32|                32|                 32|                32|                32|               32|
|   mean|         177.5625|            6.1875|        796.34375|         318.03125|        1257.78125|         969.8125|            0.43

In [50]:
# Preview the first three rows.
carros.show(3)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 3 rows



In [51]:
# For every column, add up its distinct values and show the totals in one row,
# keeping the original column names.
# NOTE(review): a sum over distinct values is rarely a useful statistic; if the
# goal was the number of distinct values per column, count_distinct would be
# the function to use -- confirm intent.
carros.select(
    *(f.sum_distinct(f.col(name)).alias(name) for name in carros.columns)
).show()

+-------+---------+-----------+---------------+-----+-----+---------+-----------+-------+-----------+----+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro| Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors|  HP|
+-------+---------+-----------+---------------+-----+-----+---------+-----------+-------+-----------+----+
|   4467|       18|      17771|           7011|39204|29143|        1|          1|     12|         24|3180|
+-------+---------+-----------+---------------+-----+-----+---------+-----------+-------+-----------+----+



In [52]:
# Assembler that packs the three predictor columns into a single vector column
# named 'caracteristicas'; the object is displayed as the cell's output.
encoder = VectorAssembler(
    inputCols=['Consumo', 'Cilindros', 'Cilindradas'],
    outputCol='caracteristicas',
)
encoder

VectorAssembler_c744669575d9

In [53]:
# Keep only the three predictor columns and the HP column used as the label.
carros = carros.select('Consumo', 'Cilindros', 'Cilindradas', 'HP')

In [54]:
# Append the assembled feature vector to the frame.
# The assembler's output column is dropped first (a no-op when it is absent) so
# the cell can be re-run: without this, a second execution passes a frame that
# already contains the output column back into the assembler, which rejects it.
carros = encoder.transform(carros.drop(encoder.getOutputCol()))

In [55]:
# Display the frame, now including the assembled 'caracteristicas' vector column.
carros.show()

+-------+---------+-----------+---+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|
+-------+---------+-----------+---+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|
|     21|        6|        160|110|  [21.0,6.0,160.0]|
|    228|        4|        108| 93| [228.0,4.0,108.0]|
|    214|        6|        258|110| [214.0,6.0,258.0]|
|    187|        8|        360|175| [187.0,8.0,360.0]|
|    181|        6|        225|105| [181.0,6.0,225.0]|
|    143|        8|        360|245| [143.0,8.0,360.0]|
|    244|        4|       1467| 62|[244.0,4.0,1467.0]|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|
|    178|        6|       1676|123|[178.0,6.0,1676.0]|
|    164|        8|       2758|180|[164.0,8.0,2758.0]|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|
|    152|        8|       2758|180|[152.0,8.0,2758.0]|
|    104|        8|        472|205| [104.0,8.0,472.0]|
|    104| 

In [56]:
# Split the rows into training and test frames with 0.75 / 0.25 weights;
# the fixed seed keeps the split repeatable between runs.
train, test = carros.randomSplit(weights=[0.75, 0.25], seed=42)

In [60]:
# Linear regression estimator: predicts 'HP' from the assembled
# 'caracteristicas' vector. The estimator is displayed as the cell's output.
reg = LinearRegression(featuresCol='caracteristicas', labelCol='HP')
reg

LinearRegression_b9ca3404a07e

In [65]:
# Fit the linear regression on the training split.
# Spark logs a warning here because regParam is left at zero (no regularization).
model = reg.fit(dataset=train)

24/05/10 18:28:20 WARN Instrumentation: [bd1fca83] regParam is zero, which might cause numerical instability and overfitting.


In [67]:
# Score the test split; the result carries an added 'prediction' column.
pred = model.transform(test)

In [69]:
# Show actual HP next to the model's prediction for the test rows.
pred.show()

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]| 176.6769761803343|
|    133|        8|        350|245| [133.0,8.0,350.0]| 216.0600490024798|
|    147|        8|        440|230| [147.0,8.0,440.0]| 212.5232856856811|
|    152|        8|        304|150| [152.0,8.0,304.0]|213.25582934964052|
|    164|        8|       2758|180|[164.0,8.0,2758.0]|182.01360055097877|
|    192|        8|        400|175| [192.0,8.0,400.0]| 205.0633823107468|
|    215|        4|       1201| 97|[215.0,4.0,1201.0]| 68.72757990618165|
|    304|        4|        951|113| [304.0,4.0,951.0]| 56.00181462650449|
|    324|        4|        787| 66| [324.0,4.0,787.0]|54.421801264423046|
+-------+---------+-----------+---+------------------+------------------+

