In [14]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression


In [2]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Pipeline")
    .getOrCreate()
)

In [4]:
# Read the CSV, taking column names from the header row and letting Spark
# infer the column types.
dados = spark.read.csv(
    "dados_regressao_cor.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first five rows of the loaded data.
dados.show(n=5)

+--------+------------------+------------------+------------------+
|     cor|                x1|                x2|                 y|
+--------+------------------+------------------+------------------+
|   verde| 3.745401188473625|3.7364081846669848| 12.30060447765728|
|    azul|  9.50714306409916|3.3291209623140325|11.757695314234958|
|    azul| 7.319939418114051|1.7615391250286005| 9.506989456356553|
|   verde| 5.986584841970366|  6.07266670101488|19.769172200123297|
|vermelho|1.5601864044243652| 4.766241605086289| 7.215801339758797|
+--------+------------------+------------------+------------------+
only showing top 5 rows



In [10]:
# Encode the string column `cor` as the numeric `label` column that the
# classifier is trained against.
indexador_label = StringIndexer(
    inputCol="cor",
    outputCol="label",
)


In [19]:
# Pack the three numeric columns into a single vector column, `features_raw`.
assembler = VectorAssembler(
    inputCols=["x1", "x2", "y"],
    outputCol="features_raw",
)

In [13]:
# Scale the assembled vector; the result, `features`, is the column the
# logistic regression reads.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
)

In [15]:
# Classifier stage: consumes the scaled `features` vector and the indexed `label`.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

In [17]:
from pyspark.ml import Pipeline

In [20]:
# Chain the stages in execution order:
# label indexing -> feature assembly -> scaling -> logistic regression.
pipeline = Pipeline(
    stages=[
        indexador_label,
        assembler,
        scaler,
        lr,
    ]
)

In [23]:
from pyspark.ml.tuning import ParamGridBuilder
# Hyper-parameter grid for the logistic regression stage: every combination
# of the listed regParam and elasticNetParam values is tried.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
    

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Scores predictions by accuracy, comparing `prediction` against `label`.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [28]:
from pyspark.ml.tuning import CrossValidator

# Cross-validator that fits the whole pipeline once per parameter combination
# and fold, scoring each candidate with `evaluator`.
# `seed` fixes the fold assignment so repeated runs are reproducible.
# (The original cell read `validador = validador = CrossValidator(...)`; the
# duplicated assignment target was a typo and has been removed.)
validador = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [29]:
# Run the cross-validated search and keep the fitted model.
# NOTE(review): the fit uses the full `dados` DataFrame; no separate hold-out
# split is visible in this notebook.
modelo_cv = validador.fit(dataset=dados)

In [30]:
# Apply the fitted cross-validated model to the same DataFrame to obtain predictions.
resultado = modelo_cv.transform(dataset=dados)

In [31]:
# Show the input columns next to the predicted class index for the first five rows.
resultado.select(
    "cor",
    "x1",
    "x2",
    "y",
    "prediction",
).show(n=5)

+--------+------------------+------------------+------------------+----------+
|     cor|                x1|                x2|                 y|prediction|
+--------+------------------+------------------+------------------+----------+
|   verde| 3.745401188473625|3.7364081846669848| 12.30060447765728|       0.0|
|    azul|  9.50714306409916|3.3291209623140325|11.757695314234958|       2.0|
|    azul| 7.319939418114051|1.7615391250286005| 9.506989456356553|       2.0|
|   verde| 5.986584841970366|  6.07266670101488|19.769172200123297|       0.0|
|vermelho|1.5601864044243652| 4.766241605086289| 7.215801339758797|       0.0|
+--------+------------------+------------------+------------------+----------+
only showing top 5 rows



In [32]:
# Accuracy of the predictions in `resultado`.
# NOTE(review): `resultado` comes from the same `dados` the model was fitted
# on, so this is an in-sample score rather than a held-out estimate.
acuracia = evaluator.evaluate(dataset=resultado)

In [33]:
# Print the accuracy computed by `evaluator` on `resultado`.
print(acuracia)

0.8873


In [34]:
# Persist the fitted cross-validator model, replacing any previous save at this path.
(
    modelo_cv.write()
    .overwrite()
    .save("modelos/pipeline_classificacao")
)

In [35]:
from pyspark.ml.tuning import CrossValidatorModel

In [36]:
# Reload the persisted cross-validator model from disk.
modelo_carregado = CrossValidatorModel.load(
    path="modelos/pipeline_classificacao",
)

In [38]:
from pyspark.sql import Row
# A single hand-written row with the same columns as the training CSV,
# used to try out the reloaded model.
novo_dado = spark.createDataFrame(
    [
        Row(
            cor="azul",
            x1=4.32,
            x2=5.7,
            y=10.3,
        ),
    ]
)

In [39]:
# Score the new row with the model that was reloaded from disk.
previsao = modelo_carregado.transform(dataset=novo_dado)

In [40]:
# Display the scored row(s), including the intermediate pipeline columns.
previsao.show(n=10)

+----+----+---+----+-----+---------------+--------------------+--------------------+--------------------+----------+
| cor|  x1| x2|   y|label|   features_raw|            features|       rawPrediction|         probability|prediction|
+----+----+---+----+-----+---------------+--------------------+--------------------+--------------------+----------+
|azul|4.32|5.7|10.3|  2.0|[4.32,5.7,10.3]|[1.50192890159389...|[-2.1126177117169...|[0.07876847205812...|       1.0|
+----+----+---+----+-----+---------------+--------------------+--------------------+--------------------+----------+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 35334)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =