Importando Spark e CSV

In [8]:
import findspark
import shutil
import glob
import os
findspark.init()
from pyspark.sql import functions as F

from pyspark.sql import SparkSession
from pyspark.sql.functions import concat
from pyspark.sql.functions import col

# Location of the raw CSV export, relative to the notebook's working directory.
caminho_csv = "dados/dados_gol.csv"

# Create (or reuse) a Spark session running in local mode (`local[*]`).
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TestePySpark")
    .getOrCreate()
)

Lendo o CSV e criando um DataFrame correto

In [9]:
# Read the raw export as text. The first physical line is treated as a non-CSV
# preamble (presumably an update-date line, judging by the output file name —
# TODO confirm) and the second line as the real header row.
with open(caminho_csv, "r", encoding="utf-8") as arquivo_bruto:
    lines = arquivo_bruto.readlines()

if len(lines) < 2:
    raise ValueError(f"{caminho_csv} has no header row on its second line")

# Column names come from the header row, with surrounding double quotes removed.
# The loop variable is deliberately not called `col`, to avoid confusion with
# the `col` function imported from pyspark.sql.functions.
colunas = [nome.strip('"') for nome in lines[1].strip().split(";")]

# Write a copy without the preamble line but WITH the header row. The reader
# below uses header=true, which consumes the first line of the file as the
# header; writing only the data rows would silently drop the first record.
novo_caminho_csv = "dados/dados_sem_data.csv"
with open(novo_caminho_csv, "w", encoding="utf-8") as arquivo_limpo:
    arquivo_limpo.writelines(lines[1:])

df = (
    spark.read
    .option("header", "true")
    .option("delimiter", ";")
    .csv(novo_caminho_csv)
)

# Apply the cleaned column names parsed above.
df = df.toDF(*colunas)

Aplicando o Filtro

In [10]:
# Row filter: all three column/value conditions must hold.
condicao_filtro = (
    (F.col("EMPRESA_SIGLA") == "GLO")
    & (F.col("GRUPO_DE_VOO") == "REGULAR")
    & (F.col("NATUREZA") == "DOMÉSTICA")
)
df_filtrado = df.where(condicao_filtro)

# Display the filtered rows without truncating cell contents.
df_filtrado.show(truncate=False)

+-------------+---------------------------------------------------+---------------------+----+---+-------------------------+------------------------+----------------------+--------------------------+------------------------+------------------------------+--------------------------+-------------------------+-----------------------+---------------------------+-------------------------+-------------------------------+---------+------------+-----------------+------------------+-------------+---------------+----------+-------+-------+------+------+------------------+------------------+----------+-------------+---------------+----------+--------+-------+------------+----------+
|EMPRESA_SIGLA|EMPRESA_NOME                                       |EMPRESA_NACIONALIDADE|ANO |MES|AEROPORTO_DE_ORIGEM_SIGLA|AEROPORTO_DE_ORIGEM_NOME|AEROPORTO_DE_ORIGEM_UF|AEROPORTO_DE_ORIGEM_REGIAO|AEROPORTO_DE_ORIGEM_PAIS|AEROPORTO_DE_ORIGEM_CONTINENTE|AEROPORTO_DE_DESTINO_SIGLA|AEROPORTO_DE_DESTINO_NOME|AEROPORTO_D

Criando um DataFrame com os campos que preciso

In [11]:
# MERCADO is the origin airport code immediately followed by the destination code.
mercado = F.concat(
    F.col("AEROPORTO_DE_ORIGEM_SIGLA"),
    F.col("AEROPORTO_DE_DESTINO_SIGLA"),
).alias("MERCADO")

df_prep = df_filtrado.select("MES", "ANO", "RPK", mercado)

# Two-character month: 1 becomes "01", 10/11/12 are kept as they are, and any
# other value is cast to string and left-padded with "0" to width 2.
mes = F.col("MES")
mes_dois_digitos = (
    F.when(mes == 1, "01")
    .when(mes.isin([10, 11, 12]), mes)
    .otherwise(F.lpad(mes.cast("string"), 2, "0"))
)

# Drop rows containing nulls in the selected columns, then rewrite MES.
df_final = df_prep.na.drop().withColumn("MES", mes_dois_digitos)

df_final.show(5, truncate=False)

+---+----+-------+--------+
|MES|ANO |RPK    |MERCADO |
+---+----+-------+--------+
|01 |2001|887040 |SBBHSBSP|
|01 |2001|743280 |SBBHSBSV|
|01 |2001|1172660|SBBRSBGL|
|01 |2001|2845110|SBBRSBSP|
|01 |2001|1144680|SBBRSBSV|
+---+----+-------+--------+
only showing top 5 rows



Conectando ao PostgreSQL

In [None]:
from pyspark.sql import SparkSession

# Location of the PostgreSQL JDBC driver jar. The default is a machine-specific
# absolute path; set POSTGRES_JDBC_JAR to point at the jar on other machines.
jdbc_driver_path = os.environ.get(
    "POSTGRES_JDBC_JAR",
    "C:/Users/User/AppData/Local/Programs/Python/Python311/Lib/site-packages/pyspark/jars/postgresql-42.7.5.jar",
)

# NOTE(review): getOrCreate() returns the already-running session when one
# exists (the first cell creates one), so the app name and the jar/class-path
# options set here may not take effect in that case. If the driver is not
# found, these options need to be set where the session is first created.
spark = (
    SparkSession.builder
    .appName("PostgreSQL Connection")
    .config("spark.jars", jdbc_driver_path)
    .config("spark.driver.extraClassPath", jdbc_driver_path)
    .getOrCreate()
)

# JDBC URL of the target database; override with ANAC_DATABASE_URL.
database_url = os.environ.get(
    "ANAC_DATABASE_URL",
    "jdbc:postgresql://localhost:5432/anac",
)


Escrevendo no PostgreSQL

In [13]:
# Cast the columns to the numeric types that are written to the database.
df_DB = (
    df_final
    .withColumn("mes", col("mes").cast("int"))
    .withColumn("ano", col("ano").cast("int"))
    .withColumn("rpk", col("rpk").cast("double"))
)

# Credentials are read from the environment (PGUSER / PGPASSWORD) so they do
# not have to live in the notebook; the fallbacks are the local development
# values this cell used before.
# NOTE: mode("append") adds rows to the existing "voos" table, so re-running
# this cell inserts the same rows again.
(
    df_DB.write
    .format("jdbc")
    .option("url", database_url)
    .option("dbtable", "voos")
    .option("user", os.environ.get("PGUSER", "root"))
    .option("password", os.environ.get("PGPASSWORD", "root"))
    .option("driver", "org.postgresql.Driver")
    .mode("append")
    .save()
)

Py4JJavaError: An error occurred while calling o174.save.
: org.postgresql.util.PSQLException: A tentativa de conexão falhou.
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:354)
	at org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:54)
	at org.postgresql.jdbc.PgConnection.<init>(PgConnection.java:253)
	at org.postgresql.Driver.makeConnection(Driver.java:434)
	at org.postgresql.Driver.connect(Driver.java:291)
	at org.apache.spark.sql.execution.datasources.jdbc.connection.BasicConnectionProvider.getConnection(BasicConnectionProvider.scala:49)
	at org.apache.spark.sql.execution.datasources.jdbc.connection.ConnectionProviderBase.create(ConnectionProvider.scala:102)
	at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1(JdbcDialects.scala:161)
	at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1$adapted(JdbcDialects.scala:157)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:50)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:251)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.net.UnknownHostException: postgresql
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:184)
	at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:172)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
	at java.net.Socket.connect(Socket.java:607)
	at org.postgresql.core.PGStream.createSocket(PGStream.java:243)
	at org.postgresql.core.PGStream.<init>(PGStream.java:98)
	at org.postgresql.core.v3.ConnectionFactoryImpl.tryConnect(ConnectionFactoryImpl.java:132)
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:258)
	... 50 more


Lendo do banco PostgreSQL

In [None]:
# JDBC connection properties. Credentials are read from the environment
# (PGUSER / PGPASSWORD) so they do not have to live in the notebook; the
# fallbacks are the local development values this cell used before.
properties = {
    "user": os.environ.get("PGUSER", "root"),
    "password": os.environ.get("PGPASSWORD", "root"),
    "driver": "org.postgresql.Driver",
}

# Read the "voos" table back into a DataFrame.
df_DB = spark.read.jdbc(
    url=database_url,
    table="voos",
    properties=properties,
)

df_DB.show()

+---+----+---+--------+---------+
| id| ano|mes| mercado|      rpk|
+---+----+---+--------+---------+
|  1|2012|  6|SBILSBSV| 549900.0|
|  2|2012|  6|SBIZSBBR|3975550.0|
|  3|2012|  6|SBIZSBSL|1452580.0|
|  4|2012|  6|SBJPSBAR|  40824.0|
|  5|2012|  6|SBJPSBBR|6005780.0|
|  6|2012|  6|SBJPSBGL|1.35814E7|
|  7|2012|  6|SBJPSBGR|      0.0|
|  8|2012|  6|SBJPSBSV|2562850.0|
|  9|2012|  6|SBJUSBFZ| 868411.0|
| 10|2012|  6|SBJUSBRF|1253910.0|
| 11|2012|  6|SBJVSBKP|      0.0|
| 12|2012|  6|SBJVSBSP|1388040.0|
| 13|2012|  6|SBKGSBSV|3405680.0|
| 14|2012|  6|SBKPSBBR|6066400.0|
| 15|2012|  6|SBKPSBCF|4226530.0|
| 16|2012|  6|SBKPSBCT|1379600.0|
| 17|2012|  6|SBKPSBGL|5765030.0|
| 18|2012|  6|SBKPSBGR|  38678.0|
| 19|2012|  6|SBKPSBPA| 328624.0|
| 20|2012|  6|SBKPSBSP|  21924.0|
+---+----+---+--------+---------+
only showing top 20 rows

