In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Create the Spark session.
# - hadoop-aws supplies the s3a:// filesystem used for the read below.
# - The PostgreSQL JDBC driver is pulled in for the database write at the end of the notebook.
# S3 credentials are taken from the standard AWS environment variables so that no
# secret has to be typed into (or committed with) the notebook. When a variable is
# unset the previous empty-string value is passed through unchanged.
spark = SparkSession \
    .builder \
    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.2.0,org.postgresql:postgresql:42.2.20') \
    .config('spark.hadoop.fs.s3a.access.key', os.environ.get('AWS_ACCESS_KEY_ID', '')) \
    .config('spark.hadoop.fs.s3a.secret.key', os.environ.get('AWS_SECRET_ACCESS_KEY', '')) \
    .appName('spark etl - pensionistas') \
    .getOrCreate()

# Read the input file from S3; the first line is a header and column types are inferred from the data.
df = spark.read.csv('s3a://custos-stn-bucket/pensionista/pensionista_0.csv', header=True, inferSchema=True)

In [3]:
# Print the schema Spark inferred for the CSV
df.printSchema()

root
 |-- co_natureza_juridica: integer (nullable = true)
 |-- ds_natureza_juridica: string (nullable = true)
 |-- co_organizacao_n1: integer (nullable = true)
 |-- ds_organizacao_n1: string (nullable = true)
 |-- co_organizacao_n2: integer (nullable = true)
 |-- ds_organizacao_n2: string (nullable = true)
 |-- co_organizacao_n3: integer (nullable = true)
 |-- ds_organizacao_n3: string (nullable = true)
 |-- an_lanc: integer (nullable = true)
 |-- me_lanc: integer (nullable = true)
 |-- va_custo_pensionistas: double (nullable = true)



In [4]:
# Row count
df.count()

250

In [5]:
# Print the first 5 rows
df.show(5)

+--------------------+--------------------+-----------------+-------------------+-----------------+--------------------+-----------------+--------------------+-------+-------+---------------------+
|co_natureza_juridica|ds_natureza_juridica|co_organizacao_n1|  ds_organizacao_n1|co_organizacao_n2|   ds_organizacao_n2|co_organizacao_n3|   ds_organizacao_n3|an_lanc|me_lanc|va_custo_pensionistas|
+--------------------+--------------------+-----------------+-------------------+-----------------+--------------------+-----------------+--------------------+-------+-------+---------------------+
|                   3|ADMINISTRACAO DIRETA|              304|MINISTERIO DA SAUDE|             3273|SECRETARIA-EXECUTIVA|            39109|SUPERINTENDENCIA ...|   2019|      7|        1.476405474E7|
|                   3|ADMINISTRACAO DIRETA|              304|MINISTERIO DA SAUDE|             3273|SECRETARIA-EXECUTIVA|            39109|SUPERINTENDENCIA ...|   2019|      1|        1.464746072E7|
|         

In [6]:
# Register the frame as a temporary view so it can be queried with Spark SQL
df.createOrReplaceTempView("pensionistas")

In [7]:

# Read the whole temp view back through Spark SQL and print the resulting schema.
select_all_sql = """
                  select *
                  from pensionistas
                  """
df_ps = spark.sql(select_all_sql)

df_ps.printSchema()


root
 |-- co_natureza_juridica: integer (nullable = true)
 |-- ds_natureza_juridica: string (nullable = true)
 |-- co_organizacao_n1: integer (nullable = true)
 |-- ds_organizacao_n1: string (nullable = true)
 |-- co_organizacao_n2: integer (nullable = true)
 |-- ds_organizacao_n2: string (nullable = true)
 |-- co_organizacao_n3: integer (nullable = true)
 |-- ds_organizacao_n3: string (nullable = true)
 |-- an_lanc: integer (nullable = true)
 |-- me_lanc: integer (nullable = true)
 |-- va_custo_pensionistas: double (nullable = true)



In [8]:
# Preview the query result (long string values are truncated in the printed table)
df_ps.show()

+--------------------+--------------------+-----------------+--------------------+-----------------+--------------------+-----------------+--------------------+-------+-------+---------------------+
|co_natureza_juridica|ds_natureza_juridica|co_organizacao_n1|   ds_organizacao_n1|co_organizacao_n2|   ds_organizacao_n2|co_organizacao_n3|   ds_organizacao_n3|an_lanc|me_lanc|va_custo_pensionistas|
+--------------------+--------------------+-----------------+--------------------+-----------------+--------------------+-----------------+--------------------+-------+-------+---------------------+
|                   3|ADMINISTRACAO DIRETA|              304| MINISTERIO DA SAUDE|             3273|SECRETARIA-EXECUTIVA|            39109|SUPERINTENDENCIA ...|   2019|      7|        1.476405474E7|
|                   3|ADMINISTRACAO DIRETA|              304| MINISTERIO DA SAUDE|             3273|SECRETARIA-EXECUTIVA|            39109|SUPERINTENDENCIA ...|   2019|      1|        1.464746072E7|
|    

In [9]:
df_ps.head(1) # Returns the first row as a one-element list of Row objects (field names and values)

[Row(co_natureza_juridica=3, ds_natureza_juridica='ADMINISTRACAO DIRETA', co_organizacao_n1=304, ds_organizacao_n1='MINISTERIO DA SAUDE', co_organizacao_n2=3273, ds_organizacao_n2='SECRETARIA-EXECUTIVA', co_organizacao_n3=39109, ds_organizacao_n3='SUPERINTENDENCIA ESTADUAL DO MINISTERIO DA SAUDE EM SAO PAULO', an_lanc=2019, me_lanc=7, va_custo_pensionistas=14764054.74)]

In [10]:
# Summary statistics (count, mean, stddev, min, max) for the first five columns.
df_ps.describe(
    "co_natureza_juridica",
    "ds_natureza_juridica",
    "co_organizacao_n1",
    "ds_organizacao_n1",
    "co_organizacao_n2",
).show()

+-------+--------------------+--------------------+-----------------+--------------------+------------------+
|summary|co_natureza_juridica|ds_natureza_juridica|co_organizacao_n1|   ds_organizacao_n1| co_organizacao_n2|
+-------+--------------------+--------------------+-----------------+--------------------+------------------+
|  count|                 250|                 250|              250|                 250|               250|
|   mean|               2.776|                null|        18416.016|                null|          5162.596|
| stddev|  0.8583537058447119|                null|54633.80115116138|                null|19974.832500284996|
|    min|                   2|ADMINISTRACAO DIRETA|               14|MINISTERIO DA AGR...|                 3|
|    max|                   4|    FUNDACAO PUBLICA|           235876|MINISTERIO DO MEI...|            239899|
+-------+--------------------+--------------------+-----------------+--------------------+------------------+



In [11]:
# Summary statistics (count, mean, stddev, min, max) for the next five columns.
df_ps.describe(
    "ds_organizacao_n2",
    "co_organizacao_n3",
    "ds_organizacao_n3",
    "an_lanc",
    "me_lanc",
).show()

+-------+--------------------+------------------+--------------------+------------------+-----------------+
|summary|   ds_organizacao_n2| co_organizacao_n3|   ds_organizacao_n3|           an_lanc|          me_lanc|
+-------+--------------------+------------------+--------------------+------------------+-----------------+
|  count|                 250|               250|                 250|               250|              250|
|   mean|                null|         15657.344|                null|          2018.332|            6.628|
| stddev|                null|21799.122217361266|                null|0.6809041945004913|3.415299853946864|
|    min|    COLEGIO PEDRO II|                -9|CENTRO DE CIENCIA...|              2015|                1|
|    max|UNIVERSIDADE FEDE...|             69292|UNIDADE ESTADUAL ...|              2019|               12|
+-------+--------------------+------------------+--------------------+------------------+-----------------+



In [12]:
# Summary statistics (count, mean, stddev, min, max) for the cost column.
df_ps.describe("va_custo_pensionistas").show()

+-------+---------------------+
|summary|va_custo_pensionistas|
+-------+---------------------+
|  count|                  250|
|   mean|    7742443.837840007|
| stddev| 1.2636753672918526E7|
|    min|             45891.47|
|    max|        4.136315255E7|
+-------+---------------------+



In [28]:
# Distinct values of ds_organizacao_n1, i.e. the level-1 organisations present in this data.
# Useful for getting to know the discrete classes of a categorical column.
# The rows are collected straight from the DataFrame and unpacked on the driver;
# converting to an RDD and mapping a Python lambda over it is not needed for this.
[row[0] for row in df_ps.select('ds_organizacao_n1').distinct().collect()]

['MINISTERIO DA INFRAESTRUTURA',
 'MINISTERIO DA SAUDE',
 'MINISTERIO DA DEFESA',
 'MINISTERIO DO MEIO AMBIENTE',
 'MINISTERIO DA EDUCACAO',
 'MINISTERIO DA AGRICULTURA, PECUARIA E ABASTECIMENTO',
 'MINISTERIO DA ECONOMIA']

In [29]:
# Distinct values of ds_organizacao_n3, i.e. the level-3 organisation names present in this data.
# Useful for getting to know the discrete classes of a categorical column.
# The rows are collected straight from the DataFrame and unpacked on the driver;
# converting to an RDD and mapping a Python lambda over it is not needed for this.
[row[0] for row in df_ps.select('ds_organizacao_n3').distinct().collect()]

['SUPERINTENDENCIA ESTADUAL DO MINISTERIO DA SAUDE EM SAO PAULO',
 'SUPERINTENDENCIA ESTADUAL DE SAO PAULO',
 'CENTRO DE CIENCIAS DA SAUDE',
 'DIRETORIA DE GESTAO ADMINISTRATIVA',
 'SUPERINTENDENCIA REGIONAL DO DNIT NO ESTADO DO PARANA',
 'REITORIA',
 'NAO SE APLICA',
 'SUPERINTENDENCIA REGIONAL DO DNIT NO ESTADO DO RIO GRANDE DO SUL',
 'SUPERINTENDENCIA ESTADUAL DO MINISTERIO DA SAUDE NA BAHIA',
 'SECRETARIA DE ECONOMIA E FINANCAS',
 'SUBSECRETARIA DE PLANEJAMENTO, ORCAMENTO E ADMINISTRACAO',
 'UNIDADE ESTADUAL DO IBGE EM GOIAS',
 'INSTITUTO NACIONAL DE CARDIOLOGIA',
 'HOSPITAL UNIVERSITARIO JOAO DE BARROS BARRETO',
 'SECRETARIA-GERAL DA MARINHA',
 'UNIDADE ESTADUAL DO IBGE EM SAO PAULO']

In [30]:
# Check for fully duplicated rows: de-duplicate the view, then count what is left.
select_distinct_sql = """
                  select distinct *
                  from pensionistas
                  """
df_ps = spark.sql(select_distinct_sql)
df_ps.count()

# The result is 250, the same as the raw row count, so no two rows are identical.

250

In [31]:
# First row of the de-duplicated frame.
# NOTE(review): this is a different row from the earlier head(1) output — presumably
# `select distinct` does not keep the input order; do not rely on row position here.
df_ps.head(1)

[Row(co_natureza_juridica=2, ds_natureza_juridica='FUNDACAO PUBLICA', co_organizacao_n1=244, ds_organizacao_n1='MINISTERIO DA EDUCACAO', co_organizacao_n2=468, ds_organizacao_n2='FUNDACAO UNIVERSIDADE FEDERAL DO PIAUI', co_organizacao_n3=-9, ds_organizacao_n3='NAO SE APLICA', an_lanc=2018, me_lanc=2, va_custo_pensionistas=2080673.4)]

In [47]:
# Candidate-key check: look for combinations of
#   1- co_organizacao_n3
#   2- me_lanc
#   3- va_custo_pensionistas
# that appear on more than one row.
#
# The columns are grouped directly instead of being glued into one '-'-separated
# string: a concatenated key becomes NULL as soon as any part is NULL (lumping
# unrelated rows into one group), and the '-' separator is ambiguous because
# co_organizacao_n3 itself takes negative values (e.g. -9).
#
# (An earlier draft of this cell ran the same check over all eleven columns.)
#
# NOTE(review): the key includes the measured value (va_custo_pensionistas) and
# leaves out an_lanc — confirm this is the intended business key and not just a
# combination that happens to be unique in this 250-row sample.
df_ps_unique = spark.sql("""
                  select co_organizacao_n3, me_lanc, va_custo_pensionistas,
                         count(1) as n_rows
                  from pensionistas
                  group by co_organizacao_n3, me_lanc, va_custo_pensionistas
                  having count(1) > 1
                  """)

# Zero rows back means no combination repeats, i.e. the three columns identify a row.
df_ps_unique.count()

0

In [48]:
# Number of rows for each legal-nature code/description pair, smallest group first.
(
    df_ps
    .groupBy("co_natureza_juridica", "ds_natureza_juridica")
    .count()
    .orderBy("count")
    .show()
)

+--------------------+--------------------+-----+
|co_natureza_juridica|ds_natureza_juridica|count|
+--------------------+--------------------+-----+
|                   3|ADMINISTRACAO DIRETA|   54|
|                   4|           AUTARQUIA|   70|
|                   2|    FUNDACAO PUBLICA|  126|
+--------------------+--------------------+-----+



In [13]:
# Write the raw frame to the PostgreSQL table custos_stn.pensionistas over JDBC.
#
# Connection details are read from the environment so that neither the database
# host nor the password is stored in the notebook:
#   POSTGRES_HOST      required (a KeyError names the variable if it is missing)
#   POSTGRES_PORT      optional, defaults to 5432
#   POSTGRES_USER      optional, defaults to postgres
#   POSTGRES_PASSWORD  required
pg_host = os.environ["POSTGRES_HOST"]
pg_port = os.environ.get("POSTGRES_PORT", "5432")
pg_user = os.environ.get("POSTGRES_USER", "postgres")
pg_password = os.environ["POSTGRES_PASSWORD"]

jdbc_url = "jdbc:postgresql://{}:{}/custos_stn".format(pg_host, pg_port)

# NOTE(review): no save mode is set, so Spark's default applies — as documented for
# DataFrameWriter, the write is expected to fail if the table already exists.
# Choose .mode("append") or .mode("overwrite") explicitly if this cell must be re-runnable.
df.write \
  .format("jdbc") \
  .option("url", jdbc_url) \
  .option("dbtable", "custos_stn.pensionistas") \
  .option("user", pg_user) \
  .option("driver", "org.postgresql.Driver") \
  .option("password", pg_password)\
  .save()

Py4JJavaError: An error occurred while calling o67.save.
: org.postgresql.util.PSQLException: The connection attempt failed.
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:315)
	at org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:51)
	at org.postgresql.jdbc.PgConnection.<init>(PgConnection.java:223)
	at org.postgresql.Driver.makeConnection(Driver.java:465)
	at org.postgresql.Driver.connect(Driver.java:264)
	at org.apache.spark.sql.execution.datasources.jdbc.connection.BasicConnectionProvider.getConnection(BasicConnectionProvider.scala:49)
	at org.apache.spark.sql.execution.datasources.jdbc.connection.ConnectionProvider$.create(ConnectionProvider.scala:63)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$createConnectionFactory$1(JdbcUtils.scala:62)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:48)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:301)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: connect timed out
	at java.net.PlainSocketImpl.socketConnect(Native Method)
	at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:476)
	at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:218)
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:200)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:394)
	at java.net.Socket.connect(Socket.java:606)
	at org.postgresql.core.PGStream.createSocket(PGStream.java:231)
	at org.postgresql.core.PGStream.<init>(PGStream.java:95)
	at org.postgresql.core.v3.ConnectionFactoryImpl.tryConnect(ConnectionFactoryImpl.java:98)
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:213)
	... 40 more
