In [None]:
%idle_timeout 30
%glue_version 3.0
%worker_type G.1X
%number_of_workers 2

import boto3
import sys
import re
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import Row
  
# Reuse the active SparkContext if one exists, otherwise start one.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext to get the Glue-specific readers and sinks.
glueContext = GlueContext(sc)
# Plain SparkSession used for the DataFrame / RDD work below.
spark = glueContext.spark_session
# Glue Job handle; it is only constructed here (no init/commit in the cells shown).
job = Job(glueContext)

# Fixed-width input text file (stage layer) read by the following cells.
s3_path = "s3://micro-dados-sp-outros/stage/2000/Pes35.txt"

### Leitura do arquivo do S3

In [None]:
# Load the file as text (one record per line, column "value") and expose it as an RDD of Rows.
arquivo = spark.read.format("text").load(s3_path).rdd

In [None]:
def remover_espacos(linha):
    """Return a Row whose ``linha`` field is the record text with all whitespace removed.

    Args:
        linha: A row-like object exposing the raw text under the ``"value"`` key.

    Returns:
        Row: ``Row(linha=<text without any whitespace>)``.

    NOTE(review): the text is later sliced by fixed character positions. If the
    source file pads fields with blanks, removing interior whitespace shifts every
    following field - confirm that this is intended.
    """
    texto_sem_espacos = re.sub(r'\s+', '', linha["value"].strip())
    return Row(linha=texto_sem_espacos)

In [None]:
# Apply remover_espacos to every record, producing an RDD of Row(linha=...).
linhas_ajustadas = arquivo.map(remover_espacos)

In [None]:
# One nullable string column, "linha", holding the cleaned record text.
schema = StructType().add("linha", StringType(), True)

# Turn the RDD of Rows back into a DataFrame with that explicit schema.
df_linhas_ajustadas = spark.createDataFrame(linhas_ajustadas, schema=schema)

In [None]:
# Fixed-width record layout, following the data dictionary published by IBGE.
# Each entry is (column name, 1-based start position, field width in characters).
# NOTE(review): 'V4572' is immediately followed by 'M4752', and 'M4523' follows
# 'V0453' - the digits look transposed; confirm these names against the IBGE layout.
columns = [
    ('V0102', 1, 2),
    ('V1002', 3, 4),
    ('V1003', 7, 5),
    ('V1103', 12, 7),
    ('V0104', 19, 9),
    ('V0105', 28, 11),
    ('V0300', 39, 8),
    ('V0400', 47, 2),
    ('V1004', 49, 2),
    ('AREAP', 51, 13),
    ('V1001', 64, 1),
    ('V1005', 65, 1),
    ('V1006', 66, 1),
    ('V1007', 67, 1),
    ('MARCA', 68, 1),
    ('V0401', 69, 1),
    ('M0401', 70, 1),
    ('V0402', 71, 2),
    ('M0402', 73, 1),
    ('V0403', 74, 2),
    ('M0403', 76, 1),
    ('V0404', 77, 1),
    ('M0404', 78, 1),
    ('V4572', 79, 3),
    ('M4752', 82, 1),
    ('V4754', 83, 2),
    ('M4754', 85, 1),
    ('V4070', 86, 1),
    ('V0408', 87, 1),
    ('M0408', 88, 1),
    ('V4090', 89, 3),
    ('M4090', 92, 1),
    ('V0410', 93, 1),
    ('M0410', 94, 1),
    ('V0411', 95, 1),
    ('M0411', 96, 1),
    ('V0412', 97, 1),
    ('M0412', 98, 1),
    ('V0413', 99, 1),
    ('M0413', 100, 1),
    ('V0414', 101, 1),
    ('M0414', 102, 1),
    ('V0415', 103, 1),
    ('M0415', 104, 1),
    ('V0416', 105, 2),
    ('M0416', 107, 1),
    ('V0417', 108, 1),
    ('M0417', 109, 1),
    ('V0418', 110, 1),
    ('M0418', 111, 1),
    ('V0419', 112, 1),
    ('M0419', 113, 1),
    ('V0420', 114, 4),
    ('M0420', 118, 1),
    ('V4210', 119, 2),
    ('M4210', 121, 1),
    ('V0422', 122, 2),
    ('M0422', 124, 1),
    ('V4230', 125, 2),
    ('M4230', 127, 1),
    ('V0424', 128, 1),
    ('M0424', 129, 1),
    ('V4250', 130, 7),
    ('M4250', 137, 1),
    ('V4260', 138, 2),
    ('M4260', 140, 1),
    ('V4276', 141, 7),
    ('M4276', 148, 1),
    ('V0428', 149, 1),
    ('M0428', 150, 1),
    ('V0429', 151, 1),
    ('M0429', 152, 1),
    ('V0430', 153, 2),
    ('M0430', 155, 1),
    ('V0431', 156, 1),
    ('M0431', 157, 1),
    ('V0432', 158, 1),
    ('M0432', 159, 1),
    ('V0433', 160, 2),
    ('M0433', 162, 1),
    ('V0434', 163, 1),
    ('M0434', 164, 1),
    ('V4355', 165, 2),
    ('M0435', 167, 1),
    ('V4300', 168, 2),
    ('V0436', 170, 1),
    ('M0436', 171, 1),
    ('V0437', 172, 1),
    ('M0437', 173, 1),
    ('V0438', 174, 1),
    ('M0438', 175, 1),
    ('V0439', 176, 1),
    ('M0439', 177, 1),
    ('V0440', 178, 1),
    ('M0440', 179, 1),
    ('V0441', 180, 1),
    ('M0441', 181, 1),
    ('V0442', 182, 1),
    ('M0442', 183, 1),
    ('V0443', 184, 1),
    ('M0443', 185, 1),
    ('V0444', 186, 1),
    ('M0444', 187, 1),
    ('V4452', 188, 4),
    ('M4452', 192, 1),
    ('V4462', 193, 5),
    ('M4462', 198, 1),
    ('V0447', 199, 1),
    ('M0447', 200, 1),
    ('V0448', 201, 1),
    ('M0448', 202, 1),
    ('V0449', 203, 1),
    ('M0449', 204, 1),
    ('V0450', 205, 1),
    ('M0450', 206, 1),
    ('V4511', 207, 1),
    ('M4511', 208, 1),
    ('V4512', 209, 6),
    ('M4512', 215, 1),
    ('V4513', 216, 6),
    ('V4514', 222, 6),
    ('V4521', 228, 1),
    ('M4521', 229, 1),
    ('V4522', 230, 6),
    ('M4522', 236, 1),
    ('V4523', 237, 6),
    ('V4524', 243, 6),
    ('V4525', 249, 6),
    ('V4526', 255, 6),
    ('V0453', 261, 2),
    ('M4523', 263, 1),
    ('V0454', 264, 2),
    ('M0454', 266, 1),
    ('V4534', 267, 3),
    ('V0455', 270, 1),
    ('M0455', 271, 1),
    ('V0456', 272, 1),
    ('M0456', 273, 1),
    ('V4573', 274, 6),
    ('M4573', 280, 1),
    ('V4583', 281, 6),
    ('M4583', 287, 1),
    ('V4593', 288, 6),
    ('M4593', 294, 1),
    ('V4603', 295, 6),
    ('M4603', 301, 1),
    ('V4613', 302, 6),
    ('M4613', 308, 1),
    ('V4614', 309, 6),
    ('V4615', 315, 6),
    ('V4620', 321, 2),
    ('M4620', 323, 1),
    ('V0463', 324, 2),
    ('V4654', 326, 2),
    ('M4654', 328, 1),
    ('V4670', 329, 2),
    ('M4670', 331, 1),
    ('V4690', 332, 2),
    ('M0463', 334, 1),
    ('P001', 335, 11),
    ('ESTR', 346, 2),
    ('ESTRP', 348, 2),
    ('V4621', 350, 2),
    ('M4621', 352, 1),
    ('V4622', 353, 2),
    ('M4622', 355, 1),
    ('V4631', 356, 2),
    ('M4631', 358, 1),
    ('V4632', 359, 2),
    ('M4632', 361, 1),
    ('V0464', 362, 1),
    ('M0464', 363, 1),
    ('V4671', 364, 2),
    ('M4671', 366, 1),
    ('V4672', 367, 2),
    ('M4672', 369, 1),
    ('V4354', 370, 3),
    ('V4219', 373, 3),
    ('V4239', 376, 3),
    ('V4269', 379, 3),
    ('V4279', 382, 3),
    ('V4451', 385, 3),
    ('V4461', 388, 3),
]

In [None]:
def extract_variable(data, start, end):
    """Slice one fixed-width field out of a record string.

    Args:
        data: The full record text, or None.
        start: 1-based position of the field's first character.
        end: Number of characters in the field. Despite the name this is a
            length, not an end position.

    Returns:
        The substring of ``data`` covering the field (shorter or empty if the
        record ends early), or None when ``data`` is None.
    """
    # Spark hands NULL values to Python UDFs as None; slicing None would raise.
    if data is None:
        return None
    first = start - 1  # convert the 1-based layout position to a 0-based index
    return data[first:first + end]

def create_udf(start, end):
    """Build a string-returning Spark UDF that extracts one fixed-width field.

    Args:
        start: 1-based position of the field's first character.
        end: Number of characters in the field (forwarded to extract_variable).

    Returns:
        A UDF taking the record column and returning the field as a string.
    """
    def _slice_field(data):
        # start and end are captured from the enclosing call.
        return extract_variable(data, start, end)

    return udf(_slice_field, StringType())

In [None]:
# Slice every fixed-width field in a single projection using Spark's built-in
# substring (1-based position, length). Chaining one withColumn per field adds a
# projection each time and, with this many fields, produces a very deep plan;
# a Python UDF per field also forces row-by-row serialization to Python.
# "linha" is kept so later cells can still reference or drop it.
df_linhas_ajustadas = df_linhas_ajustadas.select(
    "linha",
    *[
        substring("linha", start, end).alias(var)
        for var, start, end in columns
    ],
)

In [None]:
# Cast the extracted text fields to integers. A 32-bit int holds at most
# 2,147,483,647, so any field wider than 9 digits (e.g. the 11- and 13-character
# fields in the layout) can overflow and come back as NULL when cast to 'int';
# those fields are cast to 'long' instead. Columns not present in the layout
# (such as "linha") keep the original 'int' cast.
_field_widths = {name: width for name, _, width in columns}

df_linhas_ajustadas = df_linhas_ajustadas.select(
    [
        col(name).cast('long' if _field_widths.get(name, 0) > 9 else 'int')
        for name in df_linhas_ajustadas.columns
    ]
)

In [None]:
# Discard the full-record column now that the individual fields have been extracted.
raw_data = df_linhas_ajustadas.drop('linha')

In [None]:
# Convert the Spark DataFrame into a Glue DynamicFrame, the type the Glue sink below writes.
raw_data_dynamic_frame = DynamicFrame.fromDF(raw_data, glueContext, "raw_data_dynamic_frame")

In [None]:
# S3 destination (bronze layer), partitioned by V1002 and gzip-compressed.
# enableUpdateCatalog / updateBehavior ask Glue to keep the Data Catalog table
# in sync with what is written.
s3_parquet = glueContext.getSink(
    connection_type="s3",
    path="s3://micro-dados-sp-outros/bronze/2000/",
    partitionKeys=["V1002"],
    compression="gzip",
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    transformation_ctx="s3_parquet",
)

# Target database and table in the Glue Data Catalog.
s3_parquet.setCatalogInfo(
    catalogDatabase="ibge",
    catalogTableName="micro_dados_sp_2000",
)

# Output format: Glue's Parquet writer.
s3_parquet.setFormat("glueparquet")

# Write the data.
s3_parquet.writeFrame(raw_data_dynamic_frame)
