# Importa bibliotecas

In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Cria uma SparkSession

In [5]:
# Build (or reuse, if one already exists) a Spark session running in local
# mode; 'local[*]' asks Spark to use as many worker threads as logical cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Importa dados

In [6]:
# Location of the raw PNAD COVID-19 extract read by this notebook.
path_covid = '/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 3/dados/data_pnad_covid_19/manual_raw/DADOS/ano=2020/mes=10/PNAD_COVID_102020.csv'

# Read the CSV using its first line as the header and letting Spark infer
# each column's type from the data.
df_covid = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(path_covid)
)

# List the column names that were loaded.
print('df.columns :', df_covid.columns)

df.columns : ['Ano', 'UF', 'CAPITAL', 'RM_RIDE', 'V1008', 'V1012', 'V1013', 'V1016', 'Estrato', 'UPA', 'V1022', 'V1023', 'V1030', 'V1031', 'V1032', 'posest', 'A001', 'A001A', 'A001B1', 'A001B2', 'A001B3', 'A002', 'A003', 'A004', 'A005', 'A006', 'A007', 'A008', 'A009', 'B0011', 'B0012', 'B0013', 'B0014', 'B0015', 'B0016', 'B0017', 'B0018', 'B0019', 'B00110', 'B00111', 'B00112', 'B00113', 'B002', 'B0031', 'B0032', 'B0033', 'B0034', 'B0035', 'B0036', 'B0037', 'B0041', 'B0042', 'B0043', 'B0044', 'B0045', 'B0046', 'B005', 'B006', 'B007', 'B008', 'B009A', 'B009B', 'B009C', 'B009D', 'B009E', 'B009F', 'B0101', 'B0102', 'B0103', 'B0104', 'B0105', 'B0106', 'B011', 'C001', 'C002', 'C003', 'C004', 'C005', 'C0051', 'C0052', 'C0053', 'C006', 'C007', 'C007A', 'C007B', 'C007C', 'C007D', 'C007E', 'C007E1', 'C007E2', 'C007F', 'C008', 'C009', 'C009A', 'C010', 'C0101', 'C01011', 'C01012', 'C0102', 'C01021', 'C01022', 'C0103', 'C0104', 'C011A', 'C011A1', 'C011A11', 'C011A12', 'C011A2', 'C011A21', 'C011A22'

In [8]:
# Location of the value dictionary (columns CODIGO, SUB_CODIGO, DESCRICAO).
path_dicionarios = '/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 3/workspace/gold/dicionario_valores.csv'

# Read the dictionary CSV with a header row and inferred column types.
df_dicionario_valores = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(path_dicionarios)
)

# Preview the first rows of the dictionary.
df_dicionario_valores.show()

+------+----------+-------------------+
|CODIGO|SUB_CODIGO|          DESCRICAO|
+------+----------+-------------------+
|    UF|        11|           Rondônia|
|    UF|        12|               Acre|
|    UF|        13|           Amazonas|
|    UF|        14|            Roraima|
|    UF|        15|               Pará|
|    UF|        16|              Amapá|
|    UF|        17|          Tocantins|
|    UF|        21|           Maranhão|
|    UF|        22|              Piauí|
|    UF|        23|              Ceará|
|    UF|        24|Rio Grande do Norte|
|    UF|        25|            Paraíba|
|    UF|        26|         Pernambuco|
|    UF|        27|            Alagoas|
|    UF|        28|            Sergipe|
|    UF|        29|              Bahia|
|    UF|        31|       Minas Gerais|
|    UF|        32|     Espírito Santo|
|    UF|        33|     Rio de Janeiro|
|    UF|        35|          São Paulo|
+------+----------+-------------------+
only showing top 20 rows



# Junta a base Covid19 com o dicionário de códigos

In [9]:
# Replace the coded values of the survey with their descriptions from the
# value dictionary.
#
# The dictionary is collected to the driver ONCE and grouped by column code,
# instead of filtering + collecting it once per survey column (which launched
# one Spark job per column).
mapas_por_codigo = {}
for linha in df_dicionario_valores.select('CODIGO', 'SUB_CODIGO', 'DESCRICAO').collect():
    codigo = linha['CODIGO']
    sub_codigo = linha['SUB_CODIGO']
    descricao = linha['DESCRICAO']
    # Skip incomplete entries: str(None) would otherwise turn a missing code or
    # description into the literal text 'None' and write it into the data.
    if codigo is None or sub_codigo is None or descricao is None:
        continue
    # Keys and values are stored as strings because the survey columns are
    # compared as strings below; later rows win over earlier duplicates.
    mapas_por_codigo.setdefault(codigo, {})[str(sub_codigo)] = str(descricao)

# The first column is left untouched; every other column is cast to string.
colunas_intactas = df_covid.columns[:1]
colunas_codificadas = df_covid.columns[1:]

# Do all the casts in a single projection rather than one withColumn per
# column, which keeps the query plan shallow.
df_covid_com_valores = df_covid.select(
    *[col(nome) for nome in colunas_intactas],
    *[col(nome).cast('string').alias(nome) for nome in colunas_codificadas],
)

# Swap codes for descriptions, only for columns that have dictionary entries
# (replacing with an empty mapping is a no-op, so those columns are skipped).
for nome in colunas_codificadas:
    mapa = mapas_por_codigo.get(nome)
    if mapa:
        df_covid_com_valores = df_covid_com_valores.replace(to_replace=mapa, subset=[nome])

df_covid_com_valores.show(10)


+----+--------+--------------------+-------+-----+-----+-----+-----+-------+---------+------+-------+------+------------+------------+------+----+-----+-------------+-------------+------+----+------+------+--------------------+----+--------------------+------+--------------------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------------------+----+----+----+----+----+-----+-----+-----+----+--------------------+-----+--------------------+--------------------+--------------------+-----+------+------+-----+----+----+-----+--------------------+-----------+------+------+-----+------+------+-----+-----+--------------------+-----------+-------+-------+------+-------+-------+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-

# Seleciona colunas necessárias

In [10]:
# Register the DataFrame as a temporary view named 'df_covid_com_valores' so it
# can be referenced by that name in Spark SQL; an existing view with the same
# name is replaced.
df_covid_com_valores.createOrReplaceTempView('df_covid_com_valores')

In [11]:
# Query that keeps only the columns needed downstream and gives each one a
# readable alias. It reads from the temp view 'df_covid_com_valores', which
# must already be registered in this Spark session.
consulta_colunas = '''
    SELECT
      A002 as idade,
      A003 as sexo,
      A004 as cor_raca,
      B0011 as febre,
      A005 as escolaridade,
      B00111 as perda_cheiro_sabor,
      B0012 as tosse,
      B0014 as dificuldade_respirar,
      B002 as foi_hospital,
      B009A as fez_swab,
      B011 as resultado_teste,
      B0019 as fadiga,
      B007 as plano_de_saude,
      B008 as fez_teste_covid,
      B009C as coleta_sangue_furo_dedo,
      B009E as coleta_sangue_veia,
      C007A as area_trabalho,
      C011A as remuneracao,
      C013 as trabalho_remoto,
      D0031 as bolsa_familia,
      A006 as frequenta_escola,
      B00112 as dor_muscular,
      B00113 as diarreia,
      B0016 as dor_peito,
      B0031 as ficou_em_casa,
      B0033 as tomou_remedio_conta_propria,
      B0035 as recebeu_visita_sus,
      B0042 as buscou_pronto_socorro_sus_upa,
      B005 as ficou_internado,
      B006 as foi_sedado,
      B0103 as tem_asma_bronquite,
      C007C as cargo_no_trabalho,
      C007E as qtd_empregados_no_trabalho,
      C010 as remuneracao_em_todos_trabalhos,
      C012 as trabalhou_no_local_de_costume,
      C014 as contribui_inss,
      CAPITAL as capital,
      F002A2 as tem_alcool_no_domicilio,
      F002A3 as tem_mascara_no_domicilio,
      UF as unidade_federativa,
      V1013 as mes_da_pesquisa,
      V1022 as situacao_domicilio,
      V1023 as tipo_area

    FROM df_covid_com_valores
    '''

# Run the projection and preview the resulting rows.
covid_tratado = spark.sql(consulta_colunas)
covid_tratado.show()

+-----+------+--------+-----+--------------------+------------------+-----+--------------------+------------+--------+--------------------+------+--------------+---------------+-----------------------+------------------+-------------+--------------------+---------------+-------------+----------------+------------+--------+---------+-------------+---------------------------+------------------+-----------------------------+---------------+----------+------------------+--------------------+--------------------------+------------------------------+-----------------------------+--------------+--------------------+-----------------------+------------------------+------------------+---------------+------------------+---------+
|idade|  sexo|cor_raca|febre|        escolaridade|perda_cheiro_sabor|tosse|dificuldade_respirar|foi_hospital|fez_swab|     resultado_teste|fadiga|plano_de_saude|fez_teste_covid|coleta_sangue_furo_dedo|coleta_sangue_veia|area_trabalho|         remuneracao|trabalho_remoto

# Converte resultado para CSV

In [12]:
# Save the selected columns as CSV with a header row, replacing any previous
# output at this path. Spark writes a directory of part files at the given
# path rather than a single file.
(
    covid_tratado.write
    .mode('overwrite')
    .option('header', 'true')
    .csv('/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 3/workspace/gold/covid_gold.csv')
)

# Converte resultado para Parquet

In [13]:
# Save the selected columns as Parquet, replacing any previous output.
# - The path now points to the same 'workspace/gold' folder used for the value
#   dictionary and the CSV output; the original omitted the 'workspace'
#   segment, which sent the Parquet copy to a different folder.
# - The CSV-only 'header' option was dropped: Parquet stores the schema itself,
#   so that option had no effect here.
(
    covid_tratado.write
    .mode('overwrite')
    .parquet('/content/drive/MyDrive/Pós Tech/Tech Challenges/Tech Challenge 3/workspace/gold/covid_gold.parquet')
)