## Setup

In [81]:
import os

# The relative order of the two wildcard imports is kept as-is: the later one
# wins when both modules export the same name.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import numpy as np

# Build (or reuse) the local-mode Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("PySpark Tutorial")
    .getOrCreate()
)

# Report which Spark release the session is running on.
print("Spark version: ", spark.version)

Spark version:  3.5.4


In [82]:

# Declared column types for the "investing" source CSVs, keyed by the original
# (Portuguese) headers. Every field is nullable; the volume and percentage
# columns are declared as plain strings.
schema_investing_fields = StructType(
    [StructField("Data", DateType(), True)]
    + [
        StructField(price_header, FloatType(), True)
        for price_header in ("Último", "Abertura", "Máxima", "Mínima")
    ]
    + [
        StructField(text_header, StringType(), True)
        for text_header in ("Vol.", "Var%")
    ]
)

# Original CSV header -> ASCII snake_case column name.
rename_fields = dict([
    ("Data", "data"),
    ("Último", "ultimo"),
    ("Abertura", "abertura"),
    ("Máxima", "maxima"),
    ("Mínima", "minima"),
    ("Vol.", "volume"),
    ("Var%", "variacao"),
])

# Renamed price columns that have to be converted from text to float.
columns_to_float = [
    rename_fields[header]
    for header in ("Último", "Abertura", "Máxima", "Mínima")
]

# Partition column names.
partitions = ["category", "item"]

## Read

In [83]:
# Root of the project checkout. The default is the location the notebook was
# written against, so existing runs are unchanged; set the IBOV_PROJECT_ROOT
# environment variable to run the notebook from another machine or directory.
PROJECT_ROOT = os.environ.get(
    'IBOV_PROJECT_ROOT',
    '/home/lucas-nunes/workspace/Postech/challenges/2_ibov',
).rstrip('/')  # avoid a double slash when the override ends with '/'

# Bronze and silver layer directories, derived from the single root above.
BRONZE_PATH = f'{PROJECT_ROOT}/data/bronze'
SILVER_PATH = f'{PROJECT_ROOT}/data/silver'

# Directory handed to the CSV reader (trailing slash kept from the original).
INPUT_PATH = f'{BRONZE_PATH}/source_investing/'
# Path to one individual CSV file.
INPUT_PATH_SAMPLE = f'{PROJECT_ROOT}/input/data/source_investing/category=commodities/item=cobre/Dados Históricos - Cobre Futuros.csv'

# Load the CSV data under INPUT_PATH (first row is the header) and rename the
# columns according to the rename_fields mapping.
df = (
    spark.read
    .csv(INPUT_PATH, header=True)
    .withColumnsRenamed(rename_fields)
)

## Process

In [84]:
# WARNING: this cell rebinds `df` and is not idempotent. Running it a second
# time on the already-converted frame turns the floats back into text and then
# strips their decimal point (30.05 -> "30.05" -> "3005"). Re-run the read
# cell first if this cell has to be executed again.
#
# NOTE(review): 'float' is Spark's 32-bit type, so large prices keep only about
# seven significant digits. Left unchanged to preserve the output schema.

# Price columns are parsed as text that uses '.' as thousands separator and
# ',' as decimal mark: drop the dots, turn the comma into a dot, then cast.
for column in columns_to_float:
    df = df.withColumn(
        column,
        regexp_replace(regexp_replace(column, r'\.', ''), ',', '.').astype('float'),
    )

# Percentage change: strip the '%' sign and normalise the decimal comma.
df = df.withColumn(
    'variacao',
    regexp_replace(regexp_replace('variacao', '%', ''), ',', '.').astype('float'),
)

# Volume: previously only a 'K' suffix was stripped, so any value ending in
# 'M' or 'B' failed the float cast and silently became null. The column stays
# expressed in thousands (values with 'K' are unchanged), and 'M' / 'B' values
# are rescaled to that unit instead of being lost.
# NOTE(review): values with no suffix are passed through as-is, exactly as
# before; confirm whether the source ever emits them.
volume_suffix = regexp_extract(col('volume'), r'([KMB])\s*$', 1)
volume_scale = (
    when(volume_suffix == 'M', 1e3)
    .when(volume_suffix == 'B', 1e6)
    .otherwise(1.0)
)
# Parse as double so the rescaling does not add float32 rounding error.
volume_number = regexp_replace(
    regexp_replace('volume', r'[KMB]', ''), ',', '.'
).astype('double')
df = df.withColumn('volume', (volume_number * volume_scale).astype('float'))

# Dates are written as day.month.year in the source.
df = df.withColumn('data', to_date(col('data'), 'dd.MM.yyyy'))

# Keep a single row per (date, item) pair.
df = df.drop_duplicates(subset=['data', 'item'])


## Write

In [85]:
# Collect the Spark result to the driver as a pandas frame, then store it as
# a Parquet file under the silver directory.
silver_pdf = df.toPandas()
silver_pdf.to_parquet(f'{SILVER_PATH}/silver.parquet')


                                                                                

In [86]:
# Read the Parquet file back with pandas to inspect what was written.
# Note: from here on `df` is a pandas DataFrame, no longer the Spark one.
silver_parquet_path = f'{SILVER_PATH}/silver.parquet'
df = pd.read_parquet(silver_parquet_path)

In [87]:
# Show the reloaded frame as the cell's output.
df

Unnamed: 0,data,ultimo,abertura,maxima,minima,volume,variacao,category,item
0,1980-01-02,30.049999,30.049999,30.049999,30.049999,,3.44,commodities,prata
1,1980-01-03,31.049999,31.049999,31.049999,31.049999,,3.33,commodities,prata
2,1980-01-04,32.049999,32.049999,32.049999,32.049999,,3.22,commodities,prata
3,1980-01-07,33.049999,33.049999,33.049999,33.049999,,3.12,commodities,prata
4,1980-01-08,32.750000,33.974998,34.049999,32.500000,,-0.91,commodities,prata
...,...,...,...,...,...,...,...,...,...
205243,2025-03-07,126677.000000,125130.000000,127455.000000,123810.000000,72.050003,1.39,index,ibov_futuro
205244,2025-03-10,13224.190430,13295.089844,13300.889648,13150.919922,,-0.53,index,FTXIN9
205245,2025-03-10,125935.000000,126230.000000,126350.000000,124905.000000,52.130001,-0.59,index,ibov_futuro
205246,2025-03-11,13286.150391,13224.190430,13286.150391,13123.559570,,0.47,index,FTXIN9
