## Setup

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import numpy as np

# Create a Spark session
spark = SparkSession.builder.master("local").appName("PySpark Tutorial").getOrCreate()

# Verify Spark version
print("Spark version: ", spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/18 00:20:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version:  3.5.4


In [2]:

schema_investing_fields = StructType([
    StructField("Data", DateType(), True),
    StructField("Último", FloatType(), True),
    StructField("Abertura", FloatType(), True),
    StructField("Máxima", FloatType(), True),
    StructField("Mínima", FloatType(), True),
    StructField("Vol.", StringType(), True),
    StructField("Var%", StringType(), True),
])

columns_to_float = ['ultimo', 'abertura', 'maxima', 'minima']


rename_fields = {
    "Data": "data",
    "Último": "ultimo",
    "Abertura": "abertura",
    "Máxima": "maxima",
    "Mínima": "minima",
    "Vol.": "volume",
    "Var%": "variacao"
}

partitions = ['category', 'item']

## Read

In [None]:
INPUT_PATH = '/home/lucas-nunes/workspace/Postech/challenges/2_ibov/input/data/source_investing/'
INPUT_PATH_SAMPLE = '/home/lucas-nunes/workspace/Postech/challenges/2_ibov/input/data/source_investing/category=commodities/item=cobre/Dados Históricos - Cobre Futuros.csv'

SILVER_PATH = '/home/lucas-nunes/workspace/Postech/challenges/2_ibov/data/silver'

df = spark.read.csv(f'{INPUT_PATH}', header=True)

df = df.withColumnsRenamed(rename_fields)

                                                                                

## Process

In [4]:
for column in columns_to_float:

    df = df.withColumn(column, regexp_replace(regexp_replace(column, r'\.', ''), ',', r'\.').astype('float'))


df = df.withColumn('variacao', regexp_replace(regexp_replace('variacao', r'%', ''), ',', r'\.').astype('float'))
df = df.withColumn('volume', regexp_replace(regexp_replace('volume', r'K', ''), ',', r'\.').astype('float'))
df = df.withColumn('data', to_date(col('data'), 'dd.MM.yyyy'))

## Write

In [7]:
# df.toPandas().to_csv(f'{SILVER_PATH}/silver.csv')
df.toPandas().to_parquet(f'{SILVER_PATH}/silver.parquet')


                                                                                

In [8]:
df = pd.read_parquet(f'{SILVER_PATH}/silver.parquet')

In [12]:
df

Unnamed: 0,data,ultimo,abertura,maxima,minima,volume,variacao,category,item
0,2025-02-14,22114.689453,22038.609375,22139.429688,22010.349609,,0.38,index,ndx
1,2025-02-13,22030.710938,21786.160156,22038.150391,21758.039062,,1.43,index,ndx
2,2025-02-12,21719.259766,21475.410156,21745.660156,21454.189453,,0.12,index,ndx
3,2025-02-11,21693.519531,21629.109375,21776.250000,21625.509766,,-0.29,index,ndx
4,2025-02-10,21756.730469,21670.060547,21793.130859,21645.169922,,1.24,index,ndx
...,...,...,...,...,...,...,...,...,...
153358,2024-09-30,5753.500000,5744.200195,5765.100098,5702.399902,,0.18,index,us500
153359,2024-09-29,5743.399902,5727.399902,5744.899902,5722.399902,,0.25,index,us500
153360,2024-09-27,5729.200195,5749.200195,5763.600098,5726.500000,,-0.35,index,us500
153361,2024-09-26,5749.299805,5732.200195,5772.799805,5721.500000,,0.31,index,us500


In [None]:
df_test.select('*').orderBy('data').show

+----------+------+--------+------+------+------+--------+-----------+-----+
|      data|ultimo|abertura|maxima|minima|volume|variacao|   category| item|
+----------+------+--------+------+------+------+--------+-----------+-----+
|1980-01-02| 30.05|   30.05| 30.05| 30.05|  NULL|    3.44|commodities|prata|
|1980-01-03| 31.05|   31.05| 31.05| 31.05|  NULL|    3.33|commodities|prata|
|1980-01-04| 32.05|   32.05| 32.05| 32.05|  NULL|    3.22|commodities|prata|
|1980-01-07| 33.05|   33.05| 33.05| 33.05|  NULL|    3.12|commodities|prata|
|1980-01-08| 32.75|  33.975| 34.05|  32.5|  NULL|   -0.91|commodities|prata|
|1980-01-09|  33.5|    33.5|  33.5|  33.5|  NULL|    2.29|commodities|prata|
|1980-01-10|  34.5|    34.5|  34.5|  34.5|  NULL|    2.99|commodities|prata|
|1980-01-11|  35.5|    35.5|  35.5|  35.5|  NULL|     2.9|commodities|prata|
|1980-01-14| 863.6|   858.5| 870.7| 855.0|  NULL|    0.59|      index| us30|
|1980-01-14|  36.5|    36.5|  36.5|  36.5|  NULL|    2.82|commodities|prata|