In [23]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("ProcessamentoSPTrans")
    .getOrCreate()
)

# Load the raw CSV: the first row is the header and schema inference is
# turned off, so every column is read as a string.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", False)
    .csv("data/raw/sptrans_linhas.csv")
)

# Quick look at the first rows.
df.show(5)

+-----+--------------------+
|   hr|                   l|
+-----+--------------------+
|21:21|{'c': '5632-10', ...|
|21:21|{'c': '2714-10', ...|
|21:21|{'c': '8400-10', ...|
|21:21|{'c': '4006-10', ...|
|21:21|{'c': '4056-10', ...|
+-----+--------------------+
only showing top 5 rows



In [27]:
from pyspark.sql import SparkSession, functions as F, types as T
import ast

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("ProcessamentoSPTrans")
    .getOrCreate()
)

# Read the raw CSV with a header row and no schema inference, so every
# column (including the dict-like "l" column) arrives as plain text.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", False)
    .csv("data/raw/sptrans_linhas.csv")
)

def parse_dict(s):
    """Parse the text of a Python dict literal into a ``{str: str}`` mapping.

    Intended to back a UDF returning a map of string keys to string values:
    keys and values of mixed types are converted with ``str`` so the result
    fits a single value type.

    Args:
        s: Text holding a Python literal, e.g. ``"{'c': '5632-10', 'cl': 1}"``.
            ``None`` and non-string inputs are tolerated.

    Returns:
        A dict whose keys are strings and whose values are strings, except
        that ``None`` values are kept as ``None`` (so they stay null rather
        than becoming the literal text ``"None"``). An empty dict is returned
        when the input cannot be parsed or does not evaluate to a dict.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures literal_eval raises for malformed input are
        # treated as "unparseable"; anything else (e.g. KeyboardInterrupt)
        # is allowed to propagate instead of being silently swallowed.
        return {}

    # A valid literal that is not a dict (list, number, string, ...) has no
    # key/value pairs to extract.
    if not isinstance(parsed, dict):
        return {}

    return {
        str(key): None if value is None else str(value)
        for key, value in parsed.items()
    }

# Expose parse_dict to Spark as a UDF producing a map<string, string> column.
parse_udf = F.udf(parse_dict, T.MapType(T.StringType(), T.StringType()))

# Keep the raw "l" text and add the parsed map beside it as "l_json".
df = df.withColumn("l_json", parse_udf(F.col("l")))

line_map = F.col("l_json")

# (map key, output column) pairs that stay as strings.
# NOTE(review): "tipo" is NULL in every row of the displayed output, so the
# "tp" key is presumably absent from these records -- verify against the data.
text_fields = [
    ("c", "codigo_linha"),
    ("lt0", "origem"),
    ("lt1", "destino"),
    ("tp", "tipo"),
]

# (map key, output column) pairs that are cast from string to integer.
int_fields = [
    ("sl", "circular"),
    ("cl", "codigo_cl"),
]

# Single projection: text columns first, then the integer casts, which gives
# the same column order as selecting the strings, casting and dropping them.
df_final = df.select(
    "hr",
    *[line_map[key].alias(name) for key, name in text_fields],
    *[line_map[key].cast(T.IntegerType()).alias(name) for key, name in int_fields],
)

df_final.show(10)

# Persist the flattened result as CSV with a header row, replacing any
# previous output at this path.
(
    df_final.write
    .mode("overwrite")
    .option("header", True)
    .csv("data/processed/dataset.csv")
)

                                                                                

+-----+------------+--------------------+-----------------+----+--------+---------+
|   hr|codigo_linha|              origem|          destino|tipo|circular|codigo_cl|
+-----+------------+--------------------+-----------------+----+--------+---------+
|21:21|     5632-10|  LGO. SÃO FRANCISCO|     VL. SÃO JOSÉ|NULL|       2|    32849|
|21:21|     2714-10|     VL. RUI BARBOSA|METRÔ VL. MATILDE|NULL|       2|    33695|
|21:21|     8400-10|PÇA. RAMOS DE AZE...|   TERM. PIRITUBA|NULL|       2|    34033|
|21:21|     4006-10|            CIRCULAR| JD. NOVA VITÓRIA|NULL|       1|     2180|
|21:21|     4056-10|    TERM. SÃO MATEUS|PQ. BOA ESPERANÇA|NULL|       2|    35005|
|21:21|     9653-10|     PÇA. DO CORREIO|     PEDRA BRANCA|NULL|       2|    33356|
|21:21|     6358-10|      TERM. BANDEIRA|         JD. LUSO|NULL|       2|    34062|
|21:21|     4310-10|TERM. PQ. D. PEDR...|    E.T. ITAQUERA|NULL|       1|     2160|
|21:21|     675K-10|     METRÔ STA. CRUZ| TERM. JD. ÂNGELA|NULL|       2|   

                                                                                