In [21]:
!pip install python-dotenv



In [22]:
from dotenv import load_dotenv
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [23]:
# Cargo mis variables de entorno
load_dotenv()

# Genero el dict de datos para conectarme con Snowflake a schema Raw
credencialesSnowflakeAnalytics = {
    "sfURL" : os.getenv("SNOWFLAKE_URL"),
    "sfUser" :  os.getenv("SNOWFLAKE_USER"),
    "sfPassword" : os.getenv("SNOWFLAKE_PASSWORD"),
    "sfDatabase" : os.getenv("SNOWFLAKE_DATABASE"),
    "sfSchema" : os.getenv("SNOWFLAKE_SCHEMA_ANALYTICS"),
    "sfWarehouse" : os.getenv("SNOWFLAKE_WAREHOUSE"),
    "sfRole" : os.getenv("SNOWFLAKE_ROLE"),
}

print(f"Estas son mis credenciales para Snowflake con schema Analytics: {credencialesSnowflakeAnalytics}")

Estas son mis credenciales para Snowflake con schema Analytics: {'sfURL': 'DXNHVFP-EFC53756.snowflakecomputing.com', 'sfUser': 'usuario_spark', 'sfPassword': 'EstudianteEstudiante64', 'sfDatabase': 'NY_TAXI', 'sfSchema': 'ANALYTICS', 'sfWarehouse': 'WAREHOUSE_TAXIS', 'sfRole': 'rol_pocos_privilegios'}


In [24]:
# Creo SparkSession para conexión con Snowflake
spark = (SparkSession.builder.appName("IngestaNewYorkTaxis").config("spark.jars.packages", "net.snowflake:snowflake-jdbc:3.13.30,net.snowflake:spark-snowflake_2.12:2.9.0-spark_3.1").getOrCreate())

print(spark)
print("Spark Version : " + spark.version)

# Ejecuto una query de prueba para validar comunicacion con Snowflake
query = "SELECT current_version()"

df = spark.read.format("snowflake").options(**credencialesSnowflakeAnalytics).option("query", query).load()

df.show()

<pyspark.sql.session.SparkSession object at 0x757c51e50910>
Spark Version : 3.5.0
+-------------------+
|"CURRENT_VERSION()"|
+-------------------+
|             9.32.1|
+-------------------+



In [25]:
try:
    df_obt_taxis = spark.read.format("snowflake") \
                .options(**credencialesSnowflakeAnalytics) \
                .option("dbtable", "NY_TAXI_OBT_ULTIMATE") \
                .load()
    print("Se cargo correctamente datos de tabla OBT para consultas")
    print(df_obt_taxis.count())
except Exception as e:
    print(f"No se pudo cargar tabla OBT:{e}")

Se cargo correctamente datos de tabla OBT para consultas
813032013


In [6]:
#Top 10 zonas de pickup por volumen mensual.

df_top_10_pickup=df_obt_taxis.groupBy("MONTH","PU_ZONE").agg(F.count("*").alias("Viajes_Mensuales_Por_Zona")).orderBy("Viajes_Mensuales_Por_Zona",ascending=False).limit(10)
df_top_10_pickup.show(truncate=False)

+-----+---------------------+-------------------------+
|MONTH|PU_ZONE              |Viajes_Mensuales_Por_Zona|
+-----+---------------------+-------------------------+
|5    |Upper East Side South|2912807                  |
|1    |Upper East Side South|2825330                  |
|4    |Upper East Side South|2781522                  |
|3    |Upper East Side South|2781380                  |
|10   |Upper East Side South|2757085                  |
|3    |Midtown Center       |2747117                  |
|1    |Upper East Side North|2702733                  |
|2    |Upper East Side South|2698617                  |
|6    |Upper East Side South|2656560                  |
|1    |Midtown Center       |2655556                  |
+-----+---------------------+-------------------------+



In [7]:
#Top 10 zonas de dropoff por volumen mensual.

df_top_10_dropoff=df_obt_taxis.groupBy("MONTH","DO_ZONE").agg(F.count("*").alias("Viajes_Mensuales_Por_Zona")).orderBy("Viajes_Mensuales_Por_Zona",ascending=False).limit(10)
df_top_10_dropoff.show(truncate=False)

+-----+---------------------+-------------------------+
|MONTH|DO_ZONE              |Viajes_Mensuales_Por_Zona|
+-----+---------------------+-------------------------+
|1    |Upper East Side North|2849889                  |
|5    |Upper East Side North|2756013                  |
|3    |Upper East Side North|2697999                  |
|2    |Upper East Side North|2680467                  |
|4    |Upper East Side North|2671567                  |
|10   |Upper East Side North|2634008                  |
|3    |Midtown Center       |2619012                  |
|5    |Upper East Side South|2589130                  |
|1    |Upper East Side South|2528147                  |
|1    |Midtown Center       |2519808                  |
+-----+---------------------+-------------------------+



In [8]:
#Evolución mensual de total_amount y tip_pct por borough.

df_amount_tip_por_mes=df_obt_taxis.groupBy("MONTH","PU_BOROUGH").agg(F.sum("TOTAL_AMOUNT").alias("Total_Amount_Mensual_Por_Borough"),F.avg("TIP_PCT").alias("Tip_Percentage_Avg_Por_Mes"))
df_amount_tip_por_mes.show(truncate=False)

+-----+-------------+--------------------------------+--------------------------+
|MONTH|PU_BOROUGH   |Total_Amount_Mensual_Por_Borough|Tip_Percentage_Avg_Por_Mes|
+-----+-------------+--------------------------------+--------------------------+
|11   |Queens       |2.1357763931000906E8            |9.122942264420072         |
|4    |Unknown      |1.351250497999989E7             |10.393555994135419        |
|5    |Staten Island|132764.51                       |4.843937952329194         |
|1    |N/A          |576414.9500000003               |8.145481499495833         |
|1    |Bronx        |5831635.650000129               |2.1747138718037875        |
|1    |Manhattan    |9.519660447390596E8             |10.715967860219795        |
|5    |N/A          |655094.3800000006               |8.262361418609373         |
|4    |Staten Island|143854.34000000003              |5.3619165448994           |
|4    |Queens       |2.383825879000098E8             |8.890324087934674         |
|5    |Bronx    

In [9]:
#Ticket promedio (avg total_amount) por service_type y mes.

df_ticket_promedio=df_obt_taxis.groupBy("MONTH","SOURCE_SERVICE").agg(F.avg("TOTAL_AMOUNT").alias("Total_Amount_Promedio"))
df_ticket_promedio.show(truncate=False)

+-----+--------------+---------------------+
|MONTH|SOURCE_SERVICE|Total_Amount_Promedio|
+-----+--------------+---------------------+
|2    |yellow        |17.68252079724796    |
|6    |yellow        |19.067894111715923   |
|10   |yellow        |18.777449144136156   |
|3    |yellow        |18.266701799678643   |
|6    |green         |15.539750050668545   |
|12   |yellow        |18.603408843212648   |
|10   |green         |15.424860715358566   |
|1    |green         |14.774939511845796   |
|1    |yellow        |17.299166892871135   |
|5    |green         |15.566530356254617   |
|3    |green         |14.955538694316838   |
|4    |green         |15.147751741305306   |
|12   |green         |15.053053966331479   |
|5    |yellow        |19.09619490295871    |
|4    |yellow        |18.503571447240827   |
|2    |green         |14.763832264674505   |
|8    |green         |15.508258907639338   |
|11   |green         |15.204317067416701   |
|8    |yellow        |18.84944809830649    |
|11   |yel

In [10]:
#Viajes por hora del día y día de semana (picos).

df_viajes_hora_dia=df_obt_taxis.groupBy("DAY_OF_WEEK","PICKUP_HOUR").agg(F.count("*").alias("Viajes_Totales"))
df_viajes_hora_dia.show(truncate=False)

+-----------+-----------+--------------+
|DAY_OF_WEEK|PICKUP_HOUR|Viajes_Totales|
+-----------+-----------+--------------+
|3          |8          |6198868       |
|4          |21         |7281791       |
|3          |20         |6984362       |
|6          |16         |6016595       |
|7          |4          |1846307       |
|5          |9          |6123703       |
|5          |8          |6333495       |
|4          |11         |5832135       |
|5          |16         |6003255       |
|4          |13         |6115441       |
|7          |17         |6782953       |
|1          |22         |4152874       |
|4          |7          |5138664       |
|6          |20         |7056103       |
|1          |3          |3272816       |
|6          |0          |4235603       |
|3          |2          |735436        |
|4          |8          |6350295       |
|6          |1          |2702549       |
|1          |6          |1104567       |
+-----------+-----------+--------------+
only showing top

In [11]:
#p50/p90 de trip_duration_min por borough de pickup.

df_percentiles = df_obt_taxis.groupBy("PU_BOROUGH").agg(F.expr('percentile_approx(TRIP_DURATION_MIN, 0.5)').alias('P50_TRIP_DURATION_MIN'),F.expr('percentile_approx(TRIP_DURATION_MIN, 0.9)').alias('P90_TRIP_DURATION_MIN'))
df_percentiles.show(truncate=False)

+-------------+---------------------+---------------------+
|PU_BOROUGH   |P50_TRIP_DURATION_MIN|P90_TRIP_DURATION_MIN|
+-------------+---------------------+---------------------+
|Queens       |25.2                 |54.483333333333334   |
|EWR          |8.316666666666666    |56.06666666666667    |
|Unknown      |11.416666666666666   |28.866666666666667   |
|Brooklyn     |12.466666666666667   |30.55                |
|Staten Island|28.316666666666666   |74.18333333333334    |
|N/A          |12.016666666666667   |41.4                 |
|Manhattan    |10.783333333333333   |24.866666666666667   |
|Bronx        |12.583333333333334   |35.7                 |
+-------------+---------------------+---------------------+



In [12]:
#avg_speed_mph por franja horaria (6–9, 17–20) y borough.

df_avg_speed_franja_hora= df_obt_taxis.groupBy("PU_BOROUGH").agg(F.avg(F.when((F.col("PICKUP_HOUR") >= 6) & (F.col("PICKUP_HOUR") <= 9), F.col("AVG_SPEED_MPH"))).alias("AVG_SPEED_MPH_FRANJA_6_9"),
                                                                 F.avg(F.when((F.col("PICKUP_HOUR") >= 17) & (F.col("PICKUP_HOUR") <= 20), F.col("AVG_SPEED_MPH"))).alias("AVG_SPEED_MPH_FRANJA_17_20"))
df_avg_speed_franja_hora.show(truncate=False)                                                               

+-------------+------------------------+--------------------------+
|PU_BOROUGH   |AVG_SPEED_MPH_FRANJA_6_9|AVG_SPEED_MPH_FRANJA_17_20|
+-------------+------------------------+--------------------------+
|Queens       |18.440068232894614      |18.842744858808093        |
|EWR          |20.86101789368879       |21.272504425523284        |
|Unknown      |11.951832673137515      |10.889114209008534        |
|Brooklyn     |13.106931697663832      |11.261717155784128        |
|Staten Island|20.520584615984472      |20.458713644964973        |
|N/A          |18.808947092660794      |17.25272758113177         |
|Manhattan    |11.402220311166971      |9.998196846171018         |
|Bronx        |13.459353612523852      |12.569569889951303        |
+-------------+------------------------+--------------------------+



In [13]:
#Participación por payment_type_desc y su relación con tip_pct.

df_payment_tip= df_obt_taxis.groupBy("PAYMENT_TYPE_NORMALIZADO").agg(F.avg("TIP_PCT").alias("TIP_PERCENTAGE_PROMEDIO"))
df_payment_tip.show(truncate=False)

+------------------------+-----------------------+
|PAYMENT_TYPE_NORMALIZADO|TIP_PERCENTAGE_PROMEDIO|
+------------------------+-----------------------+
|Unknown                 |0.13478461581594878    |
|Cash                    |4.926361093605739E-4   |
|Dispute                 |0.021482159622660927   |
|No charge               |0.01650348833553887    |
|Credit card             |15.08882316859497      |
+------------------------+-----------------------+



In [14]:
#¿Qué rate_code_desc concentran mayor trip_distance y total_amount?

df_rate_code= df_obt_taxis.groupBy("RATE_CODE_NORMALIZADO").agg(F.sum("TRIP_DISTANCE").alias("TRIP_DISTANCE_ACUMULADO"),F.sum("TOTAL_AMOUNT").alias("TOTAL_AMOUNT_ACUMULADO"))
df_rate_code.show(truncate=False)

+---------------------+-----------------------+----------------------+
|RATE_CODE_NORMALIZADO|TRIP_DISTANCE_ACUMULADO|TOTAL_AMOUNT_ACUMULADO|
+---------------------+-----------------------+----------------------+
|Negotiated fare      |2.5632549419999998E7   |1.5371499788999826E8  |
|Nassau or Westchester|1.3144986409999996E7   |6.842887901999949E7   |
|Unknown              |9013121.389999995      |4.031903173000019E7   |
|Newark               |2.7214186490000002E7   |1.555342989599984E8   |
|Standard rate        |2.0815318190199568E9   |1.310971785348881E10  |
|Group ride           |3377.8                 |41445.009999999966    |
|JFK                  |3.3018326846000004E8   |1.314904539819577E9   |
+---------------------+-----------------------+----------------------+



In [15]:
#Mix yellow vs green por mes y borough.

df_yellow_green= df_obt_taxis.groupBy("PU_BOROUGH","SOURCE_SERVICE").agg(F.count("*").alias("VIAJES_TOTALES"))
df_yellow_green.show(truncate=False)

+-------------+--------------+--------------+
|PU_BOROUGH   |SOURCE_SERVICE|VIAJES_TOTALES|
+-------------+--------------+--------------+
|Unknown      |green         |34919         |
|N/A          |yellow        |133097        |
|EWR          |yellow        |5572          |
|Staten Island|green         |12716         |
|Queens       |green         |17961921      |
|Queens       |yellow        |49361792      |
|Brooklyn     |green         |21980562      |
|Manhattan    |yellow        |680377308     |
|Manhattan    |green         |20732230      |
|N/A          |green         |17572         |
|Bronx        |yellow        |780195        |
|Unknown      |yellow        |8731149       |
|Bronx        |green         |3169121       |
|Brooklyn     |yellow        |9711718       |
|Staten Island|yellow        |21982         |
|EWR          |green         |159           |
+-------------+--------------+--------------+



In [16]:
#Top 20 flujos PU→DO por volumen y su ticket promedio.

df_pu_do= df_obt_taxis.groupBy("PU_LOCATION_ID","DO_LOCATION_ID").agg(F.count("*").alias("VIAJES_TOTALES"),F.avg("TOTAL_AMOUNT").alias("TICKET_PROMEDIO")).orderBy("VIAJES_TOTALES",ascending=False)
df_pu_do.show(20)

+--------------+--------------+--------------+------------------+
|PU_LOCATION_ID|DO_LOCATION_ID|VIAJES_TOTALES|   TICKET_PROMEDIO|
+--------------+--------------+--------------+------------------+
|           264|           264|       7068810|16.568227703118726|
|           237|           236|       4382649|10.318373517931079|
|           236|           237|       3738587|11.110493239822585|
|           236|           236|       3352999| 8.405734481278307|
|           237|           237|       3215383| 8.940615401024205|
|           239|           238|       1955272| 8.888066831621796|
|           239|           142|       1939879| 9.415928034685882|
|           237|           161|       1878751|12.110161575427949|
|           237|           162|       1877525| 10.69889138093909|
|           142|           239|       1857159| 9.877218024950237|
|           161|           237|       1798585|11.741787694213826|
|           238|           239|       1683858|  8.83833200305508|
|         

In [17]:
#Distribución de passenger_count y efecto en total_amount.

df_passengers_and_amount= df_obt_taxis.groupBy("PASSENGER_COUNT").agg(F.avg("TOTAL_AMOUNT").alias("TOTAL_AMOUNT_PROMEDIO"))
df_passengers_and_amount.show(truncate=False)

+---------------+---------------------+
|PASSENGER_COUNT|TOTAL_AMOUNT_PROMEDIO|
+---------------+---------------------+
|1              |18.073842688217383   |
|9              |75.23325092707043    |
|4              |19.660885126454446   |
|7              |50.32720714285714    |
|8              |57.46265167007499    |
|3              |18.873019687466034   |
|2              |19.42674953052388    |
|5              |17.036894951985776   |
|6              |16.880792399920196   |
+---------------+---------------------+



In [18]:
#Impacto de tolls_amount y congestion_surcharge por zona.

df_impacto_zona = (df_obt_taxis.groupBy("PU_ZONE").agg(F.avg("TOLLS_AMOUNT").alias("TOLLS_AMOUNT_PROMEDIO"),F.avg("CONGESTION_SURCHARGE").alias("CONGESTION_SURCHARGE_PROMEDIO"),
F.avg("TOTAL_AMOUNT").alias("TOTAL_AMOUNT_PROMEDIO")).withColumn("PCT_TOLLS_SOBRE_TOTAL", F.col("TOLLS_AMOUNT_PROMEDIO") / F.col("TOTAL_AMOUNT_PROMEDIO") * 100)
    .withColumn("PCT_CONGESTION_OVER_TOTAL", F.col("CONGESTION_SURCHARGE_PROMEDIO") / F.col("TOTAL_AMOUNT_PROMEDIO") * 100)
    .orderBy(F.desc("TOLLS_AMOUNT_PROMEDIO")))
df_impacto_zona.show(truncate=False)

+---------------------------------+---------------------+-----------------------------+---------------------+---------------------+-------------------------+
|PU_ZONE                          |TOLLS_AMOUNT_PROMEDIO|CONGESTION_SURCHARGE_PROMEDIO|TOTAL_AMOUNT_PROMEDIO|PCT_TOLLS_SOBRE_TOTAL|PCT_CONGESTION_OVER_TOTAL|
+---------------------------------+---------------------+-----------------------------+---------------------+---------------------+-------------------------+
|Arden Heights                    |12.356260213702077   |0.14430604982206405          |83.58190446260215    |14.783415493039831   |0.17265226336955777      |
|Arrochar/Fort Wadsworth          |8.432450791200312    |0.13073593073593073          |35.198984947896605   |23.956516938435783   |0.3714196046546597       |
|Bloomfield/Emerson Hill          |7.14049686847599     |0.05121055495103373          |71.09724217119       |10.043282482438482   |0.07202889083619793      |
|Charleston/Tottenville           |6.998537586860397

In [26]:
#Proporción de viajes cortos vs largos por borough y estacionalidad.

df_proporcion_por_duracion = df_obt_taxis.groupBy("PU_BOROUGH", "MONTH").agg(F.count("*").alias("CANTIDAD_VIAJES"), F.count(F.when(F.col("TRIP_DURATION_MIN") <= 10, 1)).alias("CANTIDAD_VIAJES_CORTOS"), 
                                                                                F.count(F.when(F.col("TRIP_DURATION_MIN") > 10, 1)).alias("CANTIDAD_VIAJES_LARGOS")).withColumn("PCT_VIAJES_CORTOS", (F.col("CANTIDAD_VIAJES_CORTOS") / F.col("CANTIDAD_VIAJES"))*100).withColumn("PCT_VIAJES_LARGOS", (F.col("CANTIDAD_VIAJES_LARGOS") / F.col("CANTIDAD_VIAJES"))*100) 
df_proporcion_por_duracion.show(truncate=False)

+-------------+-----+---------------+----------------------+----------------------+------------------+------------------+
|PU_BOROUGH   |MONTH|CANTIDAD_VIAJES|CANTIDAD_VIAJES_CORTOS|CANTIDAD_VIAJES_LARGOS|PCT_VIAJES_CORTOS |PCT_VIAJES_LARGOS |
+-------------+-----+---------------+----------------------+----------------------+------------------+------------------+
|Queens       |4    |5773641        |1157214               |4616427               |20.043054287580404|79.9569457124196  |
|Staten Island|4    |3247           |776                   |2471                  |23.89898367724053 |76.10101632275948 |
|Bronx        |1    |339509         |138207                |201302                |40.707904650539454|59.29209534946055 |
|Manhattan    |8    |52599774       |24528297              |28071477              |46.63194370378854 |53.36805629621146 |
|Manhattan    |4    |61393463       |28088438              |33305025              |45.75151266511876 |54.24848733488123 |
|Bronx        |4    |371

In [27]:
#Diferencias por vendor en avg_speed_mph y trip_duration_min.

df_vendor_speed_duration= df_obt_taxis.groupBy("VENDOR_NORMALIZADO").agg(F.avg("AVG_SPEED_MPH").alias("SPEED_PROMEDIO"), F.avg("TRIP_DURATION_MIN").alias("TRIP_DURATION_PROMEDIO"))
df_vendor_speed_duration.show(truncate=False)

+----------------------------------+------------------+----------------------+
|VENDOR_NORMALIZADO                |SPEED_PROMEDIO    |TRIP_DURATION_PROMEDIO|
+----------------------------------+------------------+----------------------+
|Creative Mobile Technologies,  LLC|11.314623752064605|14.406576203117869    |
|Curb Mobility, LLC                |11.807734554353639|14.6699590657767      |
|Unknown                           |10.62233270937165 |14.334596274602358    |
+----------------------------------+------------------+----------------------+



In [28]:
#Relación método de pago ↔ tip_amount por hora.

df_pago_tip_amount= df_obt_taxis.groupBy("PAYMENT_TYPE_NORMALIZADO").agg(F.avg("TIP_AMOUNT").alias("TIP_AMOUNT_PROMEDIO"))
df_pago_tip_amount.show(truncate=False)

+------------------------+---------------------+
|PAYMENT_TYPE_NORMALIZADO|TIP_AMOUNT_PROMEDIO  |
+------------------------+---------------------+
|Unknown                 |0.5674839400428267   |
|Cash                    |1.2270949366258144E-4|
|Dispute                 |0.010330252819326303 |
|No charge               |0.004960965692320069 |
|Credit card             |3.0108076514459885   |
+------------------------+---------------------+



In [30]:
#Zonas con percentil 99 de duración/distancia fuera de rango (posible congestión/eventos).

percentiles = df_obt_taxis.select(F.expr("percentile_approx(TRIP_DURATION_MIN, 0.99)").alias("P99_TRIP_DURATION_MIN"),F.expr("percentile_approx(TRIP_DISTANCE, 0.99)").alias("P99_TRIP_DISTANCE")).collect()[0]
p99_duration = percentiles["P99_TRIP_DURATION_MIN"]
p99_distance = percentiles["P99_TRIP_DISTANCE"]
df_fuera_rango = df_obt_taxis.filter((F.col("TRIP_DURATION_MIN") > p99_duration) | (F.col("TRIP_DISTANCE") > p99_distance))
df_zonas_fuera_rango = df_fuera_rango.groupBy("PU_ZONE").agg(F.count("*").alias("CANTIDAD_VIAJES_FUERA_DE_RANGO")).orderBy(F.col("CANTIDAD_VIAJES_FUERA_DE_RANGO").desc())
df_zonas_fuera_rango.show(10, truncate=False)

+----------------------------+------------------------------+
|PU_ZONE                     |CANTIDAD_VIAJES_FUERA_DE_RANGO|
+----------------------------+------------------------------+
|JFK Airport                 |7444644                       |
|LaGuardia Airport           |995799                        |
|Times Sq/Theatre District   |330765                        |
|Midtown Center              |222279                        |
|Midtown North               |189843                        |
|Clinton East                |162603                        |
|Midtown East                |152138                        |
|Upper West Side South       |150582                        |
|N/A                         |150220                        |
|Penn Station/Madison Sq West|144079                        |
+----------------------------+------------------------------+
only showing top 10 rows



In [31]:
#Yield por milla (total_amount/trip_distance) por borough y hora.

df_yield = df_obt_taxis.withColumn("YIELD_POR_MILLA", F.col("TOTAL_AMOUNT") / F.col("TRIP_DISTANCE"))
df_yield_por_borough_hora = (df_yield.groupBy("PU_BOROUGH", "PICKUP_HOUR").agg(F.avg("YIELD_POR_MILLA").alias("YIELD_PROMEDIO_POR_MILLA"),F.count("*").alias("CANTIDAD_VIAJES")))
df_yield_por_borough_hora.show(truncate=False) 

+-------------+-----------+------------------------+---------------+
|PU_BOROUGH   |PICKUP_HOUR|YIELD_PROMEDIO_POR_MILLA|CANTIDAD_VIAJES|
+-------------+-----------+------------------------+---------------+
|Queens       |4          |7.129948289804233       |740330         |
|Staten Island|4          |18.660552876666788      |475            |
|Bronx        |1          |8.620889505930553       |85505          |
|Manhattan    |8          |9.069458782895286       |31371398       |
|N/A          |9          |76.8022947313159        |5957           |
|N/A          |21         |101.65338317367672      |7352           |
|Bronx        |16         |8.27834723828453        |237885         |
|Bronx        |14         |7.320456074838179       |222777         |
|Manhattan    |4          |6.875554490303124       |5861274        |
|Brooklyn     |17         |8.379383851448962       |1590087        |
|Bronx        |4          |7.690969390272295       |53712          |
|Unknown      |5          |9.45329

In [33]:
#Cambios YoY en volumen y ticket promedio por service_type.
from pyspark.sql import Window
df_service_year = (df_obt_taxis.groupBy("SOURCE_SERVICE", "YEAR").agg(F.count("*").alias("CANTIDAD_VIAJES"),F.avg("TOTAL_AMOUNT").alias("PROMEDIO_TOTAL_AMOUNT")))
window_spec = Window.partitionBy("SOURCE_SERVICE").orderBy("YEAR")
df_yoy = (df_service_year.withColumn("CAMBIO_VIAJES_YOY", (F.col("CANTIDAD_VIAJES") - F.lag("CANTIDAD_VIAJES").over(window_spec)) / F.lag("CANTIDAD_VIAJES").over(window_spec) * 100).withColumn("CAMBIO_TICKET_YOY", (F.col("PROMEDIO_TOTAL_AMOUNT") - F.lag("PROMEDIO_TOTAL_AMOUNT").over(window_spec)) / F.lag("PROMEDIO_TOTAL_AMOUNT").over(window_spec) * 100))
df_yoy.show(truncate=False)

+--------------+----+---------------+---------------------+-------------------+--------------------+
|SOURCE_SERVICE|YEAR|CANTIDAD_VIAJES|PROMEDIO_TOTAL_AMOUNT|CAMBIO_VIAJES_YOY  |CAMBIO_TICKET_YOY   |
+--------------+----+---------------+---------------------+-------------------+--------------------+
|green         |2015|18689108       |14.835134619584023   |NULL               |NULL                |
|green         |2016|15863890       |14.663376017488199   |-15.116922648207714|-1.157782564838232  |
|green         |2017|11404390       |14.301719276536708   |-28.111011864051   |-2.4663947819394547 |
|green         |2018|8549212        |15.879904067192971   |-25.035780081179265|11.034930557233217  |
|green         |2019|5390279        |15.99642754670758    |-36.949990244714954|0.733779492757388   |
|green         |2020|1119714        |15.6341909273239     |-79.2271605978095  |-2.2644844814630876 |
|green         |2021|607051         |18.924640615037756   |-45.78517371400197 |21.046498043

In [36]:
#Días con alta congestion_surcharge: efecto en total_amount vs días “normales”

#Se pone valor como umbral
df_congestion = df_obt_taxis.withColumn("TIPO_DIA_CONGESTION",F.when(F.col("CONGESTION_SURCHARGE") > 2.5, F.lit("ALTA")).otherwise(F.lit("NORMAL")))
df_efecto_congestion = (df_congestion.groupBy("TIPO_DIA_CONGESTION").agg( F.count("*").alias("CANTIDAD_VIAJES"),F.avg("TOTAL_AMOUNT").alias("PROMEDIO_TOTAL_AMOUNT")))
df_efecto_congestion.show(truncate=False)

+-------------------+---------------+---------------------+
|TIPO_DIA_CONGESTION|CANTIDAD_VIAJES|PROMEDIO_TOTAL_AMOUNT|
+-------------------+---------------+---------------------+
|ALTA               |1771914        |25.323585094985404   |
|NORMAL             |811260099      |18.240499993314376   |
+-------------------+---------------+---------------------+

