In [1]:
import logging
import sys
import os

# Create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Clear any existing handlers to avoid duplicates
if logger.handlers:
    logger.handlers.clear()

# File handler - logs to file
log_path = os.path.join(os.path.dirname(os.getcwd()), 'logs', 'eda.log')
file_handler = logging.FileHandler(log_path, mode='a')
file_handler.setLevel(logging.INFO)
file_formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)

# Console handler - minimal output to cell (optional)
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.WARNING)  # Only show warnings/errors in notebook
console_formatter = logging.Formatter('%(levelname)s: %(message)s')
console_handler.setFormatter(console_formatter)
logger.addHandler(console_handler)

# Helper function for flushing
def log_and_flush(message, level='info'):
    if level == 'info':
        logger.info(message)
    elif level == 'warning':
        logger.warning(message)
    elif level == 'error':
        logger.error(message)
    
    for handler in logger.handlers:
        handler.flush()

print("✓ Logger configured - logs will be written to logs/eda.log")

✓ Logger configured - logs will be written to logs/eda.log


In [2]:
from pyspark.sql import SparkSession
import importlib.util
import os

# ========== LOAD CONFIG FIRST ==========
src_path = os.path.join(os.path.dirname(os.getcwd()), 'src')
config_file = os.path.join(src_path, 'config.py')

spec = importlib.util.spec_from_file_location("config", config_file)
config_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(config_module)

Config = config_module.Config
print("✓ Config loaded")

# ========== HELPER FUNCTION ==========
def log_and_flush(message):
    logger.info(message)
    for handler in logger.handlers:
        handler.flush()

# ========== STOP EXISTING SPARK ==========
try:
    spark.stop()
    log_and_flush("Stopped existing Spark session")
except:
    log_and_flush("No existing Spark session to stop")

# ========== CREATE SPARK SESSION ==========
log_and_flush(f"Creating Spark session: {Config.APP_NAME}")

spark = SparkSession.builder \
    .appName(Config.APP_NAME) \
    .config("spark.driver.memory", Config.SPARK_DRIVER_MEMORY) \
    .config("spark.executor.memory", Config.SPARK_EXECUTOR_MEMORY) \
    .config("spark.executor.instances", Config.SPARK_EXECUTOR_INSTANCES) \
    .config("spark.executor.cores", Config.SPARK_EXECUTOR_CORES) \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262") \
    .getOrCreate()

log_and_flush(f"Spark session created successfully (version {spark.version})")

# ========== CONFIGURE HADOOP FOR MINIO ==========
log_and_flush("Configuring Hadoop for MinIO")

hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.endpoint", Config.MINIO_ENDPOINT)
hadoop_conf.set("fs.s3a.access.key", Config.MINIO_ACCESS_KEY)
hadoop_conf.set("fs.s3a.secret.key", Config.MINIO_SECRET_KEY)
hadoop_conf.set("fs.s3a.path.style.access", "true")
hadoop_conf.set("fs.s3a.connection.ssl.enabled", "false")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

log_and_flush(f"Spark configured for environment: {Config.ENVIRONMENT}")
log_and_flush(f"Using MinIO endpoint: {Config.MINIO_ENDPOINT}")
log_and_flush(f"Reading from bucket: {Config.S3_BUCKET_NAME}")

print("\n" + "="*50)
print("✓ Spark Session Ready")
print("="*50)

# Display configuration (on its own line!)
Config.display_config()

✓ Config loaded

✓ Spark Session Ready
Current Configuration:
App Name: NYC Taxi EDA
Environment: development
MinIO Endpoint: http://minio:9000
S3 Bucket: nyc-taxi
Spark Driver Memory: 3g
Spark Executor Memory: 3g
Spark Executor Instances: 3
Spark Executor Cores: 2
Log Level: INFO
Log File: eda.log


# EDA

In [3]:
logger.info("Starting data load from S3")
df = spark.read.parquet("s3a://nyc-taxi/Initial_DF/")
logger.info(f"Data loaded successfully. Total rows: {df.count()}")

In [4]:
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+---------------------+------------------+--------------------+-------------------+-------------------+
|summary|          VendorID|   passenger_count|     trip_distance|        RatecodeID|store_and_fwd_flag|      PULocationID|      DOLocationID|      payment_type|       fare_amount|             extra|           mta_tax|       tip_amount|     tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|        Airport_fee| cbd_congestion_fee|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+---------------------+----------

We can see three problems with the data through the summary statistics - 
1. Many rows don't contain any value in the passenger count column.
2. The min values of fees and amount-related columns are negative.
3. The max values for the trip_distance and fare_amount columns are too high.
4. Zero trip_distance and fare_amount values.

### Going through the rows with null values in the passenger count column.

In [32]:
from pyspark.sql.functions import col

df_null_passenger = df.filter(col("passenger_count").isNull())
df_null_passenger.describe().show()

+-------+------------------+---------------+------------------+----------+------------------+------------------+------------------+------------+------------------+------------------+------------------+------------------+------------------+---------------------+------------------+--------------------+-----------+------------------+
|summary|          VendorID|passenger_count|     trip_distance|RatecodeID|store_and_fwd_flag|      PULocationID|      DOLocationID|payment_type|       fare_amount|             extra|           mta_tax|        tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|Airport_fee|cbd_congestion_fee|
+-------+------------------+---------------+------------------+----------+------------------+------------------+------------------+------------+------------------+------------------+------------------+------------------+------------------+---------------------+------------------+--------------------+-----------+------------------+
|

In [33]:
df_null_passenger.groupBy("VendorID").count().show()

+--------+--------+
|VendorID|   count|
+--------+--------+
|       1| 2391564|
|       2|10333323|
|       6|   19291|
+--------+--------+



In [34]:
df_null_passenger.groupBy("PULocationID").count().show()

+------------+------+
|PULocationID| count|
+------------+------+
|         148|246081|
|         243| 30060|
|          31|   923|
|         137|180374|
|          85| 13129|
|          65| 43493|
|         255| 84605|
|          53|  2760|
|         133|  8166|
|          78|  6104|
|         155|  6324|
|         108|  2026|
|         211|137590|
|         193| 15606|
|          34|  6386|
|         101|  2033|
|         126|  5921|
|          81|  3948|
|          28|  9754|
|         210|  4470|
+------------+------+
only showing top 20 rows



In [35]:
df_null_passenger.groupBy("DOLocationID").count().show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         148|236461|
|         243| 85820|
|          31|  1774|
|         137|179076|
|          85| 13306|
|          65| 22668|
|         255| 41330|
|          53|  6211|
|         133|  7503|
|          78|  7578|
|         155|  7884|
|         108|  3590|
|         211|125434|
|         193| 11754|
|          34|  4322|
|         126|  6356|
|         101|  2569|
|          81|  4304|
|          28|  9795|
|         210|  6891|
+------------+------+
only showing top 20 rows



This shows there is not a pattern with null values for a specific vendor or location. Therefore we can go ahead with removing these rows with null values.

In [5]:
df = df.dropna()
df.count()

98292206

### For tipping prediction/analysis, negative amounts are not normal “real trips”, so we are going to exclude them

In [8]:
from pyspark.sql.functions import col

df = df.filter(
    (col("fare_amount") >= 0) &
    (col("trip_distance") >= 0) &
    (col("total_amount") >= 0) &
    (col("tip_amount") >= 0) &
    (col("extra") >= 0) &
    (col("mta_tax") >= 0) &
    (col("tolls_amount") >= 0) &
    (col("improvement_surcharge") >= 0) &
    (col("congestion_surcharge") >= 0) &
    (col("Airport_fee") >= 0) &
    (col("cbd_congestion_fee") >= 0)
)

# df.describe().show()

### Now, we will do the outlier analysis for trip distance

In [60]:
df.select("trip_distance").describe().show()

+-------+------------------+
|summary|     trip_distance|
+-------+------------------+
|  count|          96746457|
|   mean|3.5408033883862964|
| stddev| 76.14819849332225|
|    min|               0.0|
|    max|          161726.1|
+-------+------------------+



In [61]:
q1, median, q3 = df.approxQuantile("trip_distance", [0.25, 0.5, 0.75], 0.01)

iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

print("Q1:", q1)
print("Median:", median)
print("Q3:", q3)
print("IQR:", iqr)
print("Lower Bound:", lower)
print("Upper Bound:", upper)

Q1: 1.02
Median: 1.72
Q3: 3.28
IQR: 2.26
Lower Bound: -2.3699999999999997
Upper Bound: 6.67


In [62]:
from pyspark.sql.functions import col

outliers_df = df.filter((col("trip_distance") < lower) | (col("trip_distance") > upper))
normal_df   = df.filter((col("trip_distance") >= lower) & (col("trip_distance") <= upper))

print("Normal Rows:", normal_df.count())
print("Outlier Rows:", outliers_df.count())

Normal Rows: 83708873
Outlier Rows: 13037584


The IQR method won't be applicable here since the outlier rows are in millions. So, we will take 100 as the outlier distance.

In [66]:
long_distance_df = df.filter(col("trip_distance") > 100)

print("Long Distance Rows:", long_distance_df.count())

High Distance Rows: 2012


In [67]:
total_rows = df.count()
long_distance_rows = long_distance_df.count()

print("Long Distance Rows %:", (long_distance_rows / total_rows) * 100)

High Distance Rows %: 0.0020796627208787607


100 seems like a good value to take as an outlier. Therefore, we will filter out the rows that contain a trip distance greater than 100.

In [9]:
df = df.filter((col("trip_distance") <= 100))

print("Rows after trimming:", df.count())

Rows after trimming: 96744445


In [70]:
df.describe().show()

+-------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------------+------------------+--------------------+-------------------+-------------------+
|summary|          VendorID|   passenger_count|    trip_distance|        RatecodeID|store_and_fwd_flag|      PULocationID|      DOLocationID|      payment_type|       fare_amount|             extra|           mta_tax|        tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|        Airport_fee| cbd_congestion_fee|
+-------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------------+-------

We can still see very high total amount values. Next, we will filter for that (i.e., amount greater than 500).  

In [73]:
high_cost_df = df.filter(col("total_amount") > 500)
print("High Cost Rows:", high_cost_df.count())

High Cost Rows: 1761


In [74]:
total_rows = df.count()
high_cost_df = high_cost_df.count()

print("High Cost Rows %:", (high_cost_df / total_rows) * 100)

High Cost Rows %: 0.0018202595508196879


In [10]:
df = df.filter((col("total_amount") <= 500))

print("Rows after trimming:", df.count())

Rows after trimming: 96742684


In [76]:
df.describe().show()

+-------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+--------------------+------------------+------------------+---------------------+------------------+--------------------+-------------------+-------------------+
|summary|          VendorID|   passenger_count|    trip_distance|        RatecodeID|store_and_fwd_flag|      PULocationID|      DOLocationID|      payment_type|      fare_amount|             extra|             mta_tax|        tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|        Airport_fee| cbd_congestion_fee|
+-------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+--------------------+------------------+------------------+---------------------+----

### Check for zero distance and zero fare rows.

In [81]:
zero_distance_df = df.filter(col("trip_distance") == 0)
print("Zero Distance Rows:", zero_distance_df.count())

Zero Distance Rows: 1179794


In [82]:
zero_fare_df = df.filter(col("fare_amount") == 0)
print("Zero Fare Rows:", zero_fare_df.count())

Zero Fare Rows: 30019


In [86]:
zero_distance_df.select("fare_amount", "total_amount", "tip_amount", "payment_type") \
  .describe() \
  .show()

+-------+------------------+-----------------+-----------------+------------------+
|summary|       fare_amount|     total_amount|       tip_amount|      payment_type|
+-------+------------------+-----------------+-----------------+------------------+
|  count|           1179794|          1179794|          1179794|           1179794|
|   mean|30.403765801486582|37.93141550134817|3.454934920842125|  1.67966950162486|
| stddev| 39.06620980276862|42.90768346177385|7.665664297335325|0.9476936000172407|
|    min|               0.0|              0.0|              0.0|                 1|
|    max|             500.0|            500.0|            494.0|                 5|
+-------+------------------+-----------------+-----------------+------------------+



In [88]:
zero_fare_df.select("trip_distance", "fare_amount", "total_amount", "tip_amount", "payment_type") \
  .describe() \
  .show()

+-------+------------------+-----------+-----------------+------------------+------------------+
|summary|     trip_distance|fare_amount|     total_amount|        tip_amount|      payment_type|
+-------+------------------+-----------+-----------------+------------------+------------------+
|  count|             30019|      30019|            30019|             30019|             30019|
|   mean|1.9949012292214916|        0.0|2.604796295679404|0.7418185149405377|2.5519171191578667|
| stddev|  5.50628341973777|        0.0|9.404846416926187| 8.269101898021075|1.0091499106679318|
|    min|               0.0|        0.0|              0.0|               0.0|                 1|
|    max|              93.5|        0.0|           286.43|             270.0|                 5|
+-------+------------------+-----------+-----------------+------------------+------------------+



Because trip_distance and fare_amount are important features, and 0-distance or 0-fare-amount rows can confuse our analysis. We will filter them out.

In [11]:
df = df.filter((col("trip_distance") > 0) & (col("fare_amount") > 0))
df.count()

95547339

In [90]:
df.describe().show()

+-------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+---------------------+------------------+--------------------+-------------------+-------------------+
|summary|          VendorID|   passenger_count|     trip_distance|       RatecodeID|store_and_fwd_flag|      PULocationID|      DOLocationID|      payment_type|       fare_amount|             extra|            mta_tax|        tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|        Airport_fee| cbd_congestion_fee|
+-------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+---------------------+----

In [4]:
logger.info("Saving cleaned dataset to S3")
try:
    df.write \
      .mode("overwrite") \
      .parquet("s3a://nyc-taxi/Clean_DF")
    logger.info("✓ Cleaned dataset saved successfully to s3a://nyc-taxi/Clean_DF")
except Exception as e:
    logger.error(f"Failed to save cleaned dataset: {str(e)}")
    raise

### The dataset is clean and ready to be used for modelling.