In [1]:
import os
from pathlib import Path

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the notebook's SparkSession: app name "FareTipModeling", master
# "local[*]" (local mode). getOrCreate() hands back an already-running
# session when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("FareTipModeling")
    .master("local[*]")
    .getOrCreate()
)


In [6]:
# Folder that holds the cleaned parquet part files.
# The default is the original local location; set the NYC_TAXI_CLEANED_DIR
# environment variable to point the notebook at another copy of the data
# without editing this cell. An unset or empty variable falls back to the default.
DEFAULT_CLEANED_DIR = r"D:\L4S2\Big Data\assignment\NYC_Taxi_Trip_Data_Analysis\data\cleaned"
folder_path = Path(os.environ.get("NYC_TAXI_CLEANED_DIR") or DEFAULT_CLEANED_DIR)

# Sort the matches so the part files are always listed (and later processed)
# in the same order.
parquet_files = sorted(folder_path.glob("*.parquet"))
print(f"Found {len(parquet_files)} parquet files:")
for f in parquet_files:
    print(f.name)


Found 8 parquet files:
part-00000-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
part-00001-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
part-00002-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
part-00003-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
part-00004-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
part-00005-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
part-00006-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
part-00007-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet


In [7]:
def filter_valid_fares_tips(df):
    """Keep only the rows with a positive fare and a non-negative tip.

    Prints the row count before and after filtering; each number comes from
    calling ``count()`` on the corresponding DataFrame.

    Args:
        df: DataFrame exposing ``fare_amount`` and ``tip_amount`` columns.

    Returns:
        The filtered DataFrame.
    """
    has_positive_fare = df.fare_amount > 0
    has_non_negative_tip = df.tip_amount >= 0
    valid_rows = df.filter(has_positive_fare & has_non_negative_tip)

    # Count the input first, then the filtered result, in the order reported.
    rows_before = df.count()
    rows_after = valid_rows.count()
    print(f"Filtered from {rows_before} rows to {rows_after} rows with valid fare and tip amounts")
    return valid_rows


In [8]:
# Handle the part files one at a time: announce the file, load it, then keep
# only the rows that pass the fare/tip validity filter.
for file_path in parquet_files:
    print(f"\nProcessing file: {file_path.name}")

    # Each part is read on its own, so the counts printed by the filter are per file.
    df = spark.read.parquet(str(file_path))
    filtered_df = filter_valid_fares_tips(df)

    # Further per-file steps (e.g. modeling) can be added here.



Processing file: part-00000-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
Filtered from 3918134 rows to 3918134 rows with valid fare and tip amounts

Processing file: part-00001-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
Filtered from 3933455 rows to 3933455 rows with valid fare and tip amounts

Processing file: part-00002-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
Filtered from 3928190 rows to 3928190 rows with valid fare and tip amounts

Processing file: part-00003-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
Filtered from 3936382 rows to 3936382 rows with valid fare and tip amounts

Processing file: part-00004-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
Filtered from 3930808 rows to 3930808 rows with valid fare and tip amounts

Processing file: part-00005-c267671b-1832-48c2-97b7-10a9827a836f-c000.snappy.parquet
Filtered from 5205187 rows to 5205187 rows with valid fare and tip amounts

Processing file: part-00006-c2676