In [1]:

from pyspark.sql import SparkSession


# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("RideSharingAnalytics")
    .getOrCreate()
)

# Field order of each trip tuple below; also used as the DataFrame column names.
columns = [
    "trip_id", "rider_name", "city", "driver_name", "vehicle_type",
    "distance_km", "trip_fare", "trip_duration_minutes",
    "payment_mode", "trip_status",
]

# Sample ride-sharing trips, one tuple per trip.
data = [
    ("T001", "Amit", "Hyderabad", "Ramesh", "Sedan", 12.5, 320, 28, "UPI", "Completed"),
    ("T002", "Neha", "Bangalore", "Suresh", "Mini", 8.2, 210, 22, "Card", "Completed"),
    ("T003", "Rahul", "Delhi", "Anil", "Bike", 5.1, 120, 15, "Cash", "Completed"),
    ("T004", "Pooja", "Mumbai", "Vikas", "SUV", 18.0, 560, 45, "UPI", "Cancelled"),
    ("T005", "Arjun", "Chennai", "Kumar", "Mini", 7.8, 200, 20, "UPI", "Completed"),
]

# Column types are inferred by Spark from the Python values in `data`.
df = spark.createDataFrame(data, schema=columns)

# Preview the rows, then print the inferred schema.
df.show()
df.printSchema()


+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|trip_id|rider_name|     city|driver_name|vehicle_type|distance_km|trip_fare|trip_duration_minutes|payment_mode|trip_status|
+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|   T001|      Amit|Hyderabad|     Ramesh|       Sedan|       12.5|      320|                   28|         UPI|  Completed|
|   T002|      Neha|Bangalore|     Suresh|        Mini|        8.2|      210|                   22|        Card|  Completed|
|   T003|     Rahul|    Delhi|       Anil|        Bike|        5.1|      120|                   15|        Cash|  Completed|
|   T004|     Pooja|   Mumbai|      Vikas|         SUV|       18.0|      560|                   45|         UPI|  Cancelled|
|   T005|     Arjun|  Chennai|      Kumar|        Mini|        7.8|      200|                   20|         UPI|  Completed|


In [2]:
# Export all trips as CSV with a header row.
# mode("overwrite") replaces any previous export so the cell can be re-run;
# the default save mode raises an error when "trips_csv/" already exists.
df.write.mode("overwrite").option("header", True).csv("trips_csv/")

In [3]:

# Read the exported trips back from CSV.
# Without a schema the CSV reader returns every column as a string, so
# inferSchema is enabled to get numeric types for trip_fare, distance_km
# and trip_duration_minutes; comparisons and sorts then behave numerically.
csv_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("trips_csv/")
)

# Keep only completed trips whose fare is above 400.
filtered_df = csv_df.filter((csv_df.trip_fare > 400) & (csv_df.trip_status == "Completed"))
filtered_df.show()


+-------+----------+----+-----------+------------+-----------+---------+---------------------+------------+-----------+
|trip_id|rider_name|city|driver_name|vehicle_type|distance_km|trip_fare|trip_duration_minutes|payment_mode|trip_status|
+-------+----------+----+-----------+------------+-----------+---------+---------------------+------------+-----------+
+-------+----------+----+-----------+------------+-----------+---------+---------------------+------------+-----------+



In [5]:
# Show trips from the CSV data ordered by fare, highest first.
# If trip_fare was loaded as a string (the CSV reader's default when no schema
# is supplied or inferred), a plain sort is lexicographic and would place "90"
# above "560"; casting the sort key to double makes the ordering numeric.
# The cast only affects ordering, not the displayed column.
csv_df.select("trip_id","city","vehicle_type","trip_fare") \
      .orderBy(csv_df.trip_fare.cast("double").desc()).show()

+-------+---------+------------+---------+
|trip_id|     city|vehicle_type|trip_fare|
+-------+---------+------------+---------+
|   T004|   Mumbai|         SUV|      560|
|   T001|Hyderabad|       Sedan|      320|
|   T002|Bangalore|        Mini|      210|
|   T005|  Chennai|        Mini|      200|
|   T003|    Delhi|        Bike|      120|
+-------+---------+------------+---------+



In [6]:

# Export only the Bike trips as a pipe-delimited CSV with a header row.
bike_df = df.filter(df.vehicle_type == "Bike")

# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "bike_trips_csv/" already exists.
(
    bike_df.write
    .mode("overwrite")
    .option("header", True)
    .option("delimiter", "|")
    .csv("bike_trips_csv/")
)


In [7]:

# Export the trips that took place in Mumbai as JSON.
mumbai_df = df.filter(df.city == "Mumbai")

# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "mumbai_trips_json/" already exists.
mumbai_df.write.mode("overwrite").json("mumbai_trips_json/")


In [8]:

# Load the Mumbai trips back from JSON and derive the fare charged per kilometre.
mumbai_json_df = spark.read.json("mumbai_trips_json/")

# json_df (with the extra fare_per_km column) is what the following cells use.
json_df = mumbai_json_df.withColumn(
    "fare_per_km",
    mumbai_json_df.trip_fare / mumbai_json_df.distance_km,
)

# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "mumbai_trips_with_fare_json/" already exists.
json_df.write.mode("overwrite").json("mumbai_trips_with_fare_json/")


In [9]:
# Rows of json_df that were paid by card and served by an SUV.
json_df.where(
    (json_df["vehicle_type"] == "SUV") & (json_df["payment_mode"] == "Card")
).show()

+----+-----------+-----------+------------+----------+---------------------+---------+-------+-----------+------------+-----------+
|city|distance_km|driver_name|payment_mode|rider_name|trip_duration_minutes|trip_fare|trip_id|trip_status|vehicle_type|fare_per_km|
+----+-----------+-----------+------------+----------+---------------------+---------+-------+-----------+------------+-----------+
+----+-----------+-----------+------------+----------+---------------------+---------+-------+-----------+------------+-----------+



In [10]:
# Collapse json_df to a single partition before writing it as JSON.
# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "single_partition_json/" already exists.
json_df.coalesce(1).write.mode("overwrite").json("single_partition_json/")

In [11]:
# Persist all trips in Parquet format.
# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "trips_parquet/" already exists.
df.write.mode("overwrite").parquet("trips_parquet/")

In [12]:

# Load the trips back from the Parquet export.
parquet_df = spark.read.format("parquet").load("trips_parquet/")


In [14]:
# Take the (up to) ten longest trips by distance and persist them as Parquet.
p10_df = parquet_df.orderBy(parquet_df.distance_km.desc()).limit(10)

# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "top10_trips_parquet/" already exists.
p10_df.write.mode("overwrite").parquet("top10_trips_parquet/")

In [15]:

# Convert the CSV-sourced and JSON-sourced DataFrames to Parquet.
# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when either output directory already exists.
csv_df.write.mode("overwrite").parquet("csv_to_parquet/")
json_df.write.mode("overwrite").parquet("json_to_parquet/")


In [16]:
# Convert the Parquet-sourced trips to a comma-delimited CSV with a header row.
# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "parquet_to_csv/" already exists.
(
    parquet_df.write
    .mode("overwrite")
    .option("header", True)
    .option("delimiter", ",")
    .csv("parquet_to_csv/")
)

In [17]:
# Total fare per city, largest total first.
(
    df.groupBy("city")
    .agg({"trip_fare": "sum"})
    .sort("sum(trip_fare)", ascending=False)
    .show()
)

+---------+--------------+
|     city|sum(trip_fare)|
+---------+--------------+
|   Mumbai|           560|
|Hyderabad|           320|
|Bangalore|           210|
|  Chennai|           200|
|    Delhi|           120|
+---------+--------------+



In [18]:
# Average fare per vehicle type, most expensive type first.
(
    df.groupBy("vehicle_type")
    .agg({"trip_fare": "avg"})
    .sort("avg(trip_fare)", ascending=False)
    .show()
)

+------------+--------------+
|vehicle_type|avg(trip_fare)|
+------------+--------------+
|         SUV|         560.0|
|       Sedan|         320.0|
|        Mini|         205.0|
|        Bike|         120.0|
+------------+--------------+



In [19]:

# Number of completed trips per driver, busiest driver first.
(
    df.where(df["trip_status"] == "Completed")
    .groupBy("driver_name")
    .count()
    .sort("count", ascending=False)
    .show()
)


+-----------+-----+
|driver_name|count|
+-----------+-----+
|     Ramesh|    1|
|     Suresh|    1|
|      Kumar|    1|
|       Anil|    1|
+-----------+-----+



In [20]:
# Redistribute the trips into 4 partitions before writing them as Parquet.
# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "repartitioned_parquet/" already exists.
df.repartition(4).write.mode("overwrite").parquet("repartitioned_parquet/")

In [22]:
from pyspark.sql import functions as F

# Per-city summary: number of trips, total fare collected, and mean trip
# duration in minutes.
summary_df = df.groupBy("city").agg(
    F.count("*").alias("total_trips"),
    F.sum("trip_fare").alias("total_revenue"),
    F.avg("trip_duration_minutes").alias("average_trip_duration")
)

# mode("overwrite") lets the cell be re-run; the default save mode raises an
# error when "city_summary_parquet/" already exists.
summary_df.write.mode("overwrite").parquet("city_summary_parquet/")