In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

In [None]:
data = """id,name,city,age,salary
1,Arjun,Hyderabad,25,45000
2,Meera,Chennai,32,52000
3,Rajesh,Bangalore,29,61000
4,Priya,Delhi,22,38000
5,Sanjay,Mumbai,35,72000
6,Kavya,Hyderabad,28,48000
7,Imran,Delhi,31,53000
8,Divya,Chennai,27,45000
9,Anil,Bangalore,40,85000
10,Ritu,Mumbai,23,39000
"""

with open("employees.csv","w") as f:
  f.write(data)

In [None]:
rdd = spark.sparkContext.textFile("employees.csv")
rdd.take(5)


['id,name,city,age,salary',
 '1,Arjun,Hyderabad,25,45000',
 '2,Meera,Chennai,32,52000',
 '3,Rajesh,Bangalore,29,61000',
 '4,Priya,Delhi,22,38000']

In [None]:
header = rdd.first()
data_rdd = rdd.filter(lambda row: row != header)
data_rdd.collect()

['1,Arjun,Hyderabad,25,45000',
 '2,Meera,Chennai,32,52000',
 '3,Rajesh,Bangalore,29,61000',
 '4,Priya,Delhi,22,38000',
 '5,Sanjay,Mumbai,35,72000',
 '6,Kavya,Hyderabad,28,48000',
 '7,Imran,Delhi,31,53000',
 '8,Divya,Chennai,27,45000',
 '9,Anil,Bangalore,40,85000',
 '10,Ritu,Mumbai,23,39000']

In [None]:
split_rdd = data_rdd.map(lambda row: row.split(","))
split_rdd.take(3)

[['1', 'Arjun', 'Hyderabad', '25', '45000'],
 ['2', 'Meera', 'Chennai', '32', '52000'],
 ['3', 'Rajesh', 'Bangalore', '29', '61000']]

In [None]:
city_salary_rdd = split_rdd.map(lambda x: (x[2], int(x[4])))
city_salary_rdd.collect()

[('Hyderabad', 45000),
 ('Chennai', 52000),
 ('Bangalore', 61000),
 ('Delhi', 38000),
 ('Mumbai', 72000),
 ('Hyderabad', 48000),
 ('Delhi', 53000),
 ('Chennai', 45000),
 ('Bangalore', 85000),
 ('Mumbai', 39000)]

In [None]:
total_salary_per_city = city_salary_rdd.reduceByKey(lambda a,b: a+b)
total_salary_per_city.collect()

[('Hyderabad', 93000),
 ('Delhi', 91000),
 ('Mumbai', 111000),
 ('Chennai', 97000),
 ('Bangalore', 146000)]

In [None]:
highest_city = total_salary_per_city.reduce(
    lambda a,b: a if a[1] > b[1] else b
)
highest_city

('Bangalore', 146000)

PYSPARK ASSIGNMENT


In [11]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

In [12]:
data = """call_id,caller,receiver,city,call_type,duration_seconds,cost
C001,Amit,Rahul,Hyderabad,Local,180,2.5
C002,Neha,Arjun,Bangalore,STD,320,6.0
C003,Rahul,Pooja,Delhi,Local,60,1.0
C004,Pooja,Neha,Mumbai,ISD,900,25.0
C005,Arjun,Amit,Chennai,STD,400,7.5
C006,Sneha,Karan,Hyderabad,Local,240,3.0
C007,Karan,Sneha,Delhi,Local,120,2.0
C008,Riya,Vikas,Bangalore,STD,360,6.5
C009,Vikas,Riya,Mumbai,ISD,1100,30.0
C010,Anjali,Sanjay,Chennai,Local,90,1.5
C011,Farhan,Ayesha,Delhi,STD,420,7.0
C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0
C013,Suresh,Divya,Bangalore,Local,150,2.0
C014,Divya,Suresh,Mumbai,STD,380,6.8
C015,Nikhil,Priya,Delhi,Local,200,2.8
C016,Priya,Nikhil,Chennai,STD,410,7.2
C017,Rohit,Kavya,Hyderabad,Local,170,2.3
C018,Kavya,Rohit,Bangalore,Local,140,2.1
C019,Manish,Tina,Mumbai,ISD,1000,27.0
C020,Tina,Manish,Delhi,STD,350,6.2
"""

with open("call_records.csv","w") as f:
  f.write(data)

In [14]:
record_rdd_raw = spark.sparkContext.textFile("call_records.csv")
header_calls = record_rdd_raw.first()
record_rdd = record_rdd_raw.filter(lambda row: row != header_calls)
total_calls = record_rdd.count()
print("Total Calls:",total_calls)

Total Calls: 20


In [16]:
isd_calls = record_rdd.filter(lambda x: x.split(',')[4] =="ISD")
isd_calls.collect()

['C004,Pooja,Neha,Mumbai,ISD,900,25.0',
 'C009,Vikas,Riya,Mumbai,ISD,1100,30.0',
 'C012,Ayesha,Farhan,Hyderabad,ISD,950,28.0',
 'C019,Manish,Tina,Mumbai,ISD,1000,27.0']

In [18]:
city_duration = record_rdd.map(lambda x: (x.split(',')[3], int(x.split(',')[5]))) \
                          .reduceByKey(lambda a,b: a+b)
city_duration.collect()

[('Hyderabad', 1540),
 ('Delhi', 1150),
 ('Mumbai', 3380),
 ('Bangalore', 970),
 ('Chennai', 900)]

In [21]:
highest_cost_call = record_rdd.map(lambda x: (float(x.split(',')[6]), x)) \
                               .max(lambda x: x[0])
print("Highest Cost Call:", highest_cost_call)

Highest Cost Call: (30.0, 'C009,Vikas,Riya,Mumbai,ISD,1100,30.0')


In [23]:
std_calls = record_rdd.filter(lambda x: x.split(',')[4] == "STD") \
                       .map(lambda x: float(x.split(',')[6]))
total_std_cost = std_calls.reduce(lambda a, b: a + b)
avg_std_cost = total_std_cost / std_calls.count()
print("Average STD Call Cost:", avg_std_cost)

Average STD Call Cost: 6.742857142857143


In [25]:
calls_by_type = record_rdd.map(lambda x: (x.split(',')[4], 1)) \
                           .reduceByKey(lambda a, b: a + b)
calls_by_type.collect()

[('Local', 9), ('STD', 7), ('ISD', 4)]

In [27]:
city_cost = record_rdd.map(lambda x: (x.split(',')[3], float(x.split(',')[6]))) \
                       .reduceByKey(lambda a, b: a + b)
top_3_cities = city_cost.takeOrdered(3, key=lambda x: -x[1])
print("Top 3 Cities by Cost:", top_3_cities)

Top 3 Cities by Cost: [('Mumbai', 88.8), ('Hyderabad', 35.8), ('Delhi', 19.0)]


In [29]:
total_revenue = record_rdd.map(lambda x: float(x.split(',')[6])) \
                           .reduce(lambda a, b: a + b)
print("Total Revenue:", total_revenue)

Total Revenue: 176.39999999999998


Partitioning and Repartitioning

In [30]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

In [31]:
data = [
    ("O001","Hyderabad","Electronics",1200,"Delivered"),
    ("O002","Delhi","Clothing",800,"Delivered"),
    ("O003","Mumbai","Electronics",1500,"Cancelled"),
    ("O004","Bangalore","Grocery",400,"Delivered"),
    ("O005","Hyderabad","Grocery",300,"Delivered"),
    ("O006","Delhi","Electronics",2000,"Delivered"),
    ("O007","Mumbai","Clothing",700,"Delivered"),
    ("O008","Bangalore","Electronics",1800,"Delivered"),
    ("O009","Delhi","Grocery",350,"Cancelled"),
    ("O010","Hyderabad","Clothing",900,"Delivered")
]

columns = ["order_id","city","category","order_amount","status"]

df = spark.createDataFrame(data, columns)
df.show()
df.printSchema()

+--------+---------+-----------+------------+---------+
|order_id|     city|   category|order_amount|   status|
+--------+---------+-----------+------------+---------+
|    O001|Hyderabad|Electronics|        1200|Delivered|
|    O002|    Delhi|   Clothing|         800|Delivered|
|    O003|   Mumbai|Electronics|        1500|Cancelled|
|    O004|Bangalore|    Grocery|         400|Delivered|
|    O005|Hyderabad|    Grocery|         300|Delivered|
|    O006|    Delhi|Electronics|        2000|Delivered|
|    O007|   Mumbai|   Clothing|         700|Delivered|
|    O008|Bangalore|Electronics|        1800|Delivered|
|    O009|    Delhi|    Grocery|         350|Cancelled|
|    O010|Hyderabad|   Clothing|         900|Delivered|
+--------+---------+-----------+------------+---------+

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- order_amount: long (nullable = true)
 |-- status: string (nullable = true)



In [32]:
df.rdd.getNumPartitions()


2

In [33]:
df_repart = df.repartition(4)
df_repart.rdd.getNumPartitions()

4

In [34]:
df_coalesce = df_repart.coalesce(1)
df_coalesce.rdd.getNumPartitions()

1

**Transforamtion VS Action**

*Transformation*

Definition: A transformation is an operation on an RDD that produces another RDD.
Lazy Evaluation: Transformations are lazy, meaning they do not execute immediately. They only define a computation plan.
Examples:

map()
filter()
flatMap()
reduceByKey()
groupByKey()


Output: Always returns a new RDD.
Key Point: No actual computation happens until an action is called.


 *Action*

Definition: An action triggers the execution of the transformations and returns a result to the driver program or writes data to storage.
Immediate Execution: Actions execute immediately and materialize the computation.
Examples:

collect() → Returns all elements to the driver.
count() → Returns the number of elements.
take(n) → Returns first n elements.
reduce() → Aggregates elements.
saveAsTextFile() → Writes data to storage.


Output: Returns a value (not an RDD) or writes to external storage.




In [35]:
filtered_df = df.filter(df.city == "Delhi")
selected_df = filtered_df.select("order_id","order_amount")

In [38]:
from pyspark.sql.functions import broadcast

broadcast_join_df = filtered_df.join(
    broadcast(city_df),
    on = "city",
    how = "inner"
)

final_broadcast_df = broadcast_join_df.select(
    "order_id",
    "city",
    "city_category",
    "order_amount"

)

NameError: name 'city_df' is not defined

In [39]:
city_df = df.select("city").distinct()
city_df.show()

+---------+
|     city|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [40]:
orders_data = [
    ("O001","Hyderabad",1200),
    ("O002","Delhi",800),
    ("O003","Mumbai",1500),
    ("O004","Bangalore",400),
    ("O005","Hyderabad",300),
    ("O006","Delhi",2000),
    ("O007","Mumbai",700),
    ("O008","Bangalore",1800),
    ("O009","Delhi",350),
    ("O010","Hyderabad",900)
]

orders_cols = ["order_id","city","order_amount"]

orders_df = spark.createDataFrame(orders_data, orders_cols)
orders_df.show()

+--------+---------+------------+
|order_id|     city|order_amount|
+--------+---------+------------+
|    O001|Hyderabad|        1200|
|    O002|    Delhi|         800|
|    O003|   Mumbai|        1500|
|    O004|Bangalore|         400|
|    O005|Hyderabad|         300|
|    O006|    Delhi|        2000|
|    O007|   Mumbai|         700|
|    O008|Bangalore|        1800|
|    O009|    Delhi|         350|
|    O010|Hyderabad|         900|
+--------+---------+------------+



In [41]:
city_data = [
    ("Hyderabad","Tier-1"),
    ("Delhi","Tier-1"),
    ("Mumbai","Tier-1"),
    ("Bangalore","Tier-1")
]

city_cols = ["city","city_category"]

In [42]:
city_df = spark.createDataFrame(city_data, city_cols)
city_df.show()

+---------+-------------+
|     city|city_category|
+---------+-------------+
|Hyderabad|       Tier-1|
|    Delhi|       Tier-1|
|   Mumbai|       Tier-1|
|Bangalore|       Tier-1|
+---------+-------------+



In [45]:
from pyspark.sql.functions import col

filtered_orders = orders_df.filter(col("order_amount") > 500)

joined_df = filtered_orders.join(
    city_df,
    on="city",
    how="inner"
)

final_df = joined_df.select(
    "order_id",
    "city",
    "city_category",
    "order_amount"
)


In [46]:
final_df.explain(True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amount]
+- Project [city#26, order_id#25, order_amount#27L, city_category#39]
   +- Join Inner, (city#26 = city#38)
      :- Filter (order_amount#27L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#25, city#26, order_amount#27L], false
      +- LogicalRDD [city#38, city_category#39], false

== Analyzed Logical Plan ==
order_id: string, city: string, city_category: string, order_amount: bigint
Project [order_id#25, city#26, city_category#39, order_amount#27L]
+- Project [city#26, order_id#25, order_amount#27L, city_category#39]
   +- Join Inner, (city#26 = city#38)
      :- Filter (order_amount#27L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#25, city#26, order_amount#27L], false
      +- LogicalRDD [city#38, city_category#39], false

== Optimized Logical Plan ==
Project [order_id#25, city#26, city_category#39, order_amount#27L]
+- Join Inner, (city#26 = city#38)
   :- Filter ((isnotnull(orde

In [48]:
from pyspark.sql.functions import broadcast
broadcast_join_df = filtered_orders.join(
 broadcast(city_df),
 on = "city",
 how = "inner"
)

final_broadcast_df = broadcast_join_df.select(
    "order_id",
    "city",
    "city_category",
    "order_amount"
)

In [49]:
final_broadcast_df.explain(True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amount]
+- Project [city#26, order_id#25, order_amount#27L, city_category#39]
   +- Join Inner, (city#26 = city#38)
      :- Filter (order_amount#27L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#25, city#26, order_amount#27L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#38, city_category#39], false

== Analyzed Logical Plan ==
order_id: string, city: string, city_category: string, order_amount: bigint
Project [order_id#25, city#26, city_category#39, order_amount#27L]
+- Project [city#26, order_id#25, order_amount#27L, city_category#39]
   +- Join Inner, (city#26 = city#38)
      :- Filter (order_amount#27L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#25, city#26, order_amount#27L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#38, city_category#39], false

== Optimized Logical Plan ==
Project [order_id#25, city#26, city_cat