<a href="https://colab.research.google.com/github/kareemullah123456789/big_data_advanced/blob/main/pyspark_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime so files under
# /content/drive/MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the entries of the Drive folder and keep only those whose name
# contains "cde" (grep -i makes the match case-insensitive).
!ls /content/drive/MyDrive/cde | grep -i cde

16-Nov-CDE
20-JAN-25-CDE-BUN-031-WDM0700-ONL
CDE_AWS
cde_data
CDE_docker_DEA
CDE_interview
CDE_manjunatha
Copy of CDE Internship Project Submission Form  .gform
Copy of CDE Internship Project Submission Form   (Responses).gsheet
Copy of CDE recording status report.gsheet


In [None]:
# Confirm that the input CSV used throughout the notebook exists on Drive.
!ls /content/drive/MyDrive/cde_data/data.csv

/content/drive/MyDrive/cde_data/data.csv


# PySpark vs Pandas Tutorial

This notebook provides an easy-to-understand comparison between Pandas and PySpark.

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Create (or reuse) the Spark session that every Spark cell below relies on.
spark = (
    SparkSession.builder
    .appName("PySpark_Tutorial")
    .getOrCreate()
)

## 1. Reading CSV Files

In [None]:
# Load the employee CSV with pandas and preview its first rows.
df_pandas = pd.read_csv("/content/drive/MyDrive/cde_data/data.csv")
pandas_preview = df_pandas.head()
print("Pandas DataFrame:")
display(pandas_preview)


Pandas DataFrame:


Unnamed: 0,id,name,age,department,salary
0,1,Alice,25,HR,50000
1,2,Bob,35,IT,70000
2,3,Charlie,45,Finance,90000
3,4,David,23,IT,60000
4,5,Eva,30,HR,65000


+---+-------+---+----------+------+
| id|   name|age|department|salary|
+---+-------+---+----------+------+
|  1|  Alice| 25|        HR| 50000|
|  2|    Bob| 35|        IT| 70000|
|  3|Charlie| 45|   Finance| 90000|
|  4|  David| 23|        IT| 60000|
|  5|    Eva| 30|        HR| 65000|
+---+-------+---+----------+------+



In [None]:

# Read the same CSV with Spark. The first line is used as the header and
# schema inference is switched off, so every column is loaded as a string
# (see the printSchema() cell that follows).
df_spark = (
    spark.read
    .option("header", True)
    .option("inferSchema", False)
    .csv("/content/drive/MyDrive/cde_data/data.csv")
)
df_spark.show(5)

+---+-------+---+----------+------+
| id|   name|age|department|salary|
+---+-------+---+----------+------+
|  1|  Alice| 25|        HR| 50000|
|  2|    Bob| 35|        IT| 70000|
|  3|Charlie| 45|   Finance| 90000|
|  4|  David| 23|        IT| 60000|
|  5|    Eva| 30|        HR| 65000|
+---+-------+---+----------+------+



In [None]:
# Print the column names and types Spark assigned to the untyped read above.
df_spark.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: string (nullable = true)



In [None]:
from pyspark.sql.types import IntegerType, FloatType,StringType,StructType,StructField

# Explicit schema: declare each column's type up front instead of relying on
# the all-string default. Every column is nullable.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("department", StringType(), True)
    .add("salary", FloatType(), True)
)

# Re-read the CSV with the declared schema and confirm the resulting types.
df_spark = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/content/drive/MyDrive/cde_data/data.csv")
)
df_spark.printSchema()



root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: float (nullable = true)



## 2. Filtering Data

In [None]:
# Pandas: keep only the rows whose age is strictly greater than 30,
# selected with a boolean mask.
df_pandas_filtered = df_pandas.loc[df_pandas["age"].gt(30)]
display(df_pandas_filtered)


Unnamed: 0,id,name,age,department,salary
1,2,Bob,35,IT,70000
2,3,Charlie,45,Finance,90000


In [None]:

# Spark: build the column condition first, then pass it to filter().
age_above_30 = df_spark['age'] > 30
df_spark_filtered = df_spark.filter(age_above_30)
df_spark_filtered.show()

+---+-------+---+----------+-------+
| id|   name|age|department| salary|
+---+-------+---+----------+-------+
|  2|    Bob| 35|        IT|70000.0|
|  3|Charlie| 45|   Finance|90000.0|
+---+-------+---+----------+-------+



In [None]:
# Alternative Spark syntax: where() accepts a SQL expression string.
# The result is a Spark DataFrame, so it is named df_spark_* (the previous
# df_pandas_* name wrongly suggested a pandas object).
df_spark_filter_alter = df_spark.where("age > 30")
df_spark_filter_alter.show(2)

+---+-------+---+----------+-------+
| id|   name|age|department| salary|
+---+-------+---+----------+-------+
|  2|    Bob| 35|        IT|70000.0|
|  3|Charlie| 45|   Finance|90000.0|
+---+-------+---+----------+-------+



In [None]:
# Same filter, with the condition built from a col() column object.
(
    df_spark
    .where(col("age") > 30)
    .show(2)
)

+---+-------+---+----------+-------+
| id|   name|age|department| salary|
+---+-------+---+----------+-------+
|  2|    Bob| 35|        IT|70000.0|
|  3|Charlie| 45|   Finance|90000.0|
+---+-------+---+----------+-------+



In [None]:
# Same filter, with the column accessed as an attribute of the DataFrame.
(
    df_spark
    .where(df_spark.age > 30)
    .show(2)
)

+---+-------+---+----------+-------+
| id|   name|age|department| salary|
+---+-------+---+----------+-------+
|  2|    Bob| 35|        IT|70000.0|
|  3|Charlie| 45|   Finance|90000.0|
+---+-------+---+----------+-------+



## 3. Grouping Data

In [None]:
# Show the typed Spark DataFrame before grouping it.
df_spark.show()

+---+-------+---+----------+-------+
| id|   name|age|department| salary|
+---+-------+---+----------+-------+
|  1|  Alice| 25|        HR|50000.0|
|  2|    Bob| 35|        IT|70000.0|
|  3|Charlie| 45|   Finance|90000.0|
|  4|  David| 23|        IT|60000.0|
|  5|    Eva| 30|        HR|65000.0|
+---+-------+---+----------+-------+



In [None]:
# Pandas: mean salary per department.
df_pandas_grouped = (
    df_pandas
    .groupby("department")["salary"]
    .mean()
)
display(df_pandas_grouped)


Unnamed: 0_level_0,salary
department,Unnamed: 1_level_1
Finance,90000.0
HR,57500.0
IT,65000.0


In [None]:

# Spark: average salary per department; the result column is named "avg(salary)".
df_spark_grouped = (
    df_spark
    .groupBy("department")
    .avg("salary")
)
df_spark_grouped.show()

+----------+-----------+
|department|avg(salary)|
+----------+-----------+
|        HR|    57500.0|
|   Finance|    90000.0|
|        IT|    65000.0|
+----------+-----------+



In [None]:
# Minimum salary per department, rounded to 2 decimals and exposed as "min".
# F.round / F.min are referenced through the functions module on purpose:
# the bare names round() and min() only worked because
# `from pyspark.sql.functions import *` shadows the Python builtins.
(
    df_spark
    .groupBy("department")
    .agg(F.round(F.min("salary"), 2).alias("min"))
    .show()
)

+----------+-------+
|department|    min|
+----------+-------+
|        HR|50000.0|
|   Finance|90000.0|
|        IT|60000.0|
+----------+-------+



## 4. SQL Queries

In [None]:
# Show the DataFrame that is about to be registered as a SQL view.
df_spark.show()

+---+-------+---+----------+-------+
| id|   name|age|department| salary|
+---+-------+---+----------+-------+
|  1|  Alice| 25|        HR|50000.0|
|  2|    Bob| 35|        IT|70000.0|
|  3|Charlie| 45|   Finance|90000.0|
|  4|  David| 23|        IT|60000.0|
|  5|    Eva| 30|        HR|65000.0|
+---+-------+---+----------+-------+



In [None]:
# Register the DataFrame as the temporary view "employees" and query it
# with plain SQL.
employees_over_30_query = "SELECT name, age FROM employees WHERE age > 30"
df_spark.createOrReplaceTempView("employees")
sql_result = spark.sql(employees_over_30_query)
sql_result.show()

+-------+---+
|   name|age|
+-------+---+
|    Bob| 35|
|Charlie| 45|
+-------+---+



## 5. Adding a New Column

In [None]:
# Pandas: add a column holding the salary after a 10% raise.
# Note: this adds the column to df_pandas itself.
df_pandas["salary_increase"] = df_pandas["salary"].mul(1.10)
display(df_pandas.head())


Unnamed: 0,id,name,age,department,salary,salary_increase
0,1,Alice,25,HR,50000,55000.0
1,2,Bob,35,IT,70000,77000.0
2,3,Charlie,45,Finance,90000,99000.0
3,4,David,23,IT,60000,66000.0
4,5,Eva,30,HR,65000,71500.0


In [None]:

# Spark: withColumn() returns a new DataFrame with the extra column, so the
# result is bound back to df_spark. The product is a double, which is why
# some values print with tiny floating-point artifacts.
raised_salary = col("salary") * 1.10
df_spark = df_spark.withColumn("salary_increase", raised_salary)
df_spark.show()

+---+-------+---+----------+-------+-----------------+
| id|   name|age|department| salary|  salary_increase|
+---+-------+---+----------+-------+-----------------+
|  1|  Alice| 25|        HR|50000.0|55000.00000000001|
|  2|    Bob| 35|        IT|70000.0|          77000.0|
|  3|Charlie| 45|   Finance|90000.0|99000.00000000001|
|  4|  David| 23|        IT|60000.0|          66000.0|
|  5|    Eva| 30|        HR|65000.0|          71500.0|
+---+-------+---+----------+-------+-----------------+



## 6. Handling Missing Data

In [None]:
# Pandas: return a copy without rows containing missing values.
# The result is only displayed; df_pandas itself is not modified.
df_pandas.dropna()



Unnamed: 0,id,name,age,department,salary,salary_increase
0,1,Alice,25,HR,50000,55000.0
1,2,Bob,35,IT,70000,77000.0
2,3,Charlie,45,Finance,90000,99000.0
3,4,David,23,IT,60000,66000.0
4,5,Eva,30,HR,65000,71500.0


In [None]:

# Spark: drop rows containing nulls and display the result.
# df_spark itself is left unchanged.
df_spark.na.drop().show()

+---+-------+---+----------+-------+-----------------+
| id|   name|age|department| salary|  salary_increase|
+---+-------+---+----------+-------+-----------------+
|  1|  Alice| 25|        HR|50000.0|55000.00000000001|
|  2|    Bob| 35|        IT|70000.0|          77000.0|
|  3|Charlie| 45|   Finance|90000.0|99000.00000000001|
|  4|  David| 23|        IT|60000.0|          66000.0|
|  5|    Eva| 30|        HR|65000.0|          71500.0|
+---+-------+---+----------+-------+-----------------+



## 7. Window Functions

In [None]:
# Rank employees by salary (ascending) within each department.
# The ranked result goes into its own variable so df_spark, which later
# cells depend on, keeps its original columns.
window_spec = Window.partitionBy("department").orderBy("salary")
df_spark_ranked = df_spark.withColumn("rank", rank().over(window_spec))
df_spark_ranked.show()

## 8. Joins

In [None]:
# Left side of the self-join: the same DataFrame under the alias "df1".
df1_spark = df_spark.alias("df1")
df1_spark.show()

+---+-------+---+----------+-------+-----------------+
| id|   name|age|department| salary|  salary_increase|
+---+-------+---+----------+-------+-----------------+
|  1|  Alice| 25|        HR|50000.0|55000.00000000001|
|  2|    Bob| 35|        IT|70000.0|          77000.0|
|  3|Charlie| 45|   Finance|90000.0|99000.00000000001|
|  4|  David| 23|        IT|60000.0|          66000.0|
|  5|    Eva| 30|        HR|65000.0|          71500.0|
+---+-------+---+----------+-------+-----------------+



In [None]:

# Right side of the self-join: the same DataFrame under the alias "df2".
df2_spark = df_spark.alias("df2")
df2_spark.show()

+---+-------+---+----------+-------+-----------------+
| id|   name|age|department| salary|  salary_increase|
+---+-------+---+----------+-------+-----------------+
|  1|  Alice| 25|        HR|50000.0|55000.00000000001|
|  2|    Bob| 35|        IT|70000.0|          77000.0|
|  3|Charlie| 45|   Finance|90000.0|99000.00000000001|
|  4|  David| 23|        IT|60000.0|          66000.0|
|  5|    Eva| 30|        HR|65000.0|          71500.0|
+---+-------+---+----------+-------+-----------------+



In [None]:


# Inner self-join on id.
# The condition references the columns through the "df1"/"df2" aliases.
# df1_spark.id and df2_spark.id both come from the same underlying DataFrame,
# so comparing them directly compares a column with itself and leaves the
# join dependent on Spark's special handling of such self-join predicates.
join_condition = col("df1.id") == col("df2.id")
df_joined = df1_spark.join(df2_spark, join_condition, "inner")
df_joined.show()

+---+-------+---+----------+-------+-----------------+---+-------+---+----------+-------+-----------------+
| id|   name|age|department| salary|  salary_increase| id|   name|age|department| salary|  salary_increase|
+---+-------+---+----------+-------+-----------------+---+-------+---+----------+-------+-----------------+
|  1|  Alice| 25|        HR|50000.0|55000.00000000001|  1|  Alice| 25|        HR|50000.0|55000.00000000001|
|  2|    Bob| 35|        IT|70000.0|          77000.0|  2|    Bob| 35|        IT|70000.0|          77000.0|
|  3|Charlie| 45|   Finance|90000.0|99000.00000000001|  3|Charlie| 45|   Finance|90000.0|99000.00000000001|
|  4|  David| 23|        IT|60000.0|          66000.0|  4|  David| 23|        IT|60000.0|          66000.0|
|  5|    Eva| 30|        HR|65000.0|          71500.0|  5|    Eva| 30|        HR|65000.0|          71500.0|
+---+-------+---+----------+-------+-----------------+---+-------+---+----------+-------+-----------------+



## 9. Data Partitioning

In [None]:
# repartition(n):
# - improves parallelism and core utilization
# - ensures an even distribution of data across partitions
# - lets you control the number of output CSV files
# - triggers a full shuffle of the data

In [None]:
# Redistribute the rows of df_spark across 5 partitions.
df_spark_repartitioned = df_spark.repartition(5)

In [None]:
# coalesce(1) returns a new DataFrame reduced to a single partition.
# The result is not assigned, so this cell only displays its repr.
df_spark_repartitioned.coalesce(1)

DataFrame[id: int, name: string, age: int, department: string, salary: float, salary_increase: double]

In [None]:
# List the target directory before writing the repartitioned output into it.
!ls /content/drive/MyDrive/cde_data/ipl2025

IPL2025Batters.csv  IPL2025Bowlers.csv


In [None]:
# Write the data as 3 partitions; Spark creates a directory named
# repartitioned_data.csv containing the part files.
# mode("overwrite") makes the cell re-runnable: without it a second run
# fails with PATH_ALREADY_EXISTS because the output directory already exists.
(
    df_spark
    .repartition(3)
    .write
    .mode("overwrite")
    .csv("/content/drive/MyDrive/cde_data/ipl2025/repartitioned_data.csv")
)

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/content/drive/MyDrive/cde_data/ipl2025/repartitioned_data.csv already exists. Set mode as "overwrite" to overwrite the existing path.

## 10. Caching Data

In [None]:
# Mark df_spark for caching, then run an action (show) on it.
# cache() returns the same DataFrame, so the two calls can be chained.
df_spark.cache().show()

+---+-------+---+----------+-------+-----------------+
| id|   name|age|department| salary|  salary_increase|
+---+-------+---+----------+-------+-----------------+
|  1|  Alice| 25|        HR|50000.0|55000.00000000001|
|  2|    Bob| 35|        IT|70000.0|          77000.0|
|  3|Charlie| 45|   Finance|90000.0|99000.00000000001|
|  4|  David| 23|        IT|60000.0|          66000.0|
|  5|    Eva| 30|        HR|65000.0|          71500.0|
+---+-------+---+----------+-------+-----------------+



In [None]:
# Check whether df_spark is currently marked as cached (True after cache()).
df_spark.is_cached

True

In [None]:
# Remove df_spark from the cache. unpersist() returns the DataFrame,
# which is why its repr is displayed as the cell output.
df_spark.unpersist()

DataFrame[id: int, name: string, age: int, department: string, salary: float, salary_increase: double]

In [None]:
# Confirm the DataFrame is no longer cached (False after unpersist()).
df_spark.is_cached

False

In [None]:
from pyspark import StorageLevel
# MEMORY_ONLY: stores the data in RAM only
# MEMORY_AND_DISK: stores the data in RAM and spills to disk when memory is full

# MEMORY_ONLY_SER / MEMORY_AND_DISK_SER: same as above, but the data is stored in serialized form

#

### Persist and Unpersist in Spark
`persist()` caches an RDD/DataFrame in memory and/or on disk so it can be reused without being recomputed. `unpersist()` removes the cached data to free up memory.

**Storage Levels**

MEMORY_ONLY: Store in JVM heap memory only (the default for `RDD.cache()`; `DataFrame.cache()` defaults to a memory-and-disk level instead)

MEMORY_AND_DISK: Spill to disk if memory full

DISK_ONLY: Store only on disk

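MEMORY_ONLY_SER: Serialized objects in memory (Scala/Java API; PySpark always stores data in serialized form, so it exposes no separate `_SER` level)

MEMORY_AND_DISK_SER: Serialized objects in memory, with disk fallback (Scala/Java API; see the note on MEMORY_ONLY_SER)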

## Stopping Spark Session

In [None]:
# Shut down the Spark session created at the top of the notebook.
# Spark cells cannot be run after this until a new session is created.
spark.stop()