<a href="https://colab.research.google.com/github/kareemullah123456789/big_data_advanced/blob/main/pyspark_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark vs Pandas Tutorial

This notebook provides an easy-to-understand comparison between Pandas and PySpark.

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Create the Spark session used by every cell below, or reuse one that is
# already running under the same builder settings.
spark = (
    SparkSession.builder
    .appName("PySpark_Tutorial")
    .getOrCreate()
)

## 1. Reading CSV Files

In [None]:
# --- Pandas: load the whole file into memory and preview the first rows ---
df_pandas = pd.read_csv("data.csv")
print("Pandas DataFrame:")
display(df_pandas.head(5))

# --- PySpark: same file, first line used as header, column types inferred ---
df_spark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data.csv")
)
df_spark.show(n=5)

## 2. Filtering Data

In [None]:
# --- Pandas: keep the rows whose age is greater than 30 (boolean mask) ---
df_pandas_filtered = df_pandas.loc[df_pandas["age"] > 30]
display(df_pandas_filtered)

# --- PySpark: same predicate; where() is an alias of filter() ---
df_spark_filtered = df_spark.where(df_spark["age"] > 30)
df_spark_filtered.show()

## 3. Grouping Data

In [None]:
# --- Pandas: mean salary per department ---
df_pandas_grouped = df_pandas.groupby("department")["salary"].agg("mean")
display(df_pandas_grouped)

# --- PySpark: average salary per department (result column: avg(salary)) ---
df_spark_grouped = df_spark.groupBy("department").avg("salary")
df_spark_grouped.show()

## 4. SQL Queries

In [None]:
# Register the DataFrame as a temporary view so it can be queried by name
# from Spark SQL (an existing view called "employees" is replaced).
df_spark.createOrReplaceTempView("employees")

# Select name and age for the rows where age is greater than 30.
sql_result = spark.sql(
    "SELECT name, age FROM employees WHERE age > 30"
)
sql_result.show()

## 5. Adding a New Column

In [None]:
# --- Pandas: add a column holding salary multiplied by 1.10 ---
df_pandas["salary_increase"] = df_pandas["salary"].mul(1.10)
display(df_pandas.head(5))

# --- PySpark: withColumn returns a new DataFrame, so rebind df_spark ---
df_spark = df_spark.withColumn(
    "salary_increase",
    col("salary") * 1.10,
)
df_spark.show()

## 6. Handling Missing Data

In [None]:
# --- Pandas: dropna() returns a NEW DataFrame without the rows that contain
# missing values; df_pandas itself is left unchanged. The result has to be
# displayed (or assigned) explicitly: a bare expression that is not the last
# line of a cell is silently discarded.
display(df_pandas.dropna())

# --- PySpark: same idea; show the rows that have no null values ---
df_spark.dropna().show()

## 7. Window Functions

In [None]:
# Window definition: one partition per department, rows ordered by salary
# (ascending) inside each partition.
windowSpec = (
    Window
    .partitionBy("department")
    .orderBy("salary")
)

# Attach each row's rank within its department window as a "rank" column.
df_spark = df_spark.withColumn(
    "rank",
    rank().over(windowSpec),
)
df_spark.show()

## 8. Joins

In [None]:
# Self-join demo: give the two sides of the join distinct aliases so their
# columns can be told apart.
# NOTE(review): assumes data.csv has an `id` column — confirm against the data.
df1_spark = df_spark.alias("df1")
df2_spark = df_spark.alias("df2")

# Reference the join keys through the aliases ("df1.id" / "df2.id").
# `df1_spark.id == df2_spark.id` would point at the same underlying column on
# both sides, which Spark treats as a trivially-true or ambiguous condition in
# a self-join.
df_joined = df1_spark.join(
    df2_spark,
    col("df1.id") == col("df2.id"),
    "inner",
)
df_joined.show()

## 9. Data Partitioning

In [None]:
# Redistribute the rows of df_spark across 5 partitions. repartition()
# returns a new DataFrame; df_spark itself is not modified.
df_spark_repartitioned = df_spark.repartition(5)

# Show the resulting partition count so the effect of the call is visible.
df_spark_repartitioned.rdd.getNumPartitions()

## 10. Caching Data

In [None]:
# Mark df_spark for caching. As the last expression of the cell, the value
# returned by cache() is what the notebook displays.
# NOTE(review): Spark documents cache() as lazy — nothing is stored until an
# action (e.g. show/count) runs on the DataFrame; confirm whether an action
# was intended here, since the next cell stops the session.
df_spark.cache()

## Stopping Spark Session

In [None]:
# Shut down the Spark session created in the first code cell. Keep this as
# the final cell: the earlier cells all use `spark` or DataFrames built from it.
spark.stop()