In [0]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
# Obtain the SparkSession used by every cell below (getOrCreate returns the
# already-active session when one exists instead of building a second one).
spark = (
    SparkSession.builder
    .appName("SparkSQL Joins and Pandas Functions")
    .getOrCreate()
)

In [0]:
# Build two small sample DataFrames that share the "name" key column.
# Alice and Bob appear in both; Charlie exists only in df1 and David only in
# df2, so each join type below produces a visibly different result.
# NOTE(review): the values 1-3 are labelled "age" and 25-35 "salary" -- the
# labels may be swapped; confirm the intended column names.
data1 = [
    ("Alice", 1),
    ("Bob", 2),
    ("Charlie", 3),
]
data2 = [
    ("Alice", 25),
    ("Bob", 30),
    ("David", 35),
]

df1 = spark.createDataFrame(data1, schema=["name", "age"])
df2 = spark.createDataFrame(data2, schema=["name", "salary"])

# Display both inputs so the join outputs can be compared against them.
df1.show()
df2.show()

+-------+---+
|   name|age|
+-------+---+
|  Alice|  1|
|    Bob|  2|
|Charlie|  3|
+-------+---+

+-----+------+
| name|salary|
+-----+------+
|Alice|    25|
|  Bob|    30|
|David|    35|
+-----+------+



In [0]:
# Perform Various Types of Joins
# Inner join on "name": only names present in both df1 and df2 are kept.
inner_join_df = df1.join(df2, on=['name'], how='inner')
inner_join_df.show()

+-----+---+------+
| name|age|salary|
+-----+---+------+
|Alice|  1|    25|
|  Bob|  2|    30|
+-----+---+------+



In [0]:
# Full outer join on "name": every name from either side is kept, and the
# columns of the side without a match are filled with NULL.
outer_join_df = df1.join(df2, on=['name'], how='outer')
outer_join_df.show()

+-------+----+------+
|   name| age|salary|
+-------+----+------+
|  Alice|   1|    25|
|    Bob|   2|    30|
|Charlie|   3|  NULL|
|  David|NULL|    35|
+-------+----+------+



In [0]:
# Left join on "name": every df1 row is kept; salary is NULL when df2 has no
# row for that name.
left_join_df = df1.join(df2, on=['name'], how='left')
left_join_df.show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|  Alice|  1|    25|
|    Bob|  2|    30|
|Charlie|  3|  NULL|
+-------+---+------+



In [0]:
# Right join on "name": every df2 row is kept; age is NULL when df1 has no
# row for that name.
right_join_df = df1.join(df2, on=['name'], how='right')
right_join_df.show()

+-----+----+------+
| name| age|salary|
+-----+----+------+
|Alice|   1|    25|
|  Bob|   2|    30|
|David|NULL|    35|
+-----+----+------+



In [0]:
# Left semi join on "name": keeps the df1 rows whose name also exists in df2
# and returns df1's columns only (no salary column).
left_semi_join_df = df1.join(df2, on=['name'], how='left_semi')
left_semi_join_df.show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
|  Bob|  2|
+-----+---+



In [0]:
# Left anti join on "name": keeps the df1 rows whose name does NOT exist in
# df2 and returns df1's columns only.
left_anti_join_df = df1.join(df2, on=['name'], how='left_anti')
left_anti_join_df.show()

+-------+---+
|   name|age|
+-------+---+
|Charlie|  3|
+-------+---+



In [0]:
# Convert PySpark DataFrame to Pandas DataFrame and Apply Function
# The inner-join result (two rows here) is turned into a local pandas
# DataFrame so that ordinary Python functions can be applied to its columns.
# NOTE(review): per the PySpark docs, toPandas() loads the whole result into
# driver memory -- fine for this tiny frame, reconsider for large data.
pandas_df = inner_join_df.toPandas()

# Apply a function to a Pandas DataFrame column
def age_group(age):
    """Classify an age into a coarse label.

    Args:
        age: Numeric age; it is compared against the thresholds 20 and 40.

    Returns:
        "Young" when ``age < 20``, "Middle-aged" when ``20 <= age < 40``,
        and "Elderly" otherwise. A value for which both comparisons are
        false (for example NaN) therefore also yields "Elderly".
    """
    # Guard clauses, checked from the lowest band upwards.
    if age < 20:
        return "Young"
    if 20 <= age < 40:
        return "Middle-aged"
    return "Elderly"

# Label every row by running age_group over the "age" column element by
# element, store the result as a new "age_group" column, then print the frame.
pandas_df['age_group'] = pandas_df['age'].map(age_group)
print(pandas_df)

    name  age  salary age_group
0  Alice    1      25     Young
1    Bob    2      30     Young
