In [1]:
# Bootstrap step: findspark is initialised before any pyspark import so that
# the pyspark package can be located from the local Spark installation.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Create (or reuse, if one is already running) a Spark session in local mode.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/14 21:04:32 WARN Utils: Your hostname, kirans-mac.local, resolves to a loopback address: 127.0.0.1; using 172.18.197.149 instead (on interface en0)
25/06/14 21:04:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/14 21:04:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/14 21:04:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Create DataFrame

In [3]:
# Employee names keyed by empid.
df1 = spark.createDataFrame(
    data=[(1, 'Robert'), (2, 'Ria'), (3, 'James')],
    schema='empid int,empname string',
)

# Countries keyed by empid; only empid 2 also appears in df1.
df2 = spark.createDataFrame(
    data=[(2, 'USA'), (4, 'India')],
    schema='empid int,country string',
)

df1.show()
df2.show()

                                                                                

+-----+-------+
|empid|empname|
+-----+-------+
|    1| Robert|
|    2|    Ria|
|    3|  James|
+-----+-------+

+-----+-------+
|empid|country|
+-----+-------+
|    2|    USA|
|    4|  India|
+-----+-------+



# Joins: inner, left, right, and full outer

In [4]:
# Inner join (the default join type): keeps only empids present in both frames.
# Because the condition is a Column expression, both empid columns are retained.
df1.join(df2, on=df1.empid == df2.empid, how='inner').show()

+-----+-------+-----+-------+
|empid|empname|empid|country|
+-----+-------+-----+-------+
|    2|    Ria|    2|    USA|
+-----+-------+-----+-------+



In [5]:
# Left join: every df1 row is kept; df2 columns are NULL where no empid matches.
df1.join(df2, on=df1.empid == df2.empid, how='left').show()

+-----+-------+-----+-------+
|empid|empname|empid|country|
+-----+-------+-----+-------+
|    1| Robert| NULL|   NULL|
|    2|    Ria|    2|    USA|
|    3|  James| NULL|   NULL|
+-----+-------+-----+-------+



In [6]:
# Right join: every df2 row is kept; df1 columns are NULL where no empid matches.
df1.join(df2, on=df1.empid == df2.empid, how='right').show()

+-----+-------+-----+-------+
|empid|empname|empid|country|
+-----+-------+-----+-------+
|    2|    Ria|    2|    USA|
| NULL|   NULL|    4|  India|
+-----+-------+-----+-------+



In [7]:
# Full outer join: rows from both frames are kept, with NULLs on whichever
# side has no matching empid.
df1.join(df2, on=df1.empid == df2.empid, how='full').show()

+-----+-------+-----+-------+
|empid|empname|empid|country|
+-----+-------+-----+-------+
|    1| Robert| NULL|   NULL|
|    2|    Ria|    2|    USA|
|    3|  James| NULL|   NULL|
| NULL|   NULL|    4|  India|
+-----+-------+-----+-------+



# Cross join

In [9]:
# Cartesian product: each df1 row is paired with each df2 row, with no join
# condition (3 rows x 2 rows = 6 rows here).
df1.crossJoin(df2).show()



+-----+-------+-----+-------+
|empid|empname|empid|country|
+-----+-------+-----+-------+
|    1| Robert|    2|    USA|
|    1| Robert|    4|  India|
|    2|    Ria|    2|    USA|
|    2|    Ria|    4|  India|
|    3|  James|    2|    USA|
|    3|  James|    4|  India|
+-----+-------+-----+-------+



                                                                                

# Self join

In [13]:
from pyspark.sql.functions import col

# Employee records as (emp_id, emp_name, manager_id); manager_id refers to
# another row's emp_id, and None marks an employee without a manager.
columns = ["emp_id", "emp_name", "manager_id"]
data = [(1, "John", 3), (2, "Alice", 3), (3, "Bob", None)]

df = spark.createDataFrame(data=data, schema=columns)
df.show()

+------+--------+----------+
|emp_id|emp_name|manager_id|
+------+--------+----------+
|     1|    John|         3|
|     2|   Alice|         3|
|     3|     Bob|      NULL|
+------+--------+----------+



In [26]:
# Self join: look up each employee's manager in the same employee table.
# The two aliased views get their own names rather than rebinding df1/df2,
# so the frames created by earlier cells are not silently replaced by an
# unrelated dataset (which would break those cells if they were re-run).
emp_df = df.alias("emp")
mgr_df = df.alias("mgr")

# Left join so that employees without a manager (NULL manager_id) are kept;
# their Manager column comes back as NULL.
(
    emp_df
    .join(mgr_df, col("emp.manager_id") == col("mgr.emp_id"), "left")
    .select(
        col("emp.emp_id").alias("emp_id"),
        col("emp.emp_name").alias("Employee"),
        col("mgr.emp_name").alias("Manager"),
        col("emp.manager_id"),
    )
    .show()
)

+------+--------+-------+----------+
|emp_id|Employee|Manager|manager_id|
+------+--------+-------+----------+
|     1|    John|    Bob|         3|
|     2|   Alice|    Bob|         3|
|     3|     Bob|   NULL|      NULL|
+------+--------+-------+----------+



# Multi-column joins

In [28]:
# Employees identified by the composite key (empid, deptid).
df1 = spark.createDataFrame(
    data=[(1, 101, 'Robert'), (2, 102, 'Ria'), (3, 103, 'James')],
    schema='empid int,deptid int,empname string',
)

# Countries for the same composite key; only (2, 102) also appears in df1.
df2 = spark.createDataFrame(
    data=[(2, 102, 'USA'), (4, 104, 'India')],
    schema='empid int,deptid int,country string',
)

df1.show()
df2.show()

+-----+------+-------+
|empid|deptid|empname|
+-----+------+-------+
|    1|   101| Robert|
|    2|   102|    Ria|
|    3|   103|  James|
+-----+------+-------+

+-----+------+-------+
|empid|deptid|country|
+-----+------+-------+
|    2|   102|    USA|
|    4|   104|  India|
+-----+------+-------+



In [32]:
# Inner join on a composite key: both empid AND deptid must match.
# Each comparison is parenthesised because `&` binds tighter than `==`.
df1.join(
    df2,
    on=(df1.empid == df2.empid) & (df1.deptid == df2.deptid),
    how='inner',
).show()

+-----+------+-------+-----+------+-------+
|empid|deptid|empname|empid|deptid|country|
+-----+------+-------+-----+------+-------+
|    2|   102|    Ria|    2|   102|    USA|
+-----+------+-------+-----+------+-------+



# Multi-DataFrame join

In [35]:
# Three frames that share the empid key: names, countries, and join dates.
df1 = spark.createDataFrame(
    data=[(1, 'Robert'), (2, 'Ria'), (3, 'James')],
    schema='empid int,empname string',
)
df2 = spark.createDataFrame(
    data=[(2, 'USA'), (4, 'India')],
    schema='empid int,country string',
)
# joindate is kept as a plain string, as declared in the schema.
df3 = spark.createDataFrame(
    data=[(1, '01-jan-2021'), (2, '01-feb-2021'), (3, '01-mar-2021')],
    schema='empid int,joindate string',
)

df1.show()
df2.show()
df3.show()

+-----+-------+
|empid|empname|
+-----+-------+
|    1| Robert|
|    2|    Ria|
|    3|  James|
+-----+-------+

+-----+-------+
|empid|country|
+-----+-------+
|    2|    USA|
|    4|  India|
+-----+-------+

+-----+-----------+
|empid|   joindate|
+-----+-----------+
|    1|01-jan-2021|
|    2|01-feb-2021|
|    3|01-mar-2021|
+-----+-----------+



In [36]:
# Chain two inner joins: df1 with df2 on empid, then that result with df3 on
# empid. Only empid 2 is present in all three frames, so one row remains.
(
    df1
    .join(df2, on=df1.empid == df2.empid)
    .join(df3, on=df1.empid == df3.empid)
    .show()
)

+-----+-------+-----+-------+-----+-----------+
|empid|empname|empid|country|empid|   joindate|
+-----+-------+-----+-------+-----+-----------+
|    2|    Ria|    2|    USA|    2|01-feb-2021|
+-----+-------+-----+-------+-----+-----------+



In [37]:
# Stop the Spark session now that all join examples have run.
spark.stop()