<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_cheat_sheet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create the session, or reuse the one that already exists, on a local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark_App1")
    .getOrCreate()
)

# Shortcut to the session's SparkContext.
sc = spark.sparkContext

## Join

In [59]:
# default inner
# cross
# inner 
# left(leftouter, left_outer) 
# right(rightouter, right_outer)
# outer(full, fullouter, full_outer)
# semi(leftsemi, left_semi)
# anti(leftanti, left_anti)

# Note: The names inside () are just aliases for the same join type in Spark. Spark makes everyone happy!!
# If both DataFrames contain a column with the same name and you have not aliased it, you will get an error when you reference that column on the joined DataFrame.
# When two columns share a name, accessing one of the duplicates raises an ambiguity error, which basically means Spark doesn't know which column you meant.

# def join(other, on=None, how=None)
# on: In the second parameter, combine column conditions with & (ampersand) for AND and | (pipe) for OR; wrap each condition in parentheses.

In [60]:
# Team members as (member id, member name, team id) tuples.
# TM5 points at team T104, which does not exist in the teams data below.
team_member_rows = [
    ('TM1', 'Jugal', 'T101'),
    ('TM2', 'Garvik', 'T102'),
    ('TM3', 'John', 'T103'),
    ('TM4', 'Mike', 'T102'),
    ('TM5', 'Mark', 'T104'),
]
df1 = spark.createDataFrame(team_member_rows).toDF('tm_id', 'tm_name', 'tm_team_id')

# Teams as (team id, team name) tuples.
# T105 has no members in the data above.
team_rows = [
    ('T101', 'Sun'),
    ('T102', 'Moon'),
    ('T103', 'Mars'),
    ('T105', 'Jupiter'),
]
df2 = spark.createDataFrame(team_rows).toDF('team_id', 'team_name')

# Print both DataFrames.
df1.show()
df2.show()

+-----+-------+----------+
|tm_id|tm_name|tm_team_id|
+-----+-------+----------+
|  TM1|  Jugal|      T101|
|  TM2| Garvik|      T102|
|  TM3|   John|      T103|
|  TM4|   Mike|      T102|
|  TM5|   Mark|      T104|
+-----+-------+----------+

+-------+---------+
|team_id|team_name|
+-------+---------+
|   T101|      Sun|
|   T102|     Moon|
|   T103|     Mars|
|   T105|  Jupiter|
+-------+---------+



In [61]:
# cross - pairs every row of df1 with every row of df2; no join condition is given.
df3 = df1.crossJoin(other=df2)
df3.show()

+-----+-------+----------+-------+---------+
|tm_id|tm_name|tm_team_id|team_id|team_name|
+-----+-------+----------+-------+---------+
|  TM1|  Jugal|      T101|   T101|      Sun|
|  TM1|  Jugal|      T101|   T102|     Moon|
|  TM2| Garvik|      T102|   T101|      Sun|
|  TM2| Garvik|      T102|   T102|     Moon|
|  TM1|  Jugal|      T101|   T103|     Mars|
|  TM1|  Jugal|      T101|   T105|  Jupiter|
|  TM2| Garvik|      T102|   T103|     Mars|
|  TM2| Garvik|      T102|   T105|  Jupiter|
|  TM3|   John|      T103|   T101|      Sun|
|  TM3|   John|      T103|   T102|     Moon|
|  TM4|   Mike|      T102|   T101|      Sun|
|  TM4|   Mike|      T102|   T102|     Moon|
|  TM5|   Mark|      T104|   T101|      Sun|
|  TM5|   Mark|      T104|   T102|     Moon|
|  TM3|   John|      T103|   T103|     Mars|
|  TM3|   John|      T103|   T105|  Jupiter|
|  TM4|   Mike|      T102|   T103|     Mars|
|  TM4|   Mike|      T102|   T105|  Jupiter|
|  TM5|   Mark|      T104|   T103|     Mars|
|  TM5|   

In [62]:
# inner - keeps only the rows whose tm_team_id has a matching team_id.
df3 = df1.join(
    other=df2,
    on=df1['tm_team_id'] == df2['team_id'],
    how='inner',
)
df3.show()

+-----+-------+----------+-------+---------+
|tm_id|tm_name|tm_team_id|team_id|team_name|
+-----+-------+----------+-------+---------+
|  TM3|   John|      T103|   T103|     Mars|
|  TM2| Garvik|      T102|   T102|     Moon|
|  TM4|   Mike|      T102|   T102|     Moon|
|  TM1|  Jugal|      T101|   T101|      Sun|
+-----+-------+----------+-------+---------+



In [63]:
# left(leftouter, left_outer) - keeps every df1 row; the df2 columns are null where no team matches.
df3 = df1.join(
    other=df2,
    on=df1['tm_team_id'] == df2['team_id'],
    how='left',
)
df3.show()

+-----+-------+----------+-------+---------+
|tm_id|tm_name|tm_team_id|team_id|team_name|
+-----+-------+----------+-------+---------+
|  TM3|   John|      T103|   T103|     Mars|
|  TM5|   Mark|      T104|   null|     null|
|  TM2| Garvik|      T102|   T102|     Moon|
|  TM4|   Mike|      T102|   T102|     Moon|
|  TM1|  Jugal|      T101|   T101|      Sun|
+-----+-------+----------+-------+---------+



In [64]:
# right(rightouter, right_outer) - keeps every df2 row; the df1 columns are null where no member matches.
df3 = df1.join(
    other=df2,
    on=df1['tm_team_id'] == df2['team_id'],
    how='right',
)
df3.show()

+-----+-------+----------+-------+---------+
|tm_id|tm_name|tm_team_id|team_id|team_name|
+-----+-------+----------+-------+---------+
|  TM3|   John|      T103|   T103|     Mars|
|  TM2| Garvik|      T102|   T102|     Moon|
|  TM4|   Mike|      T102|   T102|     Moon|
|  TM1|  Jugal|      T101|   T101|      Sun|
| null|   null|      null|   T105|  Jupiter|
+-----+-------+----------+-------+---------+



In [65]:
# outer(full, fullouter, full_outer) - keeps the rows of both sides; the unmatched side is filled with null.
df3 = df1.join(
    other=df2,
    on=df1['tm_team_id'] == df2['team_id'],
    how='outer',
)
df3.show()

+-----+-------+----------+-------+---------+
|tm_id|tm_name|tm_team_id|team_id|team_name|
+-----+-------+----------+-------+---------+
|  TM3|   John|      T103|   T103|     Mars|
|  TM5|   Mark|      T104|   null|     null|
|  TM2| Garvik|      T102|   T102|     Moon|
|  TM4|   Mike|      T102|   T102|     Moon|
|  TM1|  Jugal|      T101|   T101|      Sun|
| null|   null|      null|   T105|  Jupiter|
+-----+-------+----------+-------+---------+



In [66]:
# semi(leftsemi, left_semi) - returns only the df1 rows that have a match in df2,
# and only df1's columns (like an inner join without the right-hand side table).
df3 = df1.join(
    other=df2,
    on=df1['tm_team_id'] == df2['team_id'],
    how='leftsemi',
)
df3.show()

+-----+-------+----------+
|tm_id|tm_name|tm_team_id|
+-----+-------+----------+
|  TM3|   John|      T103|
|  TM2| Garvik|      T102|
|  TM4|   Mike|      T102|
|  TM1|  Jugal|      T101|
+-----+-------+----------+



In [67]:
# anti(leftanti, left_anti) - the opposite of the left semi join: returns only the
# df1 rows that have no match in df2, with df1's columns only.
df3 = df1.join(
    other=df2,
    on=df1['tm_team_id'] == df2['team_id'],
    how='anti',
)
df3.show()

+-----+-------+----------+
|tm_id|tm_name|tm_team_id|
+-----+-------+----------+
|  TM5|   Mark|      T104|
+-----+-------+----------+

