In [0]:
# Write the emp_data and dept_data DataFrames to CSV. Also find the second-highest salary in each department and write that result to CSV as well.

# GC9 How to write into CSV | Databricks Tutorial |

### For on-premise Jupyter: open a command prompt (cmd) in the target folder and run `jupyter notebook`; the notebook server will then open in that folder.
from pyspark.sql.functions import col, rank, dense_rank
from pyspark.sql.window import Window


# Sample data for employees: (id, name, salary, dept_id)
emp_data = [
    (1, "Alice", 5000, 1),
    (2, "Bob", 7000, 1),
    (3, "Charlie", 6000, 1),
    (4, "David", 8000, 2),
    (5, "Eve", 9000, 2),
    (6, "Frank", 7500, 2),
]

# Sample data for departments: (dept_id, dept_name)
dept_data = [
    (1, "HR"),
    (2, "IT"),
]

# Column names for the employee and department DataFrames
emp_columns = ["id", "name", "salary", "dept_id"]
dept_columns = ["dept_id", "dept_name"]

# Create DataFrames (`spark` is the session provided by the notebook environment)
emp_df = spark.createDataFrame(emp_data, emp_columns)
dept_df = spark.createDataFrame(dept_data, dept_columns)

# Rank salaries inside each department, highest first. dense_rank() leaves no
# gaps after ties, so rank 2 is the second-highest distinct salary.
window_spec = Window.partitionBy("dept_id").orderBy(col("salary").desc())
ranked_df = emp_df.withColumn("rnk", dense_rank().over(window_spec))
second_highest_df = ranked_df.filter(col("rnk") == 2)

# Keep the DataFrame itself: .show() only prints and returns None, so chaining
# it into the assignment would leave joined_df unusable for the write below.
joined_df = second_highest_df.join(dept_df, "dept_id").select(
    col("name"), col("salary"), col("dept_name")
)
joined_df.show()

# Spark writes each CSV target as a directory of part files at the given path.
# mode("overwrite") replaces existing output; "append" can be used instead.
emp_df.write.mode("overwrite").csv("dbfs:/FileStore/EMP_DATA.csv")
dept_df.write.mode("overwrite").csv("dbfs:/FileStore/DEPT_DATA.csv")

# Persist the second-highest-salary result as well, as the task requires.
joined_df.write.mode("overwrite").csv("dbfs:/FileStore/SECOND_HIGHEST_SALARY.csv")



+-------+------+---------+
|   name|salary|dept_name|
+-------+------+---------+
|Charlie|  6000|       HR|
|  David|  8000|       IT|
+-------+------+---------+



In [0]:
# GC10 How to merge two DataFrame using PySpark | Databricks Tutorial 

# First student DataFrame.
simpleData = [
    ("Sagar", "CSE", "UP", 80),
    ("Shivam", "IT", "MP", 86),
]
columns = ["StudentName", "Department", "City", "Marks"]
df_1 = spark.createDataFrame(data=simpleData, schema=columns)
df_1.show()

# Second student DataFrame: one row repeats df_1, and two column names differ.
simpleData_2 = [
    ("Sagar", "CSE", "UP", 80),
    ("Muni", "Mech", "AP", 70),
]
columns = ["Student_Name", "Department_Name", "City", "Marks"]
df_2 = spark.createDataFrame(data=simpleData_2, schema=columns)
df_2.show()

# union() behaves like SQL UNION ALL: rows are combined by column position and
# duplicates are kept. The result carries the column names of the first
# DataFrame (df_1), and both sides must have the same number of columns.
merged_with_duplicates = df_1.union(df_2)
merged_with_duplicates.show()

# distinct() drops the repeated row, leaving only unique rows.
df_3 = merged_with_duplicates.distinct()
df_3.show()


+-----------+----------+----+-----+
|StudentName|Department|City|Marks|
+-----------+----------+----+-----+
|      Sagar|       CSE|  UP|   80|
|     Shivam|        IT|  MP|   86|
|       Muni|      Mech|  AP|   70|
+-----------+----------+----+-----+



In [0]:
# GC11 How to use WHEN Otherwise in PySpark  | Databricks Tutorial |

from pyspark.sql.functions import when,col
# Map each code in the "City" column to a full state name; any code that is
# not listed falls through to "Unknown".
state_expr = (
    when(col("City") == "UP", "Uttar Pradesh")
    .when(col("City") == "AP", "Andhra Pradesh")
    .when(col("City") == "MP", "Madhya Pradesh")
    .otherwise("Unknown")
)

# Variant 1: add the derived column with withColumn().
df_4 = df_3.withColumn("State", state_expr)
df_4.show()

# Variant 2: same result via select(), keeping every existing column and
# appending the aliased expression.
df_5 = df_3.select(col("*"), state_expr.alias("State"))
df_5.show()


+-----------+----------+----+-----+--------------+
|StudentName|Department|City|Marks|         State|
+-----------+----------+----+-----+--------------+
|      Sagar|       CSE|  UP|   80| Uttar Pradesh|
|     Shivam|        IT|  MP|   86|Madhya Pradesh|
|       Muni|      Mech|  AP|   70|Andhra Pradesh|
+-----------+----------+----+-----+--------------+



In [0]:
from pyspark.sql.functions import *
# GC12 How to join two DataFrames in PySpark | Databricks Tutorial |
# Left-hand DataFrame: three students keyed by ID.
simpleData = [
    (1, "Sagar", "CSE", "UP", 80),
    (2, "Shivam", "IT", "MP", 86),
    (3, "Muni", "Mech", "AP", 70),
]
columns = ["ID", "Student_Name", "Department_Name", "City", "Marks"]
df_1 = spark.createDataFrame(data=simpleData, schema=columns)
df_1.show()

# Right-hand DataFrame: only IDs 1 and 3 are present.
simpleData_2 = [
    (1, "Sagar", "CSE", "UP", 80),
    (3, "Muni", "Mech", "AP", 70),
]
columns = ["ID", "StudentName", "DepartmentName", "City", "Marks"]
df_2 = spark.createDataFrame(data=simpleData_2, schema=columns)
df_2.show()

# Inner join on ID. "inner" is also the default when the join type is omitted;
# other join types such as "left", "right", "full" and "cross" are available.
# The result keeps the columns of both sides, so ID, City and Marks appear twice.
id_match = df_1.ID == df_2.ID
df_1.join(df_2, id_match, "inner").show()

# Same join written with DataFrame aliases, so the condition can refer to each
# side's column by a qualified name.
students_a = df_1.alias("A")
students_b = df_2.alias("B")
students_a.join(students_b, col("A.ID") == col("B.ID"), "inner").show()


+---+------------+---------------+----+-----+
| ID|Student_Name|Department_Name|City|Marks|
+---+------------+---------------+----+-----+
|  1|       Sagar|            CSE|  UP|   80|
|  2|      Shivam|             IT|  MP|   86|
|  3|        Muni|           Mech|  AP|   70|
+---+------------+---------------+----+-----+

+---+-----------+--------------+----+-----+
| ID|StudentName|DepartmentName|City|Marks|
+---+-----------+--------------+----+-----+
|  1|      Sagar|           CSE|  UP|   80|
|  3|       Muni|          Mech|  AP|   70|
+---+-----------+--------------+----+-----+

+---+------------+---------------+----+-----+---+-----------+--------------+----+-----+
| ID|Student_Name|Department_Name|City|Marks| ID|StudentName|DepartmentName|City|Marks|
+---+------------+---------------+----+-----+---+-----------+--------------+----+-----+
|  1|       Sagar|            CSE|  UP|   80|  1|      Sagar|           CSE|  UP|   80|
|  3|        Muni|           Mech|  AP|   70|  3|       Muni