In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### Read the CSV file and display its schema

In [0]:
df = spark.read.csv("dbfs:/mockarrow/MOCK_DATA.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- Emp ID: integer (nullable = true)
 |-- Emp Name: string (nullable = true)
 |-- contract: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- DOJ: date (nullable = true)
 |-- salary: string (nullable = true)
 |-- Relocation: boolean (nullable = true)



### Handle missing or null values

In [0]:
print(f"missing rows: {df.dropna(subset='salary').count()}/ total rows: {df.count()}")
print()
df.filter("salary is null").show(3)
df.na.fill({"Salary":0}).filter('salary ==0').show(3)
# df.filter(df['salary'].isNull()).show(3)

missing rows: 926/ total rows: 1000

+------+---------+-----------------+------+---------+----------+------+----------+
|Emp ID| Emp Name|         contract|gender|  Country|       DOJ|salary|Relocation|
+------+---------+-----------------+------+---------+----------+------+----------+
|     2|  Hillier|  +7 511 334 2980|  Male|   Russia|2022-08-20|  null|     false|
|     5|Merrielle| +62 862 968 5847|Female|Indonesia|2022-03-24|  null|      null|
|    35|      Rae|+351 240 640 3610|Female| Portugal|2023-08-16|  null|      true|
+------+---------+-----------------+------+---------+----------+------+----------+
only showing top 3 rows

+------+---------+-----------------+------+---------+----------+------+----------+
|Emp ID| Emp Name|         contract|gender|  Country|       DOJ|salary|Relocation|
+------+---------+-----------------+------+---------+----------+------+----------+
|     2|  Hillier|  +7 511 334 2980|  Male|   Russia|2022-08-20|     0|     false|
|     5|Merrielle| +62 86

### group data by a column and make aggregations

In [0]:
df_year = df.withColumn("year",split(df['DOJ'],'-').getItem(0))

df_year.groupBy('year').agg(max('salary').alias('MaxSalary'),count("emp id").alias('TotalEmployees')).show()

+----+---------+--------------+
|year|MaxSalary|TotalEmployees|
+----+---------+--------------+
|2021|$99628.01|           248|
|2022|$94158.32|           270|
|2023|$95988.63|           249|
|2024|$96040.96|           233|
+----+---------+--------------+



### Join 2 dataframes

In [0]:
# df1.join(df2, df1.id == df2.id, "inner")

remove duplicate rows

In [0]:
print(df.count())
print(df.dropDuplicates(subset=['DOJ']).count())
print(df.select('DOJ').distinct().count())

1000
713
713


### filter rows based on a condition

In [0]:
print(df.filter("relocation is null").count())
print(df.filter(df['relocation']==True).count())
print(df.where(df['relocation']==0).count())

32
471
497


### Add new column to the dataframe

In [0]:
dfx = df.withColumn("Month", split(df['DOJ'],'-').getItem(1))\
    .withColumn('StaticCol', lit('static-value'))

dfx.show(3)

+------+--------+-----------------+------+--------+----------+----------+----------+-----+------------+
|Emp ID|Emp Name|         contract|gender| Country|       DOJ|    salary|Relocation|Month|   StaticCol|
+------+--------+-----------------+------+--------+----------+----------+----------+-----+------------+
|     1|    Kara|+853 410 196 7317|Female|   Macao|2021-06-27|$709117.57|      true|   06|static-value|
|     2| Hillier|  +7 511 334 2980|  Male|  Russia|2022-08-20|      null|     false|   08|static-value|
|     3|   Sayre| +57 983 888 4293|Female|Colombia|2022-11-30| $78402.39|      true|   11|static-value|
+------+--------+-----------------+------+--------+----------+----------+----------+-----+------------+
only showing top 3 rows



### Sort dataframe by multiple columns

In [0]:
dfx.orderBy(["Month", "Emp ID"][::-1], ascending=[True, False]).show(3)
dfx.sort(["Month", "Emp ID"], ascending=[True, False]).show(3)

+------+--------+-----------------+------+--------+----------+----------+----------+-----+------------+
|Emp ID|Emp Name|         contract|gender| Country|       DOJ|    salary|Relocation|Month|   StaticCol|
+------+--------+-----------------+------+--------+----------+----------+----------+-----+------------+
|     1|    Kara|+853 410 196 7317|Female|   Macao|2021-06-27|$709117.57|      true|   06|static-value|
|     2| Hillier|  +7 511 334 2980|  Male|  Russia|2022-08-20|      null|     false|   08|static-value|
|     3|   Sayre| +57 983 888 4293|Female|Colombia|2022-11-30| $78402.39|      true|   11|static-value|
+------+--------+-----------------+------+--------+----------+----------+----------+-----+------------+
only showing top 3 rows

+------+--------+----------------+------+--------+----------+----------+----------+-----+------------+
|Emp ID|Emp Name|        contract|gender| Country|       DOJ|    salary|Relocation|Month|   StaticCol|
+------+--------+----------------+------+

### alternate way of reversing a list without using inbuilt function .reverse()

In [0]:
lst = ["Month", "Emp ID"]
lst2=[]
for i in range(len(lst)-1, -1, -1):
    lst2.append(lst[i])
lst2

Out[116]: ['Emp ID', 'Month']