In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port', '0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
appName("Spark").\
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [70]:
from pyspark.sql.types import *

In [51]:
### 1. Drop Duplicates

In [6]:
schema = StructType([
    StructField("emp_id", IntegerType(), True),
    StructField("emp_name", StringType(), True),
    StructField("emp_gender", StringType(), True),
    StructField("emp_age", IntegerType(), True),
    StructField("emp_salary", IntegerType(), True),
    StructField("emp_manager", StringType(), True)
])

In [7]:
data = [
    (1, "Arjun Patel", "Male", 30, 60000, "Aarav Sharma"),
    (2, "Aarav Sharma", "Male", 28, 55000, "Zara Singh"),
    (3, "Zara Singh", "Female", 35, 70000, "Arjun Patel"),
    (4, "Priya Reddy", "Female", 32, 65000, "Aarav Sharma"),
    (1, "Arjun Patel", "Male", 30, 60000, "Aarav Sharma"),
    (6, "Naina Verma", "Female", 31, 72000, "Arjun Patel"),
    (1, "Arjun Patel", "Male", 30, 60000, "Aarav Sharma"),
    (4, "Priya Reddy", "Female", 32, 65000, "Aarav Sharma"),
    (5, "Aditya Kapoor", "Male", 28, 58000, "Zara Singh"),
    (10, "Anaya Joshi", "Female", 27, 59000, "Aarav Sharma"),
    (11, "Rohan Malhotra", "Male", 36, 73000, "Zara Singh"),
    (3, "Zara Singh", "Female", 35, 70000, "Arjun Patel")
]

In [8]:
df=spark.createDataFrame(data,schema)

In [19]:
from pyspark.sql.functions import count,col

In [21]:
group_df=df.groupBy("emp_id").agg(count("*")).alias("count_emp").filter(col("count(1)")>1)

In [22]:
group_df.show()

+------+--------+
|emp_id|count(1)|
+------+--------+
|     1|       3|
|     3|       2|
|     4|       2|
+------+--------+



In [23]:
dup_df=df.groupby(df.columns).count()

In [29]:
dup_df.filter(col("count")>1).orderBy("emp_id").drop("count").show()

+------+-----------+----------+-------+----------+------------+
|emp_id|   emp_name|emp_gender|emp_age|emp_salary| emp_manager|
+------+-----------+----------+-------+----------+------------+
|     1|Arjun Patel|      Male|     30|     60000|Aarav Sharma|
|     3| Zara Singh|    Female|     35|     70000| Arjun Patel|
|     4|Priya Reddy|    Female|     32|     65000|Aarav Sharma|
+------+-----------+----------+-------+----------+------------+



In [27]:
### Window functions

In [34]:
from pyspark.sql.window import Window

In [41]:
window_spec=Window.partitionBy(df.columns)

In [44]:
window_df=df.withColumn("count",count(col("emp_id")).over(window_spec))

In [47]:
new_df=window_df.dropDuplicates()

In [50]:
new_df.orderBy("emp_id").drop("count").show()

+------+--------------+----------+-------+----------+------------+
|emp_id|      emp_name|emp_gender|emp_age|emp_salary| emp_manager|
+------+--------------+----------+-------+----------+------------+
|     1|   Arjun Patel|      Male|     30|     60000|Aarav Sharma|
|     2|  Aarav Sharma|      Male|     28|     55000|  Zara Singh|
|     3|    Zara Singh|    Female|     35|     70000| Arjun Patel|
|     4|   Priya Reddy|    Female|     32|     65000|Aarav Sharma|
|     5| Aditya Kapoor|      Male|     28|     58000|  Zara Singh|
|     6|   Naina Verma|    Female|     31|     72000| Arjun Patel|
|    10|   Anaya Joshi|    Female|     27|     59000|Aarav Sharma|
|    11|Rohan Malhotra|      Male|     36|     73000|  Zara Singh|
+------+--------------+----------+-------+----------+------------+



In [None]:
####  Question 2: Union and Union All

In [71]:
from pyspark.sql import Row

In [72]:
data_1=[Row(employee_name="Alice", employee_gender="F", employee_salary=70000),Row(employee_name="Bob", employee_gender="M", employee_salary=80000),
         Row(employee_name="Charlie", employee_gender="M", employee_salary=55000),
         Row(employee_name="David", employee_gender="M", employee_salary=45000),
         Row(employee_name="Eve", employee_gender="F", employee_salary=50000),
          Row(employee_name="Eve", employee_gender="F", employee_salary=50000)
         ]

In [73]:
data_2 = [Row(employee_name="Frank", employee_gender="M", employee_salary=60000),
         Row(employee_name="Grace", employee_gender="F", employee_salary=65000),
         Row(employee_name="Hannah", employee_gender="F", employee_salary=70000),
         Row(employee_name="Ian", employee_gender="M", employee_salary=48000),
         Row(employee_name="Jill", employee_gender="F", employee_salary=53000),
          Row(employee_name="Eve", employee_gender="F", employee_salary=50000)
        ]

In [74]:
df1=spark.createDataFrame(data_1)

In [75]:
df2=spark.createDataFrame(data_2)

In [77]:
result_df=df1.union(df2)

In [78]:
result_df.show()

+-------------+---------------+---------------+
|employee_name|employee_gender|employee_salary|
+-------------+---------------+---------------+
|        Alice|              F|          70000|
|          Bob|              M|          80000|
|      Charlie|              M|          55000|
|        David|              M|          45000|
|          Eve|              F|          50000|
|          Eve|              F|          50000|
|        Frank|              M|          60000|
|        Grace|              F|          65000|
|       Hannah|              F|          70000|
|          Ian|              M|          48000|
|         Jill|              F|          53000|
|          Eve|              F|          50000|
+-------------+---------------+---------------+



In [79]:
## unique records while using union

In [80]:
df_unique=result_df.distinct()

In [82]:
df_unique.show(truncate=False)

+-------------+---------------+---------------+
|employee_name|employee_gender|employee_salary|
+-------------+---------------+---------------+
|Eve          |F              |50000          |
|Frank        |M              |60000          |
|Hannah       |F              |70000          |
|David        |M              |45000          |
|Bob          |M              |80000          |
|Alice        |F              |70000          |
|Jill         |F              |53000          |
|Charlie      |M              |55000          |
|Ian          |M              |48000          |
|Grace        |F              |65000          |
+-------------+---------------+---------------+



In [83]:
## If we have diff schema then we need to use unionByName()

In [2]:
spark

In [3]:
from pyspark.sql.types import *

In [4]:
emp_data = [
    (101, "Varun", "Sales", 75000),
    (102, "Alia", "HR", 46000),
    (103, "David", "IT", 55000),
    (104, "Steve", "Sales", 75000),
    (105, "Soham", "HR", 46000),
    (106, "Kiron", "IT", 50000),
    (107, "Dhoni", "Sales", 68000),
    (108, "Tiger", "HR", 45000),
    (109, "Rock", "IT", 53000),
    (110, "Khali", "Sales", 75000)
]

In [5]:
emp_schema=StructType([
    StructField("emp_id",IntegerType(),True),
    StructField("emp_name",StringType(),True),
    StructField("emp_dept",StringType(),True),
    StructField("emp_sal",LongType(),True)
])

In [6]:
emp_df=spark.createDataFrame(emp_data,emp_schema)

In [7]:
emp_df.show()

+------+--------+--------+-------+
|emp_id|emp_name|emp_dept|emp_sal|
+------+--------+--------+-------+
|   101|   Varun|   Sales|  75000|
|   102|    Alia|      HR|  46000|
|   103|   David|      IT|  55000|
|   104|   Steve|   Sales|  75000|
|   105|   Soham|      HR|  46000|
|   106|   Kiron|      IT|  50000|
|   107|   Dhoni|   Sales|  68000|
|   108|   Tiger|      HR|  45000|
|   109|    Rock|      IT|  53000|
|   110|   Khali|   Sales|  75000|
+------+--------+--------+-------+



In [12]:
from pyspark.sql.window import *
from pyspark.sql.functions import *

In [13]:
window_spec=Window.partitionBy("emp_dept").orderBy(col("emp_sal").desc())

In [14]:
## Rank()

In [17]:
rank_df=emp_df.withColumn("Rank",rank().over(window_spec))
rank_df.show()

+------+--------+--------+-------+----+
|emp_id|emp_name|emp_dept|emp_sal|Rank|
+------+--------+--------+-------+----+
|   110|   Khali|   Sales|  75000|   1|
|   101|   Varun|   Sales|  75000|   1|
|   104|   Steve|   Sales|  75000|   1|
|   107|   Dhoni|   Sales|  68000|   4|
|   102|    Alia|      HR|  46000|   1|
|   105|   Soham|      HR|  46000|   1|
|   108|   Tiger|      HR|  45000|   3|
|   103|   David|      IT|  55000|   1|
|   109|    Rock|      IT|  53000|   2|
|   106|   Kiron|      IT|  50000|   3|
+------+--------+--------+-------+----+



In [18]:
## Dense_rank()

In [19]:
dense_rank_df=emp_df.withColumn("Rank",dense_rank().over(window_spec))
dense_rank_df.show()

+------+--------+--------+-------+----+
|emp_id|emp_name|emp_dept|emp_sal|Rank|
+------+--------+--------+-------+----+
|   101|   Varun|   Sales|  75000|   1|
|   104|   Steve|   Sales|  75000|   1|
|   110|   Khali|   Sales|  75000|   1|
|   107|   Dhoni|   Sales|  68000|   2|
|   102|    Alia|      HR|  46000|   1|
|   105|   Soham|      HR|  46000|   1|
|   108|   Tiger|      HR|  45000|   2|
|   103|   David|      IT|  55000|   1|
|   109|    Rock|      IT|  53000|   2|
|   106|   Kiron|      IT|  50000|   3|
+------+--------+--------+-------+----+



In [20]:
## Row Number

In [22]:
row_number_df=emp_df.withColumn("Rank",row_number().over(window_spec))
row_number_df.show()

+------+--------+--------+-------+----+
|emp_id|emp_name|emp_dept|emp_sal|Rank|
+------+--------+--------+-------+----+
|   110|   Khali|   Sales|  75000|   1|
|   101|   Varun|   Sales|  75000|   2|
|   104|   Steve|   Sales|  75000|   3|
|   107|   Dhoni|   Sales|  68000|   4|
|   102|    Alia|      HR|  46000|   1|
|   105|   Soham|      HR|  46000|   2|
|   108|   Tiger|      HR|  45000|   3|
|   103|   David|      IT|  55000|   1|
|   109|    Rock|      IT|  53000|   2|
|   106|   Kiron|      IT|  50000|   3|
+------+--------+--------+-------+----+



In [2]:
spark.stop()