In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
spark = SparkSession.builder.appName('pr').getOrCreate()
spark

In [6]:
# Identify the customers who brought the same product more than once but on different days
# ( if same product is purchased multiple times but on same date shouldn't be counted)

data = [(333, 1122, 9, '2022-02-06T01:00:00.000+00:00'),
        (333,1122,10,'2022-02-06T02:00:00.000+00:00'), 
        (536,1435, 10,'2022-03-02T08:40:00.000+00:00'),
        (536,3223,5,'2022-03-02T09:33:28.000+00:00'),
        (536, 3223, 6,'2022-01-11T12:33:44.000+00:00'),
        (827, 2452, 45,'2022-03-02T00:00:00.000+00:00'), 
        (827, 3585, 35,'2022-02-20T14:05:26.000+00:00')]
df = spark.createDataFrame(data = data, schema=['uid', 'pid', 'qunt', 'pur_dt'])
df.show()

+---+----+----+--------------------+
|uid| pid|qunt|              pur_dt|
+---+----+----+--------------------+
|333|1122|   9|2022-02-06T01:00:...|
|333|1122|  10|2022-02-06T02:00:...|
|536|1435|  10|2022-03-02T08:40:...|
|536|3223|   5|2022-03-02T09:33:...|
|536|3223|   6|2022-01-11T12:33:...|
|827|2452|  45|2022-03-02T00:00:...|
|827|3585|  35|2022-02-20T14:05:...|
+---+----+----+--------------------+



In [7]:
df1 = df.withColumn('dt', to_date('pur_dt')).groupBy('uid', 'pid', 'dt').count()
df1.show()

+---+----+----------+-----+
|uid| pid|        dt|count|
+---+----+----------+-----+
|333|1122|2022-02-06|    2|
|536|1435|2022-03-02|    1|
|536|3223|2022-03-02|    1|
|536|3223|2022-01-11|    1|
|827|2452|2022-03-02|    1|
|827|3585|2022-02-20|    1|
+---+----+----------+-----+



In [8]:
df1.groupBy('uid', 'pid').count().filter(col('count') >= 2).show()

+---+----+-----+
|uid| pid|count|
+---+----+-----+
|536|3223|    2|
+---+----+-----+



In [9]:
# In the given dataset, names contain for some names and space for some names, extract the first name and last name 

data = [(1, 'sagar-prajapati'), (2, 'alex-john'), (3, 'john cena'), (4, 'kim joe')]
schema = ['Id','name']
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=True)

+---+---------------+
| Id|           name|
+---+---------------+
|  1|sagar-prajapati|
|  2|      alex-john|
|  3|      john cena|
|  4|        kim joe|
+---+---------------+



In [29]:
replace_regex = "((,)?\s|[-])"
df1 = df.withColumn("A", regexp_replace(col("name"), replace_regex, " "))
df1.show()

+---+---------------+---------------+
| Id|           name|              A|
+---+---------------+---------------+
|  1|sagar-prajapati|sagar prajapati|
|  2|      alex-john|      alex john|
|  3|      john cena|      john cena|
|  4|        kim joe|        kim joe|
+---+---------------+---------------+



In [30]:
df2 = df1.withColumn('First_name', split(df1['A'], '').getItem(0)).withColumn('Last_Name', split(df1['A'], '').getItem(1))
df2.drop('A').show()

+---+---------------+----------+---------+
| Id|           name|First_name|Last_Name|
+---+---------------+----------+---------+
|  1|sagar-prajapati|         s|        a|
|  2|      alex-john|         a|        l|
|  3|      john cena|         j|        o|
|  4|        kim joe|         k|        i|
+---+---------------+----------+---------+



In [31]:
split_regex = "((,)?\s|[-])"
df1 = df.withColumn("A", split(col("name"), split_regex))
df1.show()

+---+---------------+------------------+
| Id|           name|                 A|
+---+---------------+------------------+
|  1|sagar-prajapati|[sagar, prajapati]|
|  2|      alex-john|      [alex, john]|
|  3|      john cena|      [john, cena]|
|  4|        kim joe|        [kim, joe]|
+---+---------------+------------------+



In [32]:
# call_duration 

data = [(10, 20, 58), (20,10,12), (10,30, 20),(30,40,100),(30, 40, 200), (30, 40, 200), (40, 30, 500)]
df = spark.createDataFrame(data = data, schema=['person1', 'person2', 'call_duration'])
df.show()

+-------+-------+-------------+
|person1|person2|call_duration|
+-------+-------+-------------+
|     10|     20|           58|
|     20|     10|           12|
|     10|     30|           20|
|     30|     40|          100|
|     30|     40|          200|
|     30|     40|          200|
|     40|     30|          500|
+-------+-------+-------------+



In [33]:
df1 = df.alias('t1').unionAll(df.alias('t2')).filter(col('person1') < col('person2'))
df1.show()

+-------+-------+-------------+
|person1|person2|call_duration|
+-------+-------+-------------+
|     10|     20|           58|
|     10|     30|           20|
|     30|     40|          100|
|     30|     40|          200|
|     30|     40|          200|
|     10|     20|           58|
|     10|     30|           20|
|     30|     40|          100|
|     30|     40|          200|
|     30|     40|          200|
+-------+-------+-------------+



In [34]:
df2 = df1.groupBy('person1','person2').agg(count(col('call_duration')).alias('call_count'), sum(col('call_duration')).alias('total_duration'))
df2.show()

+-------+-------+----------+--------------+
|person1|person2|call_count|total_duration|
+-------+-------+----------+--------------+
|     10|     20|         2|           116|
|     10|     30|         2|            40|
|     30|     40|         6|          1000|
+-------+-------+----------+--------------+



In [35]:
# Select the teachers who teaches only math and not any other subject 

data = [(1, "MATH"), (2,'MATH'), (4, 'CHEM'),(5, 'MATH'),(2, 'ENG'), (3, 'PHY')]
df = spark.createDataFrame(data = data, schema=['id', 'sub'])
df.show()

+---+----+
| id| sub|
+---+----+
|  1|MATH|
|  2|MATH|
|  4|CHEM|
|  5|MATH|
|  2| ENG|
|  3| PHY|
+---+----+



In [36]:
df1 = df.groupBy('id').count().filter(col('count') == 1)
df1.show()

+---+-----+
| id|count|
+---+-----+
|  1|    1|
|  4|    1|
|  5|    1|
|  3|    1|
+---+-----+



In [37]:
df.join(df1, df.id == df1.id, how = 'inner').filter(df['sub'] == 'MATH').select(df['*']).show()

+---+----+
| id| sub|
+---+----+
|  1|MATH|
|  5|MATH|
+---+----+



In [38]:
# Find out the companies where revenue has only increased over the years and there was no decrease at all for any point.

data = [('ABC', 2000, 100),
('ABC', 2001, 110),
('ABC', 2002, 120),
('XYZ', 2000, 100),
('XYZ', 2001, 90),
('XYZ', 2002, 120),
('RXC', 2000, 500),
('RXC', 2001, 400),
('RXC', 2002, 600),
('RXC', 2003, 800)]
schema = StructType([StructField('COMPANY', StringType(), True),
                     StructField('YEAR', IntegerType(), True),
                     StructField('REVENUE', IntegerType(), True)]) 
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+-------+----+-------+
|COMPANY|YEAR|REVENUE|
+-------+----+-------+
|    ABC|2000|    100|
|    ABC|2001|    110|
|    ABC|2002|    120|
|    XYZ|2000|    100|
|    XYZ|2001|     90|
|    XYZ|2002|    120|
|    RXC|2000|    500|
|    RXC|2001|    400|
|    RXC|2002|    600|
|    RXC|2003|    800|
+-------+----+-------+



In [39]:
from pyspark.sql.window import Window
window = Window.partitionBy('COMPANY').orderBy('YEAR')
df1 = df.withColumn('lag', col('REVENUE')-lag(col('REVENUE'), 1, 0).over(window))
df1.show()

+-------+----+-------+----+
|COMPANY|YEAR|REVENUE| lag|
+-------+----+-------+----+
|    ABC|2000|    100| 100|
|    ABC|2001|    110|  10|
|    ABC|2002|    120|  10|
|    RXC|2000|    500| 500|
|    RXC|2001|    400|-100|
|    RXC|2002|    600| 200|
|    RXC|2003|    800| 200|
|    XYZ|2000|    100| 100|
|    XYZ|2001|     90| -10|
|    XYZ|2002|    120|  30|
+-------+----+-------+----+



In [41]:
df2 = df1.groupBy('COMPANY').agg(min(col('lag')).alias('diff')).filter(col('diff') > 0)
df2.show()

+-------+----+
|COMPANY|diff|
+-------+----+
|    ABC|  10|
+-------+----+



In [42]:
# lIst down the movies with an odd ID and which is not boring and order by id desc 

data = [(1, 'war', 'great ed',8.9),
    (2,'science','fiction',8.5),
    (3,'irish', 'boring', 6.2),
    (4, 'Ice song', 'fantacy', 8.6),
    (5, "house card", 'interesting', 9.1)]
sch = ['ID', 'Movie', 'Type', 'Rating']
df = spark.createDataFrame(data=data, schema=sch)
df.show()

+---+----------+-----------+------+
| ID|     Movie|       Type|Rating|
+---+----------+-----------+------+
|  1|       war|   great ed|   8.9|
|  2|   science|    fiction|   8.5|
|  3|     irish|     boring|   6.2|
|  4|  Ice song|    fantacy|   8.6|
|  5|house card|interesting|   9.1|
+---+----------+-----------+------+



In [43]:
df1 = df.filter(((df['ID'] %2 ) != 0 ) & (col('Type') != 'boring')).orderBy(col('ID').desc())
df1.show()

+---+----------+-----------+------+
| ID|     Movie|       Type|Rating|
+---+----------+-----------+------+
|  5|house card|interesting|   9.1|
|  1|       war|   great ed|   8.9|
+---+----------+-----------+------+



In [44]:
# Find the employees earning more than managers 

data = [(1, "John", 6000, 4), (2,'Kevin',11000,4), (3, 'Bob',8000, 5),(4, 'Laura',9000,None),(5, 'Sarah',10000, None)]
df = spark.createDataFrame(data = data, schema=['id', 'name', 'salary','mid'])
df.show()

+---+-----+------+----+
| id| name|salary| mid|
+---+-----+------+----+
|  1| John|  6000|   4|
|  2|Kevin| 11000|   4|
|  3|  Bob|  8000|   5|
|  4|Laura|  9000|NULL|
|  5|Sarah| 10000|NULL|
+---+-----+------+----+



In [45]:
df.alias('emp').join(df.alias('mgr'), col('emp.mid')  == col('mgr.id'), 'inner').filter(col('emp.salary') > col('mgr.salary'))\
.select(col('emp.id'), col('emp.name'), col('emp.salary'), col('emp.mid')).show()

+---+-----+------+---+
| id| name|salary|mid|
+---+-----+------+---+
|  2|Kevin| 11000|  4|
+---+-----+------+---+



"""𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧
You are working as a Data Engineer at a fintech company. Your team is working on integrating two datasets:

1. Customer Transactions Data (transactions_df) - Contains customer transactions with columns: customer_id, txn_id, amount, and txn_date. 

2. Customer Profile Data (profile_df) - Contains customer information with columns: customer_id, name, age, and txn_id (latest transaction ID for reference).

 The requirement is to merge these two DataFrames on customer_id while keeping track of:

Conflicting column names (txn_id) should be renamed properly.

If a customer exists in profile_df but not in transactions_df, the row should still be present with NULL values for transaction-related columns.

Your task is to write an optimized PySpark code to achieve this."""

In [4]:
transactions_data = [ (101, "T001", 500, "2024-08-10"), (102, "T002", 1200, "2024-08-09"), 
                     (103, "T003", 300, "2024-08-08"), (104, "T004", 450, "2024-08-07"), ] 

profile_data = [ (101, "John", 30, "T001"), (102, "Emma", 27, "T005"), 
                (103, "Alex", 35, "T003"), (105, "Sam", 40, "T006"), ]

transactions_df = spark.createDataFrame(transactions_data, ["customer_id", "txn_id", "amount", "txn_date"])
transactions_df.show()

profile_df = spark.createDataFrame(profile_data, ["customer_id", "name", "age", "txn_id"])
profile_df.show()

+-----------+------+------+----------+
|customer_id|txn_id|amount|  txn_date|
+-----------+------+------+----------+
|        101|  T001|   500|2024-08-10|
|        102|  T002|  1200|2024-08-09|
|        103|  T003|   300|2024-08-08|
|        104|  T004|   450|2024-08-07|
+-----------+------+------+----------+

+-----------+----+---+------+
|customer_id|name|age|txn_id|
+-----------+----+---+------+
|        101|John| 30|  T001|
|        102|Emma| 27|  T005|
|        103|Alex| 35|  T003|
|        105| Sam| 40|  T006|
+-----------+----+---+------+



In [7]:
profile_df = profile_df.withColumnRenamed('txn_id', 'last_txn_id')
profile_df.show()

+-----------+----+---+-----------+
|customer_id|name|age|last_txn_id|
+-----------+----+---+-----------+
|        101|John| 30|       T001|
|        102|Emma| 27|       T005|
|        103|Alex| 35|       T003|
|        105| Sam| 40|       T006|
+-----------+----+---+-----------+



In [8]:
profile_df.join(transactions_df, on = "customer_id", how = "full_outer").show()

+-----------+----+----+-----------+------+------+----------+
|customer_id|name| age|last_txn_id|txn_id|amount|  txn_date|
+-----------+----+----+-----------+------+------+----------+
|        101|John|  30|       T001|  T001|   500|2024-08-10|
|        102|Emma|  27|       T005|  T002|  1200|2024-08-09|
|        103|Alex|  35|       T003|  T003|   300|2024-08-08|
|        104|NULL|NULL|       NULL|  T004|   450|2024-08-07|
|        105| Sam|  40|       T006|  NULL|  NULL|      NULL|
+-----------+----+----+-----------+------+------+----------+



𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧

You are given an employee dataset containing information about employees and their managers. Each employee has a manager_id that refers to another employee in the same table. Your task is to use self-join to find hierarchical relationships between employees, such as finding all employees under a specific manager or the reporting hierarchy of an employee.

Interview Task
- Write a PySpark self-join query to find the direct reports of each manager. Additionally, extend the logic to find all hierarchical relationships up to any level.


In [9]:
data = [ (1, "Alice", None), (2, "Bob", 1),
(3, "Charlie", 1), (4, "David", 2),
(5, "Eva", 2), (6, "Frank", 3), (7, "Grace", 3) ]

columns = ["employee_id", "employee_name", "manager_id"]
df = spark.createDataFrame(data, columns)
df.show()

+-----------+-------------+----------+
|employee_id|employee_name|manager_id|
+-----------+-------------+----------+
|          1|        Alice|      NULL|
|          2|          Bob|         1|
|          3|      Charlie|         1|
|          4|        David|         2|
|          5|          Eva|         2|
|          6|        Frank|         3|
|          7|        Grace|         3|
+-----------+-------------+----------+



In [17]:
df.alias('emp').join(df.alias('mgr'), col('mgr.employee_id') == col('emp.manager_id'), 'left')\
           .select(col('mgr.employee_name'), col('emp.employee_name')).show()

+-------------+-------------+
|employee_name|employee_name|
+-------------+-------------+
|         NULL|        Alice|
|        Alice|          Bob|
|        Alice|      Charlie|
|          Bob|        David|
|          Bob|          Eva|
|      Charlie|        Frank|
|      Charlie|        Grace|
+-------------+-------------+



𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧

You are working with large datasets in PySpark and need to join two DataFrames. However, one of the tables has highly skewed data, causing performance issues due to data shuffling. How would you optimize this join using salting techniques?
You are given the following sample datasets:

sales_df (Fact Table - Large Dataset, Highly Skewed on store_id)
Your task is to perform an optimized join between sales_df and store_df on store_id, ensuring that the skewness does not degrade performance.

In [18]:
sales_data = [ (101, "P001", 100), (101, "P002", 200), (101, "P003", 150), (102, "P004", 300), 
              (103, "P005", 400), (101, "P006", 500), (104, "P007", 250) ] 

sales_df = spark.createDataFrame(sales_data, ["store_id", "product_id", "amount"]) 
sales_df.show()

store_data = [(101, "Walmart"), (102, "Target"), (103, "Costco"), (104, "BestBuy")] 
store_df = spark.createDataFrame(store_data, ["store_id", "store_name"]) 
store_df.show()

+--------+----------+------+
|store_id|product_id|amount|
+--------+----------+------+
|     101|      P001|   100|
|     101|      P002|   200|
|     101|      P003|   150|
|     102|      P004|   300|
|     103|      P005|   400|
|     101|      P006|   500|
|     104|      P007|   250|
+--------+----------+------+

+--------+----------+
|store_id|store_name|
+--------+----------+
|     101|   Walmart|
|     102|    Target|
|     103|    Costco|
|     104|   BestBuy|
+--------+----------+



In [23]:
# Step 1: Adding Salt to skewed 'sales_df'

num_salt_keys = 3  # Define the range of salt keys 

sales_df_salted = sales_df.withColumn('salt', floor(rand() * num_salt_keys))\
                          .withColumn('salted_store_id', concat_ws("_", col('store_id'), col('salt')))
sales_df_salted.show()

+--------+----------+------+----+---------------+
|store_id|product_id|amount|salt|salted_store_id|
+--------+----------+------+----+---------------+
|     101|      P001|   100|   2|          101_2|
|     101|      P002|   200|   0|          101_0|
|     101|      P003|   150|   2|          101_2|
|     102|      P004|   300|   0|          102_0|
|     103|      P005|   400|   1|          103_1|
|     101|      P006|   500|   2|          101_2|
|     104|      P007|   250|   1|          104_1|
+--------+----------+------+----+---------------+



In [24]:
# Step 2: Expanding 'store_df' for Join compatibility 

expanded_store_df = store_df.crossJoin(spark.range(0, num_salt_keys).toDF('salt'))\
                    .withColumn('salted_store_id', concat_ws('_', col('store_id'), col('salt')))
expanded_store_df.show()

+--------+----------+----+---------------+
|store_id|store_name|salt|salted_store_id|
+--------+----------+----+---------------+
|     101|   Walmart|   0|          101_0|
|     101|   Walmart|   1|          101_1|
|     101|   Walmart|   2|          101_2|
|     102|    Target|   0|          102_0|
|     102|    Target|   1|          102_1|
|     102|    Target|   2|          102_2|
|     103|    Costco|   0|          103_0|
|     103|    Costco|   1|          103_1|
|     103|    Costco|   2|          103_2|
|     104|   BestBuy|   0|          104_0|
|     104|   BestBuy|   1|          104_1|
|     104|   BestBuy|   2|          104_2|
+--------+----------+----+---------------+



In [25]:
# Step 3: Performing the Optimized Join on Salted Keys

joined_df = sales_df_salted.join(expanded_store_df,'salted_store_id', 'inner').drop('salted_store_id', 'salt')
joined_df.show()

+--------+----------+------+--------+----------+
|store_id|product_id|amount|store_id|store_name|
+--------+----------+------+--------+----------+
|     101|      P002|   200|     101|   Walmart|
|     101|      P001|   100|     101|   Walmart|
|     101|      P003|   150|     101|   Walmart|
|     101|      P006|   500|     101|   Walmart|
|     102|      P004|   300|     102|    Target|
|     103|      P005|   400|     103|    Costco|
|     104|      P007|   250|     104|   BestBuy|
+--------+----------+------+--------+----------+



In [None]:
https://www.linkedin.com/company/seekho-bigdata-institute/posts/?feedView=all