In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [3]:
spark = SparkSession.builder.appName('pr').getOrCreate()
spark

In [6]:
# Identify the customers who brought the same product more than once but on different days
# ( if same product is purchased multiple times but on same date shouldn't be counted)

data = [(333, 1122, 9, '2022-02-06T01:00:00.000+00:00'),
        (333,1122,10,'2022-02-06T02:00:00.000+00:00'), 
        (536,1435, 10,'2022-03-02T08:40:00.000+00:00'),
        (536,3223,5,'2022-03-02T09:33:28.000+00:00'),
        (536, 3223, 6,'2022-01-11T12:33:44.000+00:00'),
        (827, 2452, 45,'2022-03-02T00:00:00.000+00:00'), 
        (827, 3585, 35,'2022-02-20T14:05:26.000+00:00')]
df = spark.createDataFrame(data = data, schema=['uid', 'pid', 'qunt', 'pur_dt'])
df.show()

+---+----+----+--------------------+
|uid| pid|qunt|              pur_dt|
+---+----+----+--------------------+
|333|1122|   9|2022-02-06T01:00:...|
|333|1122|  10|2022-02-06T02:00:...|
|536|1435|  10|2022-03-02T08:40:...|
|536|3223|   5|2022-03-02T09:33:...|
|536|3223|   6|2022-01-11T12:33:...|
|827|2452|  45|2022-03-02T00:00:...|
|827|3585|  35|2022-02-20T14:05:...|
+---+----+----+--------------------+



In [7]:
df1 = df.withColumn('dt', to_date('pur_dt')).groupBy('uid', 'pid', 'dt').count()
df1.show()

+---+----+----------+-----+
|uid| pid|        dt|count|
+---+----+----------+-----+
|333|1122|2022-02-06|    2|
|536|1435|2022-03-02|    1|
|536|3223|2022-03-02|    1|
|536|3223|2022-01-11|    1|
|827|2452|2022-03-02|    1|
|827|3585|2022-02-20|    1|
+---+----+----------+-----+



In [8]:
df1.groupBy('uid', 'pid').count().filter(col('count') >= 2).show()

+---+----+-----+
|uid| pid|count|
+---+----+-----+
|536|3223|    2|
+---+----+-----+



In [4]:
# In the given dataset, names contain for some names and space for some names, extract the first name and last name 

data = [(1, 'sagar-prajapati'), (2, 'alex-john'), (3, 'john cena'), (4, 'kim joe')]
schema = ['Id','name']
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=True)

+---+---------------+
| Id|           name|
+---+---------------+
|  1|sagar-prajapati|
|  2|      alex-john|
|  3|      john cena|
|  4|        kim joe|
+---+---------------+



In [5]:
replace_regexp = "((,)?\s|[-])"
df1 = df.withColumn("A", regexp_replace(col("name"), replace_regexp, " "))
df1.show()

+---+---------------+---------------+
| Id|           name|              A|
+---+---------------+---------------+
|  1|sagar-prajapati|sagar prajapati|
|  2|      alex-john|      alex john|
|  3|      john cena|      john cena|
|  4|        kim joe|        kim joe|
+---+---------------+---------------+



In [9]:
df2 = df1.withColumn('First_name', split(df1['A'], ' ').getItem(0)).withColumn('Last_Name', split(df1['A'], ' ').getItem(1))
df2.drop('A').show()

+---+---------------+----------+---------+
| Id|           name|First_name|Last_Name|
+---+---------------+----------+---------+
|  1|sagar-prajapati|     sagar|prajapati|
|  2|      alex-john|      alex|     john|
|  3|      john cena|      john|     cena|
|  4|        kim joe|       kim|      joe|
+---+---------------+----------+---------+



In [31]:
split_regex = "((,)?\s|[-])"
df1 = df.withColumn("A", split(col("name"), split_regex))
df1.show()

+---+---------------+------------------+
| Id|           name|                 A|
+---+---------------+------------------+
|  1|sagar-prajapati|[sagar, prajapati]|
|  2|      alex-john|      [alex, john]|
|  3|      john cena|      [john, cena]|
|  4|        kim joe|        [kim, joe]|
+---+---------------+------------------+



In [32]:
# call_duration 

data = [(10, 20, 58), (20,10,12), (10,30, 20),(30,40,100),(30, 40, 200), (30, 40, 200), (40, 30, 500)]
df = spark.createDataFrame(data = data, schema=['person1', 'person2', 'call_duration'])
df.show()

+-------+-------+-------------+
|person1|person2|call_duration|
+-------+-------+-------------+
|     10|     20|           58|
|     20|     10|           12|
|     10|     30|           20|
|     30|     40|          100|
|     30|     40|          200|
|     30|     40|          200|
|     40|     30|          500|
+-------+-------+-------------+



In [33]:
df1 = df.alias('t1').unionAll(df.alias('t2')).filter(col('person1') < col('person2'))
df1.show()

+-------+-------+-------------+
|person1|person2|call_duration|
+-------+-------+-------------+
|     10|     20|           58|
|     10|     30|           20|
|     30|     40|          100|
|     30|     40|          200|
|     30|     40|          200|
|     10|     20|           58|
|     10|     30|           20|
|     30|     40|          100|
|     30|     40|          200|
|     30|     40|          200|
+-------+-------+-------------+



In [34]:
df2 = df1.groupBy('person1','person2').agg(count(col('call_duration')).alias('call_count'), sum(col('call_duration')).alias('total_duration'))
df2.show()

+-------+-------+----------+--------------+
|person1|person2|call_count|total_duration|
+-------+-------+----------+--------------+
|     10|     20|         2|           116|
|     10|     30|         2|            40|
|     30|     40|         6|          1000|
+-------+-------+----------+--------------+



In [35]:
# Select the teachers who teaches only math and not any other subject 

data = [(1, "MATH"), (2,'MATH'), (4, 'CHEM'),(5, 'MATH'),(2, 'ENG'), (3, 'PHY')]
df = spark.createDataFrame(data = data, schema=['id', 'sub'])
df.show()

+---+----+
| id| sub|
+---+----+
|  1|MATH|
|  2|MATH|
|  4|CHEM|
|  5|MATH|
|  2| ENG|
|  3| PHY|
+---+----+



In [36]:
df1 = df.groupBy('id').count().filter(col('count') == 1)
df1.show()

+---+-----+
| id|count|
+---+-----+
|  1|    1|
|  4|    1|
|  5|    1|
|  3|    1|
+---+-----+



In [37]:
df.join(df1, df.id == df1.id, how = 'inner').filter(df['sub'] == 'MATH').select(df['*']).show()

+---+----+
| id| sub|
+---+----+
|  1|MATH|
|  5|MATH|
+---+----+



In [38]:
# Find out the companies where revenue has only increased over the years and there was no decrease at all for any point.

data = [('ABC', 2000, 100),
('ABC', 2001, 110),
('ABC', 2002, 120),
('XYZ', 2000, 100),
('XYZ', 2001, 90),
('XYZ', 2002, 120),
('RXC', 2000, 500),
('RXC', 2001, 400),
('RXC', 2002, 600),
('RXC', 2003, 800)]
schema = StructType([StructField('COMPANY', StringType(), True),
                     StructField('YEAR', IntegerType(), True),
                     StructField('REVENUE', IntegerType(), True)]) 
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+-------+----+-------+
|COMPANY|YEAR|REVENUE|
+-------+----+-------+
|    ABC|2000|    100|
|    ABC|2001|    110|
|    ABC|2002|    120|
|    XYZ|2000|    100|
|    XYZ|2001|     90|
|    XYZ|2002|    120|
|    RXC|2000|    500|
|    RXC|2001|    400|
|    RXC|2002|    600|
|    RXC|2003|    800|
+-------+----+-------+



In [39]:
from pyspark.sql.window import Window
window = Window.partitionBy('COMPANY').orderBy('YEAR')
df1 = df.withColumn('lag', col('REVENUE')-lag(col('REVENUE'), 1, 0).over(window))
df1.show()

+-------+----+-------+----+
|COMPANY|YEAR|REVENUE| lag|
+-------+----+-------+----+
|    ABC|2000|    100| 100|
|    ABC|2001|    110|  10|
|    ABC|2002|    120|  10|
|    RXC|2000|    500| 500|
|    RXC|2001|    400|-100|
|    RXC|2002|    600| 200|
|    RXC|2003|    800| 200|
|    XYZ|2000|    100| 100|
|    XYZ|2001|     90| -10|
|    XYZ|2002|    120|  30|
+-------+----+-------+----+



In [41]:
df2 = df1.groupBy('COMPANY').agg(min(col('lag')).alias('diff')).filter(col('diff') > 0)
df2.show()

+-------+----+
|COMPANY|diff|
+-------+----+
|    ABC|  10|
+-------+----+



In [42]:
# lIst down the movies with an odd ID and which is not boring and order by id desc 

data = [(1, 'war', 'great ed',8.9),
    (2,'science','fiction',8.5),
    (3,'irish', 'boring', 6.2),
    (4, 'Ice song', 'fantacy', 8.6),
    (5, "house card", 'interesting', 9.1)]
sch = ['ID', 'Movie', 'Type', 'Rating']
df = spark.createDataFrame(data=data, schema=sch)
df.show()

+---+----------+-----------+------+
| ID|     Movie|       Type|Rating|
+---+----------+-----------+------+
|  1|       war|   great ed|   8.9|
|  2|   science|    fiction|   8.5|
|  3|     irish|     boring|   6.2|
|  4|  Ice song|    fantacy|   8.6|
|  5|house card|interesting|   9.1|
+---+----------+-----------+------+



In [43]:
df1 = df.filter(((df['ID'] %2 ) != 0 ) & (col('Type') != 'boring')).orderBy(col('ID').desc())
df1.show()

+---+----------+-----------+------+
| ID|     Movie|       Type|Rating|
+---+----------+-----------+------+
|  5|house card|interesting|   9.1|
|  1|       war|   great ed|   8.9|
+---+----------+-----------+------+



In [44]:
# Find the employees earning more than managers 

data = [(1, "John", 6000, 4), (2,'Kevin',11000,4), (3, 'Bob',8000, 5),(4, 'Laura',9000,None),(5, 'Sarah',10000, None)]
df = spark.createDataFrame(data = data, schema=['id', 'name', 'salary','mid'])
df.show()

+---+-----+------+----+
| id| name|salary| mid|
+---+-----+------+----+
|  1| John|  6000|   4|
|  2|Kevin| 11000|   4|
|  3|  Bob|  8000|   5|
|  4|Laura|  9000|NULL|
|  5|Sarah| 10000|NULL|
+---+-----+------+----+



In [45]:
df.alias('emp').join(df.alias('mgr'), col('emp.mid')  == col('mgr.id'), 'inner').filter(col('emp.salary') > col('mgr.salary'))\
.select(col('emp.id'), col('emp.name'), col('emp.salary'), col('emp.mid')).show()

+---+-----+------+---+
| id| name|salary|mid|
+---+-----+------+---+
|  2|Kevin| 11000|  4|
+---+-----+------+---+



In [28]:
# Remove special charactors

data = [['Mavs^', 18], 
        ['Ne%ts', 33], 
        ['Hawk**s', 12], 
        ['Mavs@', 15], 
        ['Hawks!', 19],
        ['(Cavs)', 24],
        ['Magic', 28]] 
columns = ['team', 'points'] 
df = spark.createDataFrame(data, columns) 
df.show()

+-------+------+
|   team|points|
+-------+------+
|  Mavs^|    18|
|  Ne%ts|    33|
|Hawk**s|    12|
|  Mavs@|    15|
| Hawks!|    19|
| (Cavs)|    24|
|  Magic|    28|
+-------+------+



In [30]:
df1 = df.withColumn('team2', regexp_replace('team', '[^a-zA-Z0-9]', ''))
df1.show()

+-------+------+-----+
|   team|points|team2|
+-------+------+-----+
|  Mavs^|    18| Mavs|
|  Ne%ts|    33| Nets|
|Hawk**s|    12|Hawks|
|  Mavs@|    15| Mavs|
| Hawks!|    19|Hawks|
| (Cavs)|    24| Cavs|
|  Magic|    28|Magic|
+-------+------+-----+



In [31]:
df1.withColumn('count_of_spe_char',length(col('team')) - length(col('team2'))).show()

+-------+------+-----+-----------------+
|   team|points|team2|count_of_spe_char|
+-------+------+-----+-----------------+
|  Mavs^|    18| Mavs|                1|
|  Ne%ts|    33| Nets|                1|
|Hawk**s|    12|Hawks|                2|
|  Mavs@|    15| Mavs|                1|
| Hawks!|    19|Hawks|                1|
| (Cavs)|    24| Cavs|                2|
|  Magic|    28|Magic|                0|
+-------+------+-----+-----------------+



https://www.linkedin.com/company/seekho-bigdata-institute/posts/?feedView=all

29 The columns contain different types of data, including numeric, categorical, and string values. Your objective is to:

1. Fill numeric columns with the median value.
2. Fill categorical columns with the most frequent value.
3. Fill string columns with "Unknown".

In [48]:
data = [ (1, 25, 'North', 'M', '2025-01-01', 150),
(2, None, 'East', None, '2025-01-02', None),
(3, 30, 'South', 'F', None, 200),
(4, 22, None, 'M', '2025-01-03', 180),
(5, 28, 'West', 'F', None, None), ]
schema = StructType([StructField('Customer_ID', IntegerType
                                 (), True),
                    StructField('Age', IntegerType(), True),
                    StructField('Region', StringType(), True),
                    StructField('Gender', StringType(), True),
                     StructField('Last_Visit', StringType(), True),
                     StructField('Purchase_Amount', IntegerType(), True),
                    ])

#columns = ['Customer_ID', 'Age', 'Region', 'Gender', 'Last_Visit', 'Purchase_Amount'] 
df = spark.createDataFrame(data, schema)
print(df.printSchema())
df.show()

root
 |-- Customer_ID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Last_Visit: string (nullable = true)
 |-- Purchase_Amount: integer (nullable = true)

None
+-----------+----+------+------+----------+---------------+
|Customer_ID| Age|Region|Gender|Last_Visit|Purchase_Amount|
+-----------+----+------+------+----------+---------------+
|          1|  25| North|     M|2025-01-01|            150|
|          2|NULL|  East|  NULL|2025-01-02|           NULL|
|          3|  30| South|     F|      NULL|            200|
|          4|  22|  NULL|     M|2025-01-03|            180|
|          5|  28|  West|     F|      NULL|           NULL|
+-----------+----+------+------+----------+---------------+



In [50]:
# Define a function to fill missing values dynamically
def fill_missing_values(df):
    
    column_types = df.dtypes
    print('column_types: ', column_types)
    
    # loop through each column based on type
    for column, dtype in column_types:
        
        if dtype == 'int' or dtype == 'double' or dtype == 'long':
            median_value = df.approxQuantile(column, [0.5], 0)[0]
            df = df.fillna({column: median_value})
            
        elif dtype == 'string':
            df = df.fillna({column: 'Unknown'})
            
        else:
            df = df.fillna({column: 'Unknown'})
        
    return df
filled_df = fill_missing_values(df)
filled_df.show()

column_types:  [('Customer_ID', 'int'), ('Age', 'int'), ('Region', 'string'), ('Gender', 'string'), ('Last_Visit', 'string'), ('Purchase_Amount', 'int')]
+-----------+---+-------+-------+----------+---------------+
|Customer_ID|Age| Region| Gender|Last_Visit|Purchase_Amount|
+-----------+---+-------+-------+----------+---------------+
|          1| 25|  North|      M|2025-01-01|            150|
|          2| 25|   East|Unknown|2025-01-02|            180|
|          3| 30|  South|      F|   Unknown|            200|
|          4| 22|Unknown|      M|2025-01-03|            180|
|          5| 28|   West|      F|   Unknown|            180|
+-----------+---+-------+-------+----------+---------------+



𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 34

You are given a dataset of sales transactions for multiple stores and products.
- Calculate the percentage contribution of each product's sales to the total sales of its store.

In [4]:
data = [ ("S1", "P1", 100), ("S1", "P2", 200),
("S1", "P3", 300), ("S2", "P1", 400),
("S2", "P2", 100), ("S2", "P3", 500) ]
columns = ["StoreID", "Product", "Sales"] 
df = spark.createDataFrame(data, columns)
df.show()

+-------+-------+-----+
|StoreID|Product|Sales|
+-------+-------+-----+
|     S1|     P1|  100|
|     S1|     P2|  200|
|     S1|     P3|  300|
|     S2|     P1|  400|
|     S2|     P2|  100|
|     S2|     P3|  500|
+-------+-------+-----+



In [5]:
df1 = df.groupBy('StoreID').agg(sum('Sales').alias('Total_Sales'))
df1.show()

+-------+-----------+
|StoreID|Total_Sales|
+-------+-----------+
|     S1|        600|
|     S2|       1000|
+-------+-----------+



In [7]:
join_df = df.join(df1, on='StoreID', how= 'inner')
join_df.show()

+-------+-------+-----+-----------+
|StoreID|Product|Sales|Total_Sales|
+-------+-------+-----+-----------+
|     S1|     P1|  100|        600|
|     S1|     P2|  200|        600|
|     S1|     P3|  300|        600|
|     S2|     P1|  400|       1000|
|     S2|     P2|  100|       1000|
|     S2|     P3|  500|       1000|
+-------+-------+-----+-----------+



In [8]:
join_df.withColumn('percnt', round((col('Sales') / col('Total_sales'))*100,2)).show()

+-------+-------+-----+-----------+------+
|StoreID|Product|Sales|Total_Sales|percnt|
+-------+-------+-----+-----------+------+
|     S1|     P1|  100|        600| 16.67|
|     S1|     P2|  200|        600| 33.33|
|     S1|     P3|  300|        600|  50.0|
|     S2|     P1|  400|       1000|  40.0|
|     S2|     P2|  100|       1000|  10.0|
|     S2|     P3|  500|       1000|  50.0|
+-------+-------+-----+-----------+------+



𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 35

You are working as a Data Engineer at a retail company. The marketing team has provided a dataset of customer purchases to analyze the relationship between the amount spent on advertisements and the revenue generated. 
- Using PySpark, compute the correlation between the "Ad_Spend" and "Revenue" columns to determine if there's a linear relationship.

In [9]:
schema = StructType([ StructField("Customer_ID", StringType(), True), StructField("Ad_Spend", IntegerType(), True), StructField("Revenue", IntegerType(), True) ])
data = [ ("C001", 2000, 25000), ("C002", 1500, 23000),
("C003", 3000, 40000), ("C004", 1200, 18000),
("C005", 2500, 30000) ] 
df = spark.createDataFrame(data, schema)
df.show() 

+-----------+--------+-------+
|Customer_ID|Ad_Spend|Revenue|
+-----------+--------+-------+
|       C001|    2000|  25000|
|       C002|    1500|  23000|
|       C003|    3000|  40000|
|       C004|    1200|  18000|
|       C005|    2500|  30000|
+-----------+--------+-------+



In [11]:
correlation = df.stat.corr('Ad_Spend', 'Revenue')
print('The correlation between ad_spend and revenue is:' ,correlation)

The correlation between ad_spend and revenue is: 0.9704535552410213


𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 36

You are given a large e-commerce transaction dataset stored in a partitioned format based on country. 
- Count the distinct number of products purchased (product_id) for each customer_id in every country. The result should include the country, customer ID, and the distinct product count.

In [12]:
data = [ ("USA", 101, "P001"), 
("USA", 101, "P002"), ("USA", 101, "P001"), 
("USA", 102, "P003"), ("USA", 102, "P003"), 
("UK", 201, "P004"), ("UK", 201, "P005"), 
("UK", 202, "P004"), ("UK", 202, "P005"), ("UK", 202, "P004") ]

columns = ["country", "customer_id", "product_id"]
df = spark.createDataFrame(data, columns)
df.show() 

+-------+-----------+----------+
|country|customer_id|product_id|
+-------+-----------+----------+
|    USA|        101|      P001|
|    USA|        101|      P002|
|    USA|        101|      P001|
|    USA|        102|      P003|
|    USA|        102|      P003|
|     UK|        201|      P004|
|     UK|        201|      P005|
|     UK|        202|      P004|
|     UK|        202|      P005|
|     UK|        202|      P004|
+-------+-----------+----------+



In [13]:
df.groupBy('country', 'customer_id').agg(countDistinct(col('product_id'))).show()

+-------+-----------+--------------------------+
|country|customer_id|count(DISTINCT product_id)|
+-------+-----------+--------------------------+
|    USA|        101|                         2|
|     UK|        202|                         2|
|     UK|        201|                         2|
|    USA|        102|                         1|
+-------+-----------+--------------------------+



$Broadcast$ the smaller DataFrame (product_data). 37

In [37]:
sales_data = [ (1, 101, 5, '2025-01-01'), (2, 102, 3, '2025-01-02'),
(3, 103, 2, '2025-01-03'), (4, 101, 1, '2025-01-04'),
(5, 104, 4, '2025-01-05'), (6, 105, 6, '2025-01-06'), ]

# product_data (Small DataFrame)

product_data = [ (101, 'Laptop', 'Electronics', 1000),
(102, 'Phone', 'Electronics', 500),
(103, 'Headphones', 'Accessories', 150),
(104, 'Tablet', 'Electronics', 600),
(105, 'Smartwatch', 'Accessories', 200), ] 

sales_df = spark.createDataFrame(sales_data, ['sale_id', 'product_id','quantity', 'sale_date'])
product_df= spark.createDataFrame(product_data, ['product_id', 'product_name', 'category', 'price'])
sales_df.show()
product_df.show()

+-------+----------+--------+----------+
|sale_id|product_id|quantity| sale_date|
+-------+----------+--------+----------+
|      1|       101|       5|2025-01-01|
|      2|       102|       3|2025-01-02|
|      3|       103|       2|2025-01-03|
|      4|       101|       1|2025-01-04|
|      5|       104|       4|2025-01-05|
|      6|       105|       6|2025-01-06|
+-------+----------+--------+----------+

+----------+------------+-----------+-----+
|product_id|product_name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics| 1000|
|       102|       Phone|Electronics|  500|
|       103|  Headphones|Accessories|  150|
|       104|      Tablet|Electronics|  600|
|       105|  Smartwatch|Accessories|  200|
+----------+------------+-----------+-----+



In [39]:
from pyspark.sql.functions import broadcast

prod_df_broadcast = broadcast(product_df)

joined_df = sales_df.join(prod_df_broadcast, on='product_id', how='inner')
joined_df.show(truncate=False)


+----------+-------+--------+----------+------------+-----------+-----+
|product_id|sale_id|quantity|sale_date |product_name|category   |price|
+----------+-------+--------+----------+------------+-----------+-----+
|101       |1      |5       |2025-01-01|Laptop      |Electronics|1000 |
|102       |2      |3       |2025-01-02|Phone       |Electronics|500  |
|103       |3      |2       |2025-01-03|Headphones  |Accessories|150  |
|101       |4      |1       |2025-01-04|Laptop      |Electronics|1000 |
|104       |5      |4       |2025-01-05|Tablet      |Electronics|600  |
|105       |6      |6       |2025-01-06|Smartwatch  |Accessories|200  |
+----------+-------+--------+----------+------------+-----------+-----+



𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 39

You are working with large datasets in PySpark and need to join two DataFrames. However, one of the tables has highly skewed data, causing performance issues due to data shuffling. How would you optimize this join using salting techniques?
You are given the following sample datasets:

sales_df (Fact Table - Large Dataset, Highly Skewed on store_id)
Your task is to perform an optimized join between sales_df and store_df on store_id, ensuring that the skewness does not degrade performance.

In [32]:
sales_data = [ (101, "P001", 100), (101, "P002", 200), (101, "P003", 150), (102, "P004", 300), 
              (103, "P005", 400), (101, "P006", 500), (104, "P007", 250) ] 

sales_df = spark.createDataFrame(sales_data, ["store_id", "product_id", "amount"]) 
sales_df.show()

store_data = [(101, "Walmart"), (102, "Target"), (103, "Costco"), (104, "BestBuy")] 
store_df = spark.createDataFrame(store_data, ["store_id", "store_name"]) 
store_df.show()

+--------+----------+------+
|store_id|product_id|amount|
+--------+----------+------+
|     101|      P001|   100|
|     101|      P002|   200|
|     101|      P003|   150|
|     102|      P004|   300|
|     103|      P005|   400|
|     101|      P006|   500|
|     104|      P007|   250|
+--------+----------+------+

+--------+----------+
|store_id|store_name|
+--------+----------+
|     101|   Walmart|
|     102|    Target|
|     103|    Costco|
|     104|   BestBuy|
+--------+----------+



In [33]:
# Step 1: Adding Salt to skewed 'sales_df'

num_salt_keys = 3  # Define the range of salt keys 

sales_df_salted = sales_df.withColumn('salt', floor(rand() * num_salt_keys))\
                          .withColumn('salted_store_id', concat_ws("_", col('store_id'), col('salt')))
sales_df_salted.show()

+--------+----------+------+----+---------------+
|store_id|product_id|amount|salt|salted_store_id|
+--------+----------+------+----+---------------+
|     101|      P001|   100|   1|          101_1|
|     101|      P002|   200|   0|          101_0|
|     101|      P003|   150|   0|          101_0|
|     102|      P004|   300|   0|          102_0|
|     103|      P005|   400|   0|          103_0|
|     101|      P006|   500|   2|          101_2|
|     104|      P007|   250|   0|          104_0|
+--------+----------+------+----+---------------+



In [34]:
# Step 2: Expanding 'store_df' for Join compatibility 

expanded_store_df = store_df.crossJoin(spark.range(0, num_salt_keys).toDF('salt'))\
                    .withColumn('salted_store_id', concat_ws('_', col('store_id'), col('salt')))
expanded_store_df.show()

+--------+----------+----+---------------+
|store_id|store_name|salt|salted_store_id|
+--------+----------+----+---------------+
|     101|   Walmart|   0|          101_0|
|     101|   Walmart|   1|          101_1|
|     101|   Walmart|   2|          101_2|
|     102|    Target|   0|          102_0|
|     102|    Target|   1|          102_1|
|     102|    Target|   2|          102_2|
|     103|    Costco|   0|          103_0|
|     103|    Costco|   1|          103_1|
|     103|    Costco|   2|          103_2|
|     104|   BestBuy|   0|          104_0|
|     104|   BestBuy|   1|          104_1|
|     104|   BestBuy|   2|          104_2|
+--------+----------+----+---------------+



In [36]:
# Step 3: Performing the Optimized Join on Salted Keys

joined_df = sales_df_salted.join(expanded_store_df,'salted_store_id', 'inner').drop('salted_store_id', 'salt')
joined_df.show()

+--------+----------+------+--------+----------+
|store_id|product_id|amount|store_id|store_name|
+--------+----------+------+--------+----------+
|     101|      P002|   200|     101|   Walmart|
|     101|      P003|   150|     101|   Walmart|
|     101|      P001|   100|     101|   Walmart|
|     101|      P006|   500|     101|   Walmart|
|     102|      P004|   300|     102|    Target|
|     103|      P005|   400|     103|    Costco|
|     104|      P007|   250|     104|   BestBuy|
+--------+----------+------+--------+----------+



"""𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 40

You are working as a Data Engineer at a fintech company. Your team is working on integrating two datasets:

1. Customer Transactions Data (transactions_df) - Contains customer transactions with columns: customer_id, txn_id, amount, and txn_date. 

2. Customer Profile Data (profile_df) - Contains customer information with columns: customer_id, name, age, and txn_id (latest transaction ID for reference).

 The requirement is to merge these two DataFrames on customer_id while keeping track of:

Conflicting column names (txn_id) should be renamed properly.

If a customer exists in profile_df but not in transactions_df, the row should still be present with NULL values for transaction-related columns.

Your task is to write an optimized PySpark code to achieve this."""

In [4]:
transactions_data = [ (101, "T001", 500, "2024-08-10"), (102, "T002", 1200, "2024-08-09"), 
                     (103, "T003", 300, "2024-08-08"), (104, "T004", 450, "2024-08-07"), ] 

profile_data = [ (101, "John", 30, "T001"), (102, "Emma", 27, "T005"), 
                (103, "Alex", 35, "T003"), (105, "Sam", 40, "T006"), ]

transactions_df = spark.createDataFrame(transactions_data, ["customer_id", "txn_id", "amount", "txn_date"])
transactions_df.show()

profile_df = spark.createDataFrame(profile_data, ["customer_id", "name", "age", "txn_id"])
profile_df.show()

+-----------+------+------+----------+
|customer_id|txn_id|amount|  txn_date|
+-----------+------+------+----------+
|        101|  T001|   500|2024-08-10|
|        102|  T002|  1200|2024-08-09|
|        103|  T003|   300|2024-08-08|
|        104|  T004|   450|2024-08-07|
+-----------+------+------+----------+

+-----------+----+---+------+
|customer_id|name|age|txn_id|
+-----------+----+---+------+
|        101|John| 30|  T001|
|        102|Emma| 27|  T005|
|        103|Alex| 35|  T003|
|        105| Sam| 40|  T006|
+-----------+----+---+------+



In [7]:
profile_df = profile_df.withColumnRenamed('txn_id', 'last_txn_id')
profile_df.show()

+-----------+----+---+-----------+
|customer_id|name|age|last_txn_id|
+-----------+----+---+-----------+
|        101|John| 30|       T001|
|        102|Emma| 27|       T005|
|        103|Alex| 35|       T003|
|        105| Sam| 40|       T006|
+-----------+----+---+-----------+



In [8]:
profile_df.join(transactions_df, on = "customer_id", how = "full_outer").show()

+-----------+----+----+-----------+------+------+----------+
|customer_id|name| age|last_txn_id|txn_id|amount|  txn_date|
+-----------+----+----+-----------+------+------+----------+
|        101|John|  30|       T001|  T001|   500|2024-08-10|
|        102|Emma|  27|       T005|  T002|  1200|2024-08-09|
|        103|Alex|  35|       T003|  T003|   300|2024-08-08|
|        104|NULL|NULL|       NULL|  T004|   450|2024-08-07|
|        105| Sam|  40|       T006|  NULL|  NULL|      NULL|
+-----------+----+----+-----------+------+------+----------+



𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 42

You are given an employee dataset containing information about employees and their managers. Each employee has a manager_id that refers to another employee in the same table. Your task is to use self-join to find hierarchical relationships between employees, such as finding all employees under a specific manager or the reporting hierarchy of an employee.

Interview Task
- Write a PySpark self-join query to find the direct reports of each manager. Additionally, extend the logic to find all hierarchical relationships up to any level.


In [9]:
data = [ (1, "Alice", None), (2, "Bob", 1),
(3, "Charlie", 1), (4, "David", 2),
(5, "Eva", 2), (6, "Frank", 3), (7, "Grace", 3) ]

columns = ["employee_id", "employee_name", "manager_id"]
df = spark.createDataFrame(data, columns)
df.show()

+-----------+-------------+----------+
|employee_id|employee_name|manager_id|
+-----------+-------------+----------+
|          1|        Alice|      NULL|
|          2|          Bob|         1|
|          3|      Charlie|         1|
|          4|        David|         2|
|          5|          Eva|         2|
|          6|        Frank|         3|
|          7|        Grace|         3|
+-----------+-------------+----------+



In [17]:
df.alias('emp').join(df.alias('mgr'), col('mgr.employee_id') == col('emp.manager_id'), 'left')\
           .select(col('mgr.employee_name'), col('emp.employee_name')).show()

+-------------+-------------+
|employee_name|employee_name|
+-------------+-------------+
|         NULL|        Alice|
|        Alice|          Bob|
|        Alice|      Charlie|
|          Bob|        David|
|          Bob|          Eva|
|      Charlie|        Frank|
|      Charlie|        Grace|
+-------------+-------------+



𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 43

You are working as a Data Engineer, and the company has a log system where timestamps are recorded for every user action (e.g., when the user logs in and logs out). Your manager wants to know how much time each user spends between log in and log out.

calculate the difference between the logout_timestamp and login_timestamp in hours, minutes, and seconds. The result should be formatted like "HH:mm:ss".


In [14]:
data = [ (1, "2025-01-31 08:00:00", "2025-01-31 10:30:45"),
(2, "2025-01-31 09:00:30", "2025-01-31 12:15:10"),
(3, "2025-01-31 07:45:00", "2025-01-31 09:00:15") ]

columns = ["user_id", "login_timestamp", "logout_timestamp"] 
df = spark.createDataFrame(data, columns)
df.show()

+-------+-------------------+-------------------+
|user_id|    login_timestamp|   logout_timestamp|
+-------+-------------------+-------------------+
|      1|2025-01-31 08:00:00|2025-01-31 10:30:45|
|      2|2025-01-31 09:00:30|2025-01-31 12:15:10|
|      3|2025-01-31 07:45:00|2025-01-31 09:00:15|
+-------+-------------------+-------------------+



In [15]:
df = df.withColumn('login_time', unix_timestamp('login_timestamp'))
df = df.withColumn('logout_time', unix_timestamp('logout_timestamp'))
df.show()

+-------+-------------------+-------------------+----------+-----------+
|user_id|    login_timestamp|   logout_timestamp|login_time|logout_time|
+-------+-------------------+-------------------+----------+-----------+
|      1|2025-01-31 08:00:00|2025-01-31 10:30:45|1738290600| 1738299645|
|      2|2025-01-31 09:00:30|2025-01-31 12:15:10|1738294230| 1738305910|
|      3|2025-01-31 07:45:00|2025-01-31 09:00:15|1738289700| 1738294215|
+-------+-------------------+-------------------+----------+-----------+



In [16]:
# Calculate difference 

df = df.withColumn('duration_seconds', col('logout_time')-col('login_time'))
df.show()

+-------+-------------------+-------------------+----------+-----------+----------------+
|user_id|    login_timestamp|   logout_timestamp|login_time|logout_time|duration_seconds|
+-------+-------------------+-------------------+----------+-----------+----------------+
|      1|2025-01-31 08:00:00|2025-01-31 10:30:45|1738290600| 1738299645|            9045|
|      2|2025-01-31 09:00:30|2025-01-31 12:15:10|1738294230| 1738305910|           11680|
|      3|2025-01-31 07:45:00|2025-01-31 09:00:15|1738289700| 1738294215|            4515|
+-------+-------------------+-------------------+----------+-----------+----------------+



In [17]:
# Calculate hours, minutes, and seconds

df = df.withColumn('hours', (col('duration_seconds') / 3600).cast('int'))
df = df.withColumn('minutes', ((col('duration_seconds') % 3600) / 60).cast('int'))
df = df.withColumn('seconds', (col('duration_seconds') % 60).cast('int'))
df.show()

+-------+-------------------+-------------------+----------+-----------+----------------+-----+-------+-------+
|user_id|    login_timestamp|   logout_timestamp|login_time|logout_time|duration_seconds|hours|minutes|seconds|
+-------+-------------------+-------------------+----------+-----------+----------------+-----+-------+-------+
|      1|2025-01-31 08:00:00|2025-01-31 10:30:45|1738290600| 1738299645|            9045|    2|     30|     45|
|      2|2025-01-31 09:00:30|2025-01-31 12:15:10|1738294230| 1738305910|           11680|    3|     14|     40|
|      3|2025-01-31 07:45:00|2025-01-31 09:00:15|1738289700| 1738294215|            4515|    1|     15|     15|
+-------+-------------------+-------------------+----------+-----------+----------------+-----+-------+-------+



In [18]:
df = df.withColumn('formatted_duration', expr("lpad(hours, 2,'0') ||':' || lpad(minutes, 2,'0') ||':' || lpad(seconds, 2,'0')"))
df.show()

+-------+-------------------+-------------------+----------+-----------+----------------+-----+-------+-------+------------------+
|user_id|    login_timestamp|   logout_timestamp|login_time|logout_time|duration_seconds|hours|minutes|seconds|formatted_duration|
+-------+-------------------+-------------------+----------+-----------+----------------+-----+-------+-------+------------------+
|      1|2025-01-31 08:00:00|2025-01-31 10:30:45|1738290600| 1738299645|            9045|    2|     30|     45|          02:30:45|
|      2|2025-01-31 09:00:30|2025-01-31 12:15:10|1738294230| 1738305910|           11680|    3|     14|     40|          03:14:40|
|      3|2025-01-31 07:45:00|2025-01-31 09:00:15|1738289700| 1738294215|            4515|    1|     15|     15|          01:15:15|
+-------+-------------------+-------------------+----------+-----------+----------------+-----+-------+-------+------------------+



In [19]:
df.select('user_id', 'formatted_duration').show(truncate=False)

+-------+------------------+
|user_id|formatted_duration|
+-------+------------------+
|1      |02:30:45          |
|2      |03:14:40          |
|3      |01:15:15          |
+-------+------------------+



𝐐𝐮𝐞𝐬𝐭𝐢𝐨𝐧 44 

You have a dataset of user activities in an e-commerce application, where each row represents an activity performed by a user. The dataset contains duplicate activity entries (based on user and activity type) and you need to remove the duplicates. Furthermore, you want to keep only the most recent record for each user, based on a timestamp column.

Problem
- Remove duplicates based on user_id and activity_type.
- Keep only the most recent activity_timestamp for each user and activity type combination.

In [9]:
data = [ (1, 'login', '2025-02-01 10:00:00'), (1, 'view_product', '2025-02-01 10:05:00'), 
        (1, 'login', '2025-02-01 10:30:00'), (2, 'purchase', '2025-02-01 11:00:00'), (2, 'login', '2025-02-01 11:15:00'), 
(2, 'view_product', '2025-02-01 11:30:00'), (3, 'login', '2025-02-01 12:00:00'), (3, 'login', '2025-02-01 12:05:00') ]
 
df = spark.createDataFrame(data, ["user_id", "activity_type", "activity_timestamp"])
df.show()
df.printSchema()

+-------+-------------+-------------------+
|user_id|activity_type| activity_timestamp|
+-------+-------------+-------------------+
|      1|        login|2025-02-01 10:00:00|
|      1| view_product|2025-02-01 10:05:00|
|      1|        login|2025-02-01 10:30:00|
|      2|     purchase|2025-02-01 11:00:00|
|      2|        login|2025-02-01 11:15:00|
|      2| view_product|2025-02-01 11:30:00|
|      3|        login|2025-02-01 12:00:00|
|      3|        login|2025-02-01 12:05:00|
+-------+-------------+-------------------+

root
 |-- user_id: long (nullable = true)
 |-- activity_type: string (nullable = true)
 |-- activity_timestamp: string (nullable = true)



In [11]:
df = df.withColumn('activity_timestamp', col('activity_timestamp').cast('timestamp'))
df.show()

+-------+-------------+-------------------+
|user_id|activity_type| activity_timestamp|
+-------+-------------+-------------------+
|      1|        login|2025-02-01 10:00:00|
|      1| view_product|2025-02-01 10:05:00|
|      1|        login|2025-02-01 10:30:00|
|      2|     purchase|2025-02-01 11:00:00|
|      2|        login|2025-02-01 11:15:00|
|      2| view_product|2025-02-01 11:30:00|
|      3|        login|2025-02-01 12:00:00|
|      3|        login|2025-02-01 12:05:00|
+-------+-------------+-------------------+



In [21]:
window_spec = Window.partitionBy('user_id', 'activity_type').orderBy(col('activity_timestamp').desc())
# Add a row number to each partition
df_with_row_num = df.withColumn('row_num', row_number().over(window_spec))
df_with_row_num.show()

+-------+-------------+-------------------+-------+
|user_id|activity_type| activity_timestamp|row_num|
+-------+-------------+-------------------+-------+
|      1|        login|2025-02-01 10:30:00|      1|
|      1|        login|2025-02-01 10:00:00|      2|
|      1| view_product|2025-02-01 10:05:00|      1|
|      2|        login|2025-02-01 11:15:00|      1|
|      2|     purchase|2025-02-01 11:00:00|      1|
|      2| view_product|2025-02-01 11:30:00|      1|
|      3|        login|2025-02-01 12:05:00|      1|
|      3|        login|2025-02-01 12:00:00|      2|
+-------+-------------+-------------------+-------+



In [22]:
df_filtered = df_with_row_num.filter(col('row_num') == 1).drop('row_num')
df_filtered.show()

+-------+-------------+-------------------+
|user_id|activity_type| activity_timestamp|
+-------+-------------+-------------------+
|      1|        login|2025-02-01 10:30:00|
|      1| view_product|2025-02-01 10:05:00|
|      2|        login|2025-02-01 11:15:00|
|      2|     purchase|2025-02-01 11:00:00|
|      2| view_product|2025-02-01 11:30:00|
|      3|        login|2025-02-01 12:05:00|
+-------+-------------+-------------------+

