https://pravash-techie.medium.com/pyspark-interview-questions-coding-part-1-b4b7cea4d2f5

In [2]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 

spark = SparkSession.builder.appName('Med').getOrCreate()
spark

In [5]:
from pyspark.sql.window import Window

### Q1. ClickStream
Given a clickstream of user activity data , find the relevant user session for each click event.

In [10]:
schema = "click_time STRING, user_id STRING" 

data = [
    ("2018-01-01 11:00:00", "u1"),
    ("2018-01-01 12:00:00", "u1"),
    ("2018-01-01 13:00:00", "u1"),
    ("2018-01-01 13:00:00", "u1"),
    ("2018-01-01 14:00:00", "u1"),
    ("2018-01-01 15:00:00", "u1"),
    ("2018-01-01 11:00:00", "u2"),
    ("2018-01-02 11:00:00", "u2")
]
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show()

root
 |-- click_time: string (nullable = true)
 |-- user_id: string (nullable = true)

+-------------------+-------+
|         click_time|user_id|
+-------------------+-------+
|2018-01-01 11:00:00|     u1|
|2018-01-01 12:00:00|     u1|
|2018-01-01 13:00:00|     u1|
|2018-01-01 13:00:00|     u1|
|2018-01-01 14:00:00|     u1|
|2018-01-01 15:00:00|     u1|
|2018-01-01 11:00:00|     u2|
|2018-01-02 11:00:00|     u2|
+-------------------+-------+



In [12]:
# Convert click_time to Unix timestamp for easier calculations 

df = df.withColumn("click_timestamp", unix_timestamp("click_time"))
df.show()

+-------------------+-------+---------------+
|         click_time|user_id|click_timestamp|
+-------------------+-------+---------------+
|2018-01-01 11:00:00|     u1|     1514784600|
|2018-01-01 12:00:00|     u1|     1514788200|
|2018-01-01 13:00:00|     u1|     1514791800|
|2018-01-01 13:00:00|     u1|     1514791800|
|2018-01-01 14:00:00|     u1|     1514795400|
|2018-01-01 15:00:00|     u1|     1514799000|
|2018-01-01 11:00:00|     u2|     1514784600|
|2018-01-02 11:00:00|     u2|     1514871000|
+-------------------+-------+---------------+



In [14]:
window = Window.partitionBy('user_id').orderBy('click_timestamp')

# Getting the previous row value using lag

df = df.withColumn("prev_click_timestamp", lag('click_timestamp', 1).over(window))
df.show()

+-------------------+-------+---------------+--------------------+
|         click_time|user_id|click_timestamp|prev_click_timestamp|
+-------------------+-------+---------------+--------------------+
|2018-01-01 11:00:00|     u1|     1514784600|                NULL|
|2018-01-01 12:00:00|     u1|     1514788200|          1514784600|
|2018-01-01 13:00:00|     u1|     1514791800|          1514788200|
|2018-01-01 13:00:00|     u1|     1514791800|          1514791800|
|2018-01-01 14:00:00|     u1|     1514795400|          1514791800|
|2018-01-01 15:00:00|     u1|     1514799000|          1514795400|
|2018-01-01 11:00:00|     u2|     1514784600|                NULL|
|2018-01-02 11:00:00|     u2|     1514871000|          1514784600|
+-------------------+-------+---------------+--------------------+



In [17]:
# Difference between click time and dividing that with 60

df = df.withColumn("timestamp_diff", (col('click_timestamp') - col('prev_click_timestamp')) / 60)
df.show()

+-------------------+-------+---------------+--------------------+--------------+
|         click_time|user_id|click_timestamp|prev_click_timestamp|timestamp_diff|
+-------------------+-------+---------------+--------------------+--------------+
|2018-01-01 11:00:00|     u1|     1514784600|                NULL|          NULL|
|2018-01-01 12:00:00|     u1|     1514788200|          1514784600|          60.0|
|2018-01-01 13:00:00|     u1|     1514791800|          1514788200|          60.0|
|2018-01-01 13:00:00|     u1|     1514791800|          1514791800|           0.0|
|2018-01-01 14:00:00|     u1|     1514795400|          1514791800|          60.0|
|2018-01-01 15:00:00|     u1|     1514799000|          1514795400|          60.0|
|2018-01-01 11:00:00|     u2|     1514784600|                NULL|          NULL|
|2018-01-02 11:00:00|     u2|     1514871000|          1514784600|        1440.0|
+-------------------+-------+---------------+--------------------+--------------+



In [22]:
# Updating null with 0 
#df.na.fill(value = 0).show()

df = df.withColumn("timestamp_diff", when(col('timestamp_diff').isNull(), 0).otherwise(col('timestamp_diff')))
df.show()

+-------------------+-------+---------------+--------------------+--------------+
|         click_time|user_id|click_timestamp|prev_click_timestamp|timestamp_diff|
+-------------------+-------+---------------+--------------------+--------------+
|2018-01-01 11:00:00|     u1|     1514784600|                NULL|           0.0|
|2018-01-01 12:00:00|     u1|     1514788200|          1514784600|          60.0|
|2018-01-01 13:00:00|     u1|     1514791800|          1514788200|          60.0|
|2018-01-01 13:00:00|     u1|     1514791800|          1514791800|           0.0|
|2018-01-01 14:00:00|     u1|     1514795400|          1514791800|          60.0|
|2018-01-01 15:00:00|     u1|     1514799000|          1514795400|          60.0|
|2018-01-01 11:00:00|     u2|     1514784600|                NULL|           0.0|
|2018-01-02 11:00:00|     u2|     1514871000|          1514784600|        1440.0|
+-------------------+-------+---------------+--------------------+--------------+



In [26]:
# Check for new session 

df = df.withColumn("session_new", when(col("timestamp_diff") >30, 1).otherwise(0))
df.show()

+-------------------+-------+---------------+--------------------+--------------+-----------+
|         click_time|user_id|click_timestamp|prev_click_timestamp|timestamp_diff|session_new|
+-------------------+-------+---------------+--------------------+--------------+-----------+
|2018-01-01 11:00:00|     u1|     1514784600|                NULL|          NULL|          0|
|2018-01-01 12:00:00|     u1|     1514788200|          1514784600|          60.0|          1|
|2018-01-01 13:00:00|     u1|     1514791800|          1514788200|          60.0|          1|
|2018-01-01 13:00:00|     u1|     1514791800|          1514791800|           0.0|          0|
|2018-01-01 14:00:00|     u1|     1514795400|          1514791800|          60.0|          1|
|2018-01-01 15:00:00|     u1|     1514799000|          1514795400|          60.0|          1|
|2018-01-01 11:00:00|     u2|     1514784600|                NULL|          NULL|          0|
|2018-01-02 11:00:00|     u2|     1514871000|          15147

In [28]:
# New session names 

df.withColumn("session_new_name", concat(col('user_id'), lit("--S"), sum(col('session_new')).over(window))).show()

+-------------------+-------+---------------+--------------------+--------------+-----------+----------------+
|         click_time|user_id|click_timestamp|prev_click_timestamp|timestamp_diff|session_new|session_new_name|
+-------------------+-------+---------------+--------------------+--------------+-----------+----------------+
|2018-01-01 11:00:00|     u1|     1514784600|                NULL|          NULL|          0|          u1--S0|
|2018-01-01 12:00:00|     u1|     1514788200|          1514784600|          60.0|          1|          u1--S1|
|2018-01-01 13:00:00|     u1|     1514791800|          1514788200|          60.0|          1|          u1--S2|
|2018-01-01 13:00:00|     u1|     1514791800|          1514791800|           0.0|          0|          u1--S2|
|2018-01-01 14:00:00|     u1|     1514795400|          1514791800|          60.0|          1|          u1--S3|
|2018-01-01 15:00:00|     u1|     1514799000|          1514795400|          60.0|          1|          u1--S4|
|

### Q2. Max Salary
Ask is to find the job titles of the highest-paid employees. output should include the highest-paid title or multiple titles with the same salary.

In [29]:
worker_data = [(1, 'John', 'Doe', 10000, '2023-01-01', 'Engineering'),
        (2, 'Jane', 'Smith', 12000, '2022-12-01', 'Marketing'),
        (3, 'Alice', 'Johnson', 12000, '2022-11-01', 'Engineering')]
columns = ['worker_id', 'first_name', 'last_name', 'salary', 'joining_date', 'department']
worker = spark.createDataFrame(worker_data, columns)
worker.show()

title_data = [(1, 'Engineer', '2022-01-01'),
        (2, 'Manager', '2022-01-01'),
        (3, 'Engineer', '2022-01-01')]
columns = ['worker_ref_id', 'worker_title', 'affected_from']
title = spark.createDataFrame(title_data, columns)
title.show()

+---------+----------+---------+------+------------+-----------+
|worker_id|first_name|last_name|salary|joining_date| department|
+---------+----------+---------+------+------------+-----------+
|        1|      John|      Doe| 10000|  2023-01-01|Engineering|
|        2|      Jane|    Smith| 12000|  2022-12-01|  Marketing|
|        3|     Alice|  Johnson| 12000|  2022-11-01|Engineering|
+---------+----------+---------+------+------------+-----------+

+-------------+------------+-------------+
|worker_ref_id|worker_title|affected_from|
+-------------+------------+-------------+
|            1|    Engineer|   2022-01-01|
|            2|     Manager|   2022-01-01|
|            3|    Engineer|   2022-01-01|
+-------------+------------+-------------+



In [37]:
df = worker.join(title, worker.worker_id == title.worker_ref_id)
df.show()

+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+
|worker_id|first_name|last_name|salary|joining_date| department|worker_ref_id|worker_title|affected_from|
+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+
|        1|      John|      Doe| 10000|  2023-01-01|Engineering|            1|    Engineer|   2022-01-01|
|        2|      Jane|    Smith| 12000|  2022-12-01|  Marketing|            2|     Manager|   2022-01-01|
|        3|     Alice|  Johnson| 12000|  2022-11-01|Engineering|            3|    Engineer|   2022-01-01|
+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+



In [40]:
rank_df = df.withColumn('salary_rank', rank().over(Window.orderBy(df.salary.desc())))
rank_df.show()

+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+-----------+
|worker_id|first_name|last_name|salary|joining_date| department|worker_ref_id|worker_title|affected_from|salary_rank|
+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+-----------+
|        2|      Jane|    Smith| 12000|  2022-12-01|  Marketing|            2|     Manager|   2022-01-01|          1|
|        3|     Alice|  Johnson| 12000|  2022-11-01|Engineering|            3|    Engineer|   2022-01-01|          1|
|        1|      John|      Doe| 10000|  2023-01-01|Engineering|            1|    Engineer|   2022-01-01|          3|
+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+-----------+



In [43]:
highest_paid_df = rank_df.filter(rank_df['salary_rank'] == 1)
highest_paid_df.show()

+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+-----------+
|worker_id|first_name|last_name|salary|joining_date| department|worker_ref_id|worker_title|affected_from|salary_rank|
+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+-----------+
|        2|      Jane|    Smith| 12000|  2022-12-01|  Marketing|            2|     Manager|   2022-01-01|          1|
|        3|     Alice|  Johnson| 12000|  2022-11-01|Engineering|            3|    Engineer|   2022-01-01|          1|
+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+-----------+



In [44]:
result_df = highest_paid_df.select('worker_id', 'first_name', 'last_name', 'worker_title', 'salary')\
                           .withColumnRenamed('worker_title', 'best_paid_title' )
result_df.show()

+---------+----------+---------+---------------+------+
|worker_id|first_name|last_name|best_paid_title|salary|
+---------+----------+---------+---------------+------+
|        2|      Jane|    Smith|        Manager| 12000|
|        3|     Alice|  Johnson|       Engineer| 12000|
+---------+----------+---------+---------------+------+



### Q3. Highest and Lowest Salary

In [45]:
worker_data = [
    (1, 'John', 'Doe', 5000, '2023-01-01', 'Engineering'),
    (2, 'Jane', 'Smith', 6000, '2023-01-15', 'Marketing'),
    (3, 'Alice', 'Johnson', 4500, '2023-02-05', 'Engineering')
]
title_data = [
    (1, 'Engineer', '2022-01-01'),
    (2, 'Manager', '2022-01-01'),
    (3, 'Engineer', '2022-01-01')
]
worker_columns = ['worker_id', 'first_name', 'last_name', 'salary', 'joining_date', 'department']
title_columns = ['worker_ref_id', 'worker_title', 'affected_from']
worker_df = spark.createDataFrame(worker_data, worker_columns)
title_df = spark.createDataFrame(title_data, title_columns)
worker_df.show()
title_df.show()

+---------+----------+---------+------+------------+-----------+
|worker_id|first_name|last_name|salary|joining_date| department|
+---------+----------+---------+------+------------+-----------+
|        1|      John|      Doe|  5000|  2023-01-01|Engineering|
|        2|      Jane|    Smith|  6000|  2023-01-15|  Marketing|
|        3|     Alice|  Johnson|  4500|  2023-02-05|Engineering|
+---------+----------+---------+------+------------+-----------+

+-------------+------------+-------------+
|worker_ref_id|worker_title|affected_from|
+-------------+------------+-------------+
|            1|    Engineer|   2022-01-01|
|            2|     Manager|   2022-01-01|
|            3|    Engineer|   2022-01-01|
+-------------+------------+-------------+



In [46]:
df = worker.join(title, worker.worker_id == title.worker_ref_id, 'inner')
df.show()

+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+
|worker_id|first_name|last_name|salary|joining_date| department|worker_ref_id|worker_title|affected_from|
+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+
|        1|      John|      Doe| 10000|  2023-01-01|Engineering|            1|    Engineer|   2022-01-01|
|        2|      Jane|    Smith| 12000|  2022-12-01|  Marketing|            2|     Manager|   2022-01-01|
|        3|     Alice|  Johnson| 12000|  2022-11-01|Engineering|            3|    Engineer|   2022-01-01|
+---------+----------+---------+------+------------+-----------+-------------+------------+-------------+



In [47]:
result_df = df.groupBy('worker_id','first_name','last_name',"salary", "department")\
              .agg(max('salary').alias('max_salary'),min('salary').alias('min_salary'))
result_df.show()

+---------+----------+---------+------+-----------+----------+----------+
|worker_id|first_name|last_name|salary| department|max_salary|min_salary|
+---------+----------+---------+------+-----------+----------+----------+
|        1|      John|      Doe| 10000|Engineering|     10000|     10000|
|        2|      Jane|    Smith| 12000|  Marketing|     12000|     12000|
|        3|     Alice|  Johnson| 12000|Engineering|     12000|     12000|
+---------+----------+---------+------+-----------+----------+----------+



In [48]:
result_df = result_df.withColumn('salary_type', 
                    when(result_df['salary'] == result_df['max_salary'], 'Highest Salary')
                    .when(result_df['salary'] == result_df['min_salary'], 'Lowest Salary'))
result_df.show()

+---------+----------+---------+------+-----------+----------+----------+--------------+
|worker_id|first_name|last_name|salary| department|max_salary|min_salary|   salary_type|
+---------+----------+---------+------+-----------+----------+----------+--------------+
|        1|      John|      Doe| 10000|Engineering|     10000|     10000|Highest Salary|
|        2|      Jane|    Smith| 12000|  Marketing|     12000|     12000|Highest Salary|
|        3|     Alice|  Johnson| 12000|Engineering|     12000|     12000|Highest Salary|
+---------+----------+---------+------+-----------+----------+----------+--------------+



### Q4. UNPivot the Column into Rows

In [49]:
data = [
    (123, "A", 30, 31, 32),
    (124, "B", 40, 41, 42),
    (125, "B", 50, 51, 52)
]
df = spark.createDataFrame(data, ["StudentID", "StudentName", "AScore", "BScore", "CScore"])
df.show()

+---------+-----------+------+------+------+
|StudentID|StudentName|AScore|BScore|CScore|
+---------+-----------+------+------+------+
|      123|          A|    30|    31|    32|
|      124|          B|    40|    41|    42|
|      125|          B|    50|    51|    52|
+---------+-----------+------+------+------+



In [63]:
unpvt_exp = "stack(3, 'AScore',AScore,'BScore',BScore, 'CScore', CScore) as (Subject, Score)"
unpvt_df = df.select('StudentID', 'StudentName', expr(unpvt_exp))
unpvt_df.show()

+---------+-----------+-------+-----+
|StudentID|StudentName|Subject|Score|
+---------+-----------+-------+-----+
|      123|          A| AScore|   30|
|      123|          A| BScore|   31|
|      123|          A| CScore|   32|
|      124|          B| AScore|   40|
|      124|          B| BScore|   41|
|      124|          B| CScore|   42|
|      125|          B| AScore|   50|
|      125|          B| BScore|   51|
|      125|          B| CScore|   52|
+---------+-----------+-------+-----+



In [50]:
# OR
pivot_df = df.selectExpr(
    "StudentID",
    "StudentName",
    "stack(3, 'AScore', AScore, 'BScore', BScore, 'CScore', CScore) as (Subject, Score)"
)

pivot_df.show()

+---------+-----------+-------+-----+
|StudentID|StudentName|Subject|Score|
+---------+-----------+-------+-----+
|      123|          A| AScore|   30|
|      123|          A| BScore|   31|
|      123|          A| CScore|   32|
|      124|          B| AScore|   40|
|      124|          B| BScore|   41|
|      124|          B| CScore|   42|
|      125|          B| AScore|   50|
|      125|          B| BScore|   51|
|      125|          B| CScore|   52|
+---------+-----------+-------+-----+



### Q5. Repeat IDs

In [66]:
df = spark.createDataFrame([(1,),(2,),(3,),], ['id'])
df.show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
+---+



In [68]:
odf = df.selectExpr('explode(sequence(1, id)) as id')
odf.show()

+---+
| id|
+---+
|  1|
|  1|
|  2|
|  1|
|  2|
|  3|
+---+



### Q6. Group Rows

In [69]:
data = [("alpha", "aa", 1),
        ("alpha", "aa", 2),
        ("beta", "bb", 3),
        ("beta", "bb", 5),
        ("beta", "bb", 4)]
schema = ["col1", "col2", "col3"]
df = spark.createDataFrame(data, schema=schema)
df.show()

+-----+----+----+
| col1|col2|col3|
+-----+----+----+
|alpha|  aa|   1|
|alpha|  aa|   2|
| beta|  bb|   3|
| beta|  bb|   5|
| beta|  bb|   4|
+-----+----+----+



In [70]:
df_grp = df.groupBy('col1', 'col2').agg(collect_list('col3').alias('col3_list'))
df_grp.show()

+-----+----+---------+
| col1|col2|col3_list|
+-----+----+---------+
|alpha|  aa|   [1, 2]|
| beta|  bb|[3, 5, 4]|
+-----+----+---------+



### Q8. Employee Total Hours Inside
From the below data, find the total hours employee was inside office.

In [91]:
import datetime
data = [
    (11114, datetime.datetime.strptime('2024-06-01 08:30:00.00', "%Y-%m-%d %H:%M:%S.%f"), "I"),
    (11114, datetime.datetime.strptime('2024-06-01 10:30:00.00', "%Y-%m-%d %H:%M:%S.%f"), 'O'),
    (11114, datetime.datetime.strptime('2024-06-01 11:30:00.00', "%Y-%m-%d %H:%M:%S.%f"), 'I'),
    (11114, datetime.datetime.strptime('2024-06-01 15:30:00.00', "%Y-%m-%d %H:%M:%S.%f"), 'O'),
    (11115, datetime.datetime.strptime('2024-06-01 09:30:00.00', "%Y-%m-%d %H:%M:%S.%f"), 'I'),
    (11115, datetime.datetime.strptime('2024-06-01 17:30:00.00', "%Y-%m-%d %H:%M:%S.%f"), 'O')
]
schema = StructType([
                    StructField('emp_id', LongType(), True),
                    StructField('punch_time', TimestampType(), True),
                    StructField('flag', StringType(), True)
])
df =  spark.createDataFrame(data=data, schema=schema)
df.show()

+------+-------------------+----+
|emp_id|         punch_time|flag|
+------+-------------------+----+
| 11114|2024-06-01 08:30:00|   I|
| 11114|2024-06-01 10:30:00|   O|
| 11114|2024-06-01 11:30:00|   I|
| 11114|2024-06-01 15:30:00|   O|
| 11115|2024-06-01 09:30:00|   I|
| 11115|2024-06-01 17:30:00|   O|
+------+-------------------+----+



In [92]:
window = Window.partitionBy('emp_id').orderBy(col('punch_time'))
df = df.withColumn('prev_time', lag(col('punch_time')).over(window))
df.show()

+------+-------------------+----+-------------------+
|emp_id|         punch_time|flag|          prev_time|
+------+-------------------+----+-------------------+
| 11114|2024-06-01 08:30:00|   I|               NULL|
| 11114|2024-06-01 10:30:00|   O|2024-06-01 08:30:00|
| 11114|2024-06-01 11:30:00|   I|2024-06-01 10:30:00|
| 11114|2024-06-01 15:30:00|   O|2024-06-01 11:30:00|
| 11115|2024-06-01 09:30:00|   I|               NULL|
| 11115|2024-06-01 17:30:00|   O|2024-06-01 09:30:00|
+------+-------------------+----+-------------------+



In [95]:
df = df.withColumn('time_diff', (col('punch_time').cast('long') - col('prev_time').cast('long'))/3600)
df.show()

+------+-------------------+----+-------------------+---------+
|emp_id|         punch_time|flag|          prev_time|time_diff|
+------+-------------------+----+-------------------+---------+
| 11114|2024-06-01 08:30:00|   I|               NULL|     NULL|
| 11114|2024-06-01 10:30:00|   O|2024-06-01 08:30:00|      2.0|
| 11114|2024-06-01 11:30:00|   I|2024-06-01 10:30:00|      1.0|
| 11114|2024-06-01 15:30:00|   O|2024-06-01 11:30:00|      4.0|
| 11115|2024-06-01 09:30:00|   I|               NULL|     NULL|
| 11115|2024-06-01 17:30:00|   O|2024-06-01 09:30:00|      8.0|
+------+-------------------+----+-------------------+---------+



In [96]:
df = df.groupBy('emp_id').agg(sum(when(col('flag') == 'O', col('time_diff')).otherwise(0)).alias('total_time'))
df.show()

+------+----------+
|emp_id|total_time|
+------+----------+
| 11114|       6.0|
| 11115|       8.0|
+------+----------+



### Q9. Employee with Manager
From the given data set, Fetch the manager and their employees

In [97]:
data = [('4529', 'Nancy', '4125'),
('4238','John', '4329'),
('4329', 'Martina', '4125'),
('4009', 'Klaus', '4329'),
('4125', 'Mafalda', 'NULL'),
('4500', 'Jakub', '4529'),
('4118', 'Moira', '4952'),
('4012', 'Jon', '4952'),
('4952', 'Sandra', '4529'),
('4444', 'Seamus', '4329')]
schema = ['employee_id', 'first_name', 'manager_id']

df = spark.createDataFrame(data=data, schema=schema)
df.show()

+-----------+----------+----------+
|employee_id|first_name|manager_id|
+-----------+----------+----------+
|       4529|     Nancy|      4125|
|       4238|      John|      4329|
|       4329|   Martina|      4125|
|       4009|     Klaus|      4329|
|       4125|   Mafalda|      NULL|
|       4500|     Jakub|      4529|
|       4118|     Moira|      4952|
|       4012|       Jon|      4952|
|       4952|    Sandra|      4529|
|       4444|    Seamus|      4329|
+-----------+----------+----------+



In [108]:
df.alias('e').join(df.alias('m'), col('e.manager_id')==col('m.employee_id'),'inner').show()

+-----------+----------+----------+-----------+----------+----------+
|employee_id|first_name|manager_id|employee_id|first_name|manager_id|
+-----------+----------+----------+-----------+----------+----------+
|       4529|     Nancy|      4125|       4125|   Mafalda|      NULL|
|       4329|   Martina|      4125|       4125|   Mafalda|      NULL|
|       4238|      John|      4329|       4329|   Martina|      4125|
|       4009|     Klaus|      4329|       4329|   Martina|      4125|
|       4444|    Seamus|      4329|       4329|   Martina|      4125|
|       4500|     Jakub|      4529|       4529|     Nancy|      4125|
|       4952|    Sandra|      4529|       4529|     Nancy|      4125|
|       4118|     Moira|      4952|       4952|    Sandra|      4529|
|       4012|       Jon|      4952|       4952|    Sandra|      4529|
+-----------+----------+----------+-----------+----------+----------+



In [105]:
rdf = df.alias('e').join(df.alias('m'), col('e.manager_id')==col('m.employee_id'),'inner')\
       .select(col('e.employee_id'), col('e.first_name'), col('e.manager_id'),col('m.first_name').alias('manager_name'))
rdf.show()

+-----------+----------+----------+------------+
|employee_id|first_name|manager_id|manager_name|
+-----------+----------+----------+------------+
|       4529|     Nancy|      4125|     Mafalda|
|       4329|   Martina|      4125|     Mafalda|
|       4238|      John|      4329|     Martina|
|       4009|     Klaus|      4329|     Martina|
|       4444|    Seamus|      4329|     Martina|
|       4500|     Jakub|      4529|       Nancy|
|       4952|    Sandra|      4529|       Nancy|
|       4118|     Moira|      4952|      Sandra|
|       4012|       Jon|      4952|      Sandra|
+-----------+----------+----------+------------+



In [107]:
rdf.groupBy('manager_id', 'manager_name').count().show()

+----------+------------+-----+
|manager_id|manager_name|count|
+----------+------------+-----+
|      4125|     Mafalda|    2|
|      4329|     Martina|    3|
|      4529|       Nancy|    2|
|      4952|      Sandra|    2|
+----------+------------+-----+



In [109]:
spark.stop()