In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, when, avg, asc, desc, coalesce, lit, explode, split, trim
from pyspark.sql.functions import *

In [4]:
spark = SparkSession.builder.appName('q2').getOrCreate()
spark

###  find records with the second highest 'insert_date' for each 'location'.

In [None]:
# This question will tell you how to use transformations in spark.

In [25]:
data = [(1, "A", "2023-01-01 10:00:00"),
         (2, "A", "2023-01-02 12:00:00"),
         (3, "B", "2023-01-01 09:00:00"),
         (4, "B", "2023-01-03 11:00:00"),
         (5, "A", "2023-01-03 13:00:00"),
         (6, "B", "2023-01-02 15:00:00"),
         (7, "C", "2023-01-08 18:00:00"),
         (8, "C", "2023-01-07 16:00:00")]

columns = ['id', 'location', 'insert_date']
df = spark.createDataFrame(data, columns)
df.show()

+---+--------+-------------------+
| id|location|        insert_date|
+---+--------+-------------------+
|  1|       A|2023-01-01 10:00:00|
|  2|       A|2023-01-02 12:00:00|
|  3|       B|2023-01-01 09:00:00|
|  4|       B|2023-01-03 11:00:00|
|  5|       A|2023-01-03 13:00:00|
|  6|       B|2023-01-02 15:00:00|
|  7|       C|2023-01-08 18:00:00|
|  8|       C|2023-01-07 16:00:00|
+---+--------+-------------------+



In [26]:
# code in spark using transformation and action 

windowspec = Window.partitionBy('location').orderBy(df['insert_date'].desc())

In [27]:
df_rnk = df.withColumn('rnk', row_number().over(windowspec))

In [28]:
result = df_rnk.filter(df_rnk.rnk == 2).select('id', 'location', 'insert_date')
result.show()

+---+--------+-------------------+
| id|location|        insert_date|
+---+--------+-------------------+
|  2|       A|2023-01-02 12:00:00|
|  6|       B|2023-01-02 15:00:00|
|  8|       C|2023-01-07 16:00:00|
+---+--------+-------------------+



In [4]:
data1 = [("Tim Voss", 19, "red", 91),
("Nicole Johnson", 20, "yellow", 95),
("Elsa Williams", 21, "green", 82),
("John james", 20, "blue", 75),
("Catherine Jones", 23, "green", 93)]
columns1 = ['name', 'age', 'favorite_color', 'grade']

student_df = spark.createDataFrame(data1, columns1)
student_df.show()

+---------------+---+--------------+-----+
|           name|age|favorite_color|grade|
+---------------+---+--------------+-----+
|       Tim Voss| 19|           red|   91|
| Nicole Johnson| 20|        yellow|   95|
|  Elsa Williams| 21|         green|   82|
|     John james| 20|          blue|   75|
|Catherine Jones| 23|         green|   93|
+---------------+---+--------------+-----+



In [9]:
def grades_colors(df):
    return df.filter((col('favorite_color').isin('red','green')) & (col('grade') > 90))

result = grades_colors(student_df)
result.show()

+---------------+---+--------------+-----+
|           name|age|favorite_color|grade|
+---------------+---+--------------+-----+
|       Tim Voss| 19|           red|   91|
|Catherine Jones| 23|         green|   93|
+---------------+---+--------------+-----+



In [12]:
data2 = [(1, 'Alice', 45),
        (2, 'Bob', 120), 
        (3, 'Charlie', 75),
        (4, 'David', 180), 
        (5, 'Eve', 220)]
columns2 = ['id','name', 'sales']
df = spark.createDataFrame(data2, columns2)
df.show()

+---+-------+-----+
| id|   name|sales|
+---+-------+-----+
|  1|  Alice|   45|
|  2|    Bob|  120|
|  3|Charlie|   75|
|  4|  David|  180|
|  5|    Eve|  220|
+---+-------+-----+



In [27]:
# Filter out records where sales are less than 50.

#df1 = df.filter(col('sales') < 50)
df1 = df.filter(col('sales') >= 50)
df1.show()

+---+-------+-----+
| id|   name|sales|
+---+-------+-----+
|  2|    Bob|  120|
|  3|Charlie|   75|
|  4|  David|  180|
|  5|    Eve|  220|
+---+-------+-----+



In [31]:
# create a new column sales_category which categorizes sales into 'Low' (50-100), "Medium" (101-200), and 'High' (>200). 

df2 = df1.withColumn('sales_category', when(col('sales') <= 100, 'Low')
                     .when((col('sales') > 100)&(col('sales')<=200), 'Medium').otherwise('High'))
df2.show()

+---+-------+-----+--------------+
| id|   name|sales|sales_category|
+---+-------+-----+--------------+
|  2|    Bob|  120|        Medium|
|  3|Charlie|   75|           Low|
|  4|  David|  180|        Medium|
|  5|    Eve|  220|          High|
+---+-------+-----+--------------+



In [32]:
# Group the data by sales_category and calculate the average sales for each category. 

df = df2.groupBy('sales_category').agg(avg('sales').alias('average_sales'))
df.show()

+--------------+-------------+
|sales_category|average_sales|
+--------------+-------------+
|        Medium|        150.0|
|           Low|         75.0|
|          High|        220.0|
+--------------+-------------+



In [34]:
df_data = [
    ('A', 'Good', 9.5, 'horror'),
    ('B', 'Good', 8.2, 'Comedy'),
    ('C', 'Avg', 9.5, 'Comedy'),
    ('D', 'Avg', 9.5, 'horror')
]
df_columns = ["title", "reviews", "ratings", "categories"]
df = spark.createDataFrame(df_data, df_columns)

casting_df_data = [
    ('Actor1', 'A', 9.0),
    ('Actor2', 'B', 8.5),
    ('Actor3', 'C', 7.5),
    ('Actor4', 'D', 8.0)
]
casting_df_columns = ["actor_name", "movie_name", "actor_rating"]

casting_df = spark.createDataFrame(casting_df_data, casting_df_columns)


In [35]:
df.show()

+-----+-------+-------+----------+
|title|reviews|ratings|categories|
+-----+-------+-------+----------+
|    A|   Good|    9.5|    horror|
|    B|   Good|    8.2|    Comedy|
|    C|    Avg|    9.5|    Comedy|
|    D|    Avg|    9.5|    horror|
+-----+-------+-------+----------+



In [36]:
casting_df.show()

+----------+----------+------------+
|actor_name|movie_name|actor_rating|
+----------+----------+------------+
|    Actor1|         A|         9.0|
|    Actor2|         B|         8.5|
|    Actor3|         C|         7.5|
|    Actor4|         D|         8.0|
+----------+----------+------------+



In [38]:
# Q1) join df and casting_df on movie name.

j_df = df.join(casting_df, df.title == casting_df.movie_name, how = 'full_outer')
j_df.show()

+-----+-------+-------+----------+----------+----------+------------+
|title|reviews|ratings|categories|actor_name|movie_name|actor_rating|
+-----+-------+-------+----------+----------+----------+------------+
|    A|   Good|    9.5|    horror|    Actor1|         A|         9.0|
|    B|   Good|    8.2|    Comedy|    Actor2|         B|         8.5|
|    C|    Avg|    9.5|    Comedy|    Actor3|         C|         7.5|
|    D|    Avg|    9.5|    horror|    Actor4|         D|         8.0|
+-----+-------+-------+----------+----------+----------+------------+



In [39]:
# Q2) Find the category of the actor and movie.

actor_movie_catogery = j_df.select('actor_name', 'movie_name', 'categories')
actor_movie_catogery.show()

+----------+----------+----------+
|actor_name|movie_name|categories|
+----------+----------+----------+
|    Actor1|         A|    horror|
|    Actor2|         B|    Comedy|
|    Actor3|         C|    Comedy|
|    Actor4|         D|    horror|
+----------+----------+----------+



In [46]:
# Q3) Find the what is the avg rating of actor

avg_r = casting_df.agg({'actor_rating':'avg'}).alias('avg_actor_rating')
avg_r.show()

+-----------------+
|avg(actor_rating)|
+-----------------+
|             8.25|
+-----------------+



In [53]:
# Q4) how to sort in pyspark?

sort_df = df.orderBy(desc('title'))
sort_df.show()

+-----+-------+-------+----------+
|title|reviews|ratings|categories|
+-----+-------+-------+----------+
|    D|    Avg|    9.5|    horror|
|    C|    Avg|    9.5|    Comedy|
|    B|   Good|    8.2|    Comedy|
|    A|   Good|    9.5|    horror|
+-----+-------+-------+----------+



In [65]:
data = [
    (1, "Alice", None),
    (2, "Bob", 1),
    (3, "Carol", 2),
    (4, "Dave", 1),
    (5, "Eve", 2),
    (6, "Frank", 4)
]
schema = ["ID", "Name", "Boss"]
df = spark.createDataFrame(data, schema)
df.show()

+---+-----+----+
| ID| Name|Boss|
+---+-----+----+
|  1|Alice|NULL|
|  2|  Bob|   1|
|  3|Carol|   2|
|  4| Dave|   1|
|  5|  Eve|   2|
|  6|Frank|   4|
+---+-----+----+



In [66]:
df_alias = df.alias("e1")
boss_alias = df.alias("e2")

df1 = df_alias.join(boss_alias, col("e1.Boss") == col("e2.ID"), 'left')
df1.show()

+---+-----+----+----+-----+----+
| ID| Name|Boss|  ID| Name|Boss|
+---+-----+----+----+-----+----+
|  1|Alice|NULL|NULL| NULL|NULL|
|  2|  Bob|   1|   1|Alice|NULL|
|  3|Carol|   2|   2|  Bob|   1|
|  4| Dave|   1|   1|Alice|NULL|
|  5|  Eve|   2|   2|  Bob|   1|
|  6|Frank|   4|   4| Dave|   1|
+---+-----+----+----+-----+----+



In [67]:
df2=df1.select(col("e1.name").alias("Employee_Name"), coalesce(col('e2.Name'), lit("No Boss")).alias('Boss'))
df2.show()

+-------------+-------+
|Employee_Name|   Boss|
+-------------+-------+
|        Alice|No Boss|
|          Bob|  Alice|
|        Carol|    Bob|
|         Dave|  Alice|
|          Eve|    Bob|
|        Frank|   Dave|
+-------------+-------+



In [68]:
df.show()

+---+-----+----+
| ID| Name|Boss|
+---+-----+----+
|  1|Alice|NULL|
|  2|  Bob|   1|
|  3|Carol|   2|
|  4| Dave|   1|
|  5|  Eve|   2|
|  6|Frank|   4|
+---+-----+----+



In [69]:
data=[(1,"Gaurav","Pune,hyd,Blr"),(2,"Ravi","hyd,Blr")]

columns=["Empid","Name","Locations"]
df=spark.createDataFrame(data,columns)

df.show()

+-----+------+------------+
|Empid|  Name|   Locations|
+-----+------+------------+
|    1|Gaurav|Pune,hyd,Blr|
|    2|  Ravi|     hyd,Blr|
+-----+------+------------+



In [74]:
df1 = df.withColumn("Location", explode(split(col('Locations'), ',')))
df1.show()

+-----+------+------------+--------+
|Empid|  Name|   Locations|Location|
+-----+------+------------+--------+
|    1|Gaurav|Pune,hyd,Blr|    Pune|
|    1|Gaurav|Pune,hyd,Blr|     hyd|
|    1|Gaurav|Pune,hyd,Blr|     Blr|
|    2|  Ravi|     hyd,Blr|     hyd|
|    2|  Ravi|     hyd,Blr|     Blr|
+-----+------+------------+--------+



In [77]:
result = df1.withColumn("Location", trim(col("Location")))
result.show()

+-----+------+------------+--------+
|Empid|  Name|   Locations|Location|
+-----+------+------------+--------+
|    1|Gaurav|Pune,hyd,Blr|    Pune|
|    1|Gaurav|Pune,hyd,Blr|     hyd|
|    1|Gaurav|Pune,hyd,Blr|     Blr|
|    2|  Ravi|     hyd,Blr|     hyd|
|    2|  Ravi|     hyd,Blr|     Blr|
+-----+------+------------+--------+



In [81]:
# Location not equal to Blr 

final = result.select("Empid", "Name", "Location").filter(col("Location") != 'Blr')
final.show()

+-----+------+--------+
|Empid|  Name|Location|
+-----+------+--------+
|    1|Gaurav|    Pune|
|    1|Gaurav|     hyd|
|    2|  Ravi|     hyd|
+-----+------+--------+



In [5]:
df = spark.createDataFrame([
                            ('The Shawshank Redemption',['Drama', 'Crime']),
                            ('The Godfather', ['Drama', 'Crime']),
                            ('Pulp Fiction', ['Drama', 'Crime','Thriller']),
                            ('The Dark Knight', ['Drama', 'Crime','Thriller','Action']), 
                           ],
                          ["name", "genres"])
df.show(truncate=False)
df.printSchema()

+------------------------+--------------------------------+
|name                    |genres                          |
+------------------------+--------------------------------+
|The Shawshank Redemption|[Drama, Crime]                  |
|The Godfather           |[Drama, Crime]                  |
|Pulp Fiction            |[Drama, Crime, Thriller]        |
|The Dark Knight         |[Drama, Crime, Thriller, Action]|
+------------------------+--------------------------------+

root
 |-- name: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [88]:
df1 = df.select('name', explode(col('genres')).alias('genre'))
df1.show()

+--------------------+--------+
|                name|   genre|
+--------------------+--------+
|The Shawshank Red...|   Drama|
|The Shawshank Red...|   Crime|
|       The Godfather|   Drama|
|       The Godfather|   Crime|
|        Pulp Fiction|   Drama|
|        Pulp Fiction|   Crime|
|        Pulp Fiction|Thriller|
|     The Dark Knight|   Drama|
|     The Dark Knight|   Crime|
|     The Dark Knight|Thriller|
|     The Dark Knight|  Action|
+--------------------+--------+



In [90]:
df1.groupBy('genre').count().show()

+--------+-----+
|   genre|count|
+--------+-----+
|   Crime|    4|
|   Drama|    4|
|Thriller|    2|
|  Action|    1|
+--------+-----+



# How would you calculate the month-wise cumulative revenue using Pyspark?

In [5]:
data = [( 
'3000' ,  '22-may'), 
('5000' ,  '23-may'),
('5000' ,  '25-may'),
('10000' , '22-june'),  
('1250'  , '03-july')]

schema = ['revenue','date']
df=spark.createDataFrame(data,schema)
df.show()

+-------+-------+
|revenue|   date|
+-------+-------+
|   3000| 22-may|
|   5000| 23-may|
|   5000| 25-may|
|  10000|22-june|
|   1250|03-july|
+-------+-------+



In [9]:
df1 = df.withColumn('month', date_format(to_date(col('date'), 'dd-MM'), 'MMM'))
df1.show()

+-------+-------+-----+
|revenue|   date|month|
+-------+-------+-----+
|   3000| 22-may| NULL|
|   5000| 23-may| NULL|
|   5000| 25-may| NULL|
|  10000|22-june| NULL|
|   1250|03-july| NULL|
+-------+-------+-----+



In [12]:
window_spec=Window.partitionBy('month').orderBy('date').rowsBetween(Window.unboundedPreceding,Window.currentRow)
df2=df1.withColumn('cumulative_sum',sum(col("revenue")).over(window_spec))
df2.show()

+-------+-------+-----+--------------+
|revenue|   date|month|cumulative_sum|
+-------+-------+-----+--------------+
|   1250|03-july| NULL|        1250.0|
|  10000|22-june| NULL|       11250.0|
|   3000| 22-may| NULL|       14250.0|
|   5000| 23-may| NULL|       19250.0|
|   5000| 25-may| NULL|       24250.0|
+-------+-------+-----+--------------+



In [13]:
# write a spark or sql code to find the employee count under each manager?
https://www.youtube.com/watch?v=8L7BIDUySfw&list=PLDbkX3qdHA3AVyxV-OUlgjgzojAtc4WSz&index=8 

In [14]:
data = [('4529', 'Nancy', 'Young', '4125'),
('4238','John', 'Simon', '4329'),
('4329', 'Martina', 'Candreva', '4125'),
('4009', 'Klaus', 'Koch', '4329'),
('4125', 'Mafalda', 'Ranieri', 'NULL'),
('4500', 'Jakub', 'Hrabal', '4529'),
('4118', 'Moira', 'Areas', '4952'),
('4012', 'Jon', 'Nilssen', '4952'),
('4952', 'Sandra', 'Rajkovic', '4529'),
('4444', 'Seamus', 'Quinn', '4329')]

schema = ['employee_id' ,'first_name', 'last_name', 'manager_id']
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+-----------+----------+---------+----------+
|employee_id|first_name|last_name|manager_id|
+-----------+----------+---------+----------+
|       4529|     Nancy|    Young|      4125|
|       4238|      John|    Simon|      4329|
|       4329|   Martina| Candreva|      4125|
|       4009|     Klaus|     Koch|      4329|
|       4125|   Mafalda|  Ranieri|      NULL|
|       4500|     Jakub|   Hrabal|      4529|
|       4118|     Moira|    Areas|      4952|
|       4012|       Jon|  Nilssen|      4952|
|       4952|    Sandra| Rajkovic|      4529|
|       4444|    Seamus|    Quinn|      4329|
+-----------+----------+---------+----------+



In [24]:
df.groupBy('manager_id').count().show()
df.groupBy('manager_id').agg(count('*').alias('count1')).show()

+----------+-----+
|manager_id|count|
+----------+-----+
|      4125|    2|
|      4329|    3|
|      NULL|    1|
|      4529|    2|
|      4952|    2|
+----------+-----+

+----------+------+
|manager_id|count1|
+----------+------+
|      4125|     2|
|      4329|     3|
|      NULL|     1|
|      4529|     2|
|      4952|     2|
+----------+------+



In [25]:
df.createOrReplaceTempView('EMP')

In [26]:
query = '''select  e.manager_id as manager_id, 
count(e.employee_id) as no_of_emp,(m.First_name) as mangr_name 
from  emp e
inner join emp m on m.employee_id =e.manager_id group by 1,3 '''
result=spark.sql(query).show()

+----------+---------+----------+
|manager_id|no_of_emp|mangr_name|
+----------+---------+----------+
|      4125|        2|   Mafalda|
|      4329|        3|   Martina|
|      4529|        2|     Nancy|
|      4952|        2|    Sandra|
+----------+---------+----------+

