In [0]:
from pyspark.sql import functions as F

# Column names for the sample employee records below.
schema = ["employee_name", "department", "state", "salary", "age", "bonus"]

# One tuple per employee, in the same order as `schema`.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Build the DataFrame, then print its schema and its rows (untruncated).
df = spark.createDataFrame(simpleData, schema)
df.printSchema()
df.show(truncate=False)

In [0]:
# Total salary per department.
(
    df.groupBy("department")
    .sum("salary")
    .show(truncate=False)
)

In [0]:
# Number of rows per department.
(
    df.groupBy("department")
    .count()
    .show(truncate=False)
)

In [0]:
# Salary and bonus totals for every (department, state) pair.
(
    df.groupBy("department", "state")
    .sum("salary", "bonus")
    .show(truncate=False)
)

In [0]:

# Several aggregates per department in a single agg() call, each given an
# explicit output column name through alias().
(
    df.groupBy("department")
    .agg(
        F.sum("salary").alias("sum_salary"),
        F.avg("salary").alias("avg_salary"),
        F.sum("bonus").alias("sum_bonus"),
        F.max("bonus").alias("max_bonus"),
    )
    .show(truncate=False)
)

In [0]:
# df.

In [0]:
# Convert the Spark DataFrame `df` to a pandas DataFrame via toPandas().
# NOTE(review): df_pd is not referenced again in the visible cells — confirm it is still needed.
df_pd = df.toPandas()

In [0]:
# Per-department salary/bonus aggregates, keeping only the departments whose
# total bonus is at least 50000.
#
# The functions are called through the `F` alias on purpose: at this point in
# the notebook bare `sum`/`max` resolve to Python's builtins and `avg`/`col`
# are not defined (they are only imported by a later cell), so the unqualified
# form fails when the notebook is run top to bottom on a fresh kernel.
(
    df.groupBy("department")
    .agg(
        F.sum("salary").alias("sum_salary"),
        F.avg("salary").alias("avg_salary"),
        F.sum("bonus").alias("sum_bonus"),
        F.max("bonus").alias("max_bonus"),
    )
    .where(F.col("sum_bonus") >= 50000)
    .show(truncate=False)
)

In [0]:
# Column names for the (name, department, salary) records below.
columns = ["employee_name", "department", "salary"]

# Sample rows kept as a tuple of tuples; the ("James", "Sales", 3000) record
# appears twice in the data.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

# Rebinds `df` to this smaller data set, then prints its schema and rows.
df = spark.createDataFrame(simpleData, columns)
df.printSchema()
df.show(truncate=False)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Window covering each department, with rows ordered by salary.
windowSpec = Window.partitionBy("department").orderBy("salary")

# Add a `row_number` column computed over that window and print the result.
(
    df.withColumn("row_number", row_number().over(windowSpec))
    .show(truncate=False)
)


In [0]:
from pyspark.sql.functions import rank
# Add a `rank` column evaluated over windowSpec and print the result.
(
    df.withColumn("rank", rank().over(windowSpec))
    .show()
)

In [0]:
from pyspark.sql.functions import dense_rank
# Add a `dense_rank` column evaluated over windowSpec and print the result.
(
    df.withColumn("dense_rank", dense_rank().over(windowSpec))
    .show()
)

In [0]:
from pyspark.sql.functions import percent_rank
# Add a `percent_rank` column evaluated over windowSpec and print the result.
(
    df.withColumn("percent_rank", percent_rank().over(windowSpec))
    .show()
)

In [0]:
from pyspark.sql.functions import ntile
# Add an `ntile` column using ntile(2) over windowSpec and print the result.
(
    df.withColumn("ntile", ntile(2).over(windowSpec))
    .show()
)

In [0]:
from pyspark.sql.functions import cume_dist    
# Add a `cume_dist` column evaluated over windowSpec and print the result.
(
    df.withColumn("cume_dist", cume_dist().over(windowSpec))
    .show()
)

In [0]:
from pyspark.sql.functions import lag    
# Add a `lag` column: lag() of `salary` with an offset of 2 over windowSpec.
(
    df.withColumn("lag", lag("salary", 2).over(windowSpec))
    .show()
)

In [0]:
from pyspark.sql.functions import lead    
# Add a `lead` column: lead() of `salary` with an offset of 2 over windowSpec.
(
    df.withColumn("lead", lead("salary", 2).over(windowSpec))
    .show()
)

In [0]:
# NOTE: from this cell onward `sum`, `min` and `max` refer to the Spark column
# functions, not Python's builtins of the same name.
from pyspark.sql.functions import col, avg, sum, min, max, row_number

# Unordered window over each department, used for the aggregate columns.
windowSpecAgg = Window.partitionBy("department")

# Attach avg/sum/min/max of salary over windowSpecAgg to every row, number the
# rows with the ordered windowSpec, then keep only the row numbered 1 so that
# each department appears once in the output.
(
    df.withColumn("row", row_number().over(windowSpec))
    .withColumn("avg", avg(col("salary")).over(windowSpecAgg))
    .withColumn("sum", sum(col("salary")).over(windowSpecAgg))
    .withColumn("min", min(col("salary")).over(windowSpecAgg))
    .withColumn("max", max(col("salary")).over(windowSpecAgg))
    .where(col("row") == 1)
    .select("department", "avg", "sum", "min", "max")
    .show()
)


In [0]:
# Column names for the person records below.
columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]

# Sample rows; several fields are deliberately empty strings.
data = [
    ("James", "", "Smith", "36636", "M", 60000),
    ("Michael", "Rose", "", "40288", "M", 70000),
    ("Robert", "", "Williams", "42114", "", 400000),
    ("Maria", "Anne", "Jones", "39192", "F", 500000),
    ("Jen", "Mary", "Brown", "", "F", 0),
]

# Rebinds `df` to this data set, then prints its schema and rows.
df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)

In [0]:
# Using when otherwise
from pyspark.sql.functions import col, when
# Map gender codes to labels: "M" -> "Male", "F" -> "Female", anything else
# -> "Unknown"; the result is stored in a new `new_gender` column.
df2 = df.withColumn(
    "new_gender",
    when(col("gender") == "M", "Male")
    .when(col("gender") == "F", "Female")
    .otherwise("Unknown"),
)
df2.show(truncate=False)

In [0]:
# Same gender mapping expressed through select(): keep every existing column
# and append the mapped value as `new_gender`.
#
# The DataFrame is assigned first and shown in a separate statement, because
# show() returns None; chaining it into the assignment would bind df22 to None
# instead of the DataFrame.
df22 = df.select(
    col("*"),
    when(col("gender") == "M", "Male")
    .when(col("gender") == "F", "Female")
    .otherwise("Unknown")
    .alias("new_gender"),
)
df22.show(truncate=False)

# Using case when
from pyspark.sql.functions import expr
# The gender mapping written as a SQL CASE expression; the adjacent string
# literals are joined into one expression string before being passed to expr().
df3 = df.withColumn(
    "new_gender",
    expr(
        "case when gender = 'M' then 'Male' "
        "when gender = 'F' then 'Female' "
        "else 'Unknown' end"
    ),
)
df3.show(truncate=False)

In [0]:
# SQL CASE expression used inside select(): keep all existing columns and
# append the mapped value under the alias `new_gender`.
df4 = df.select(
    col("*"),
    expr(
        "case when gender = 'M' then 'Male' "
        "when gender = 'F' then 'Female' "
        "else 'Unknown' end"
    ).alias("new_gender"),
)
df4.show(truncate=False)

In [0]:
# Small (id, code, amt) data set for demonstrating compound conditions.
data2 = [(66, "a", "4"), (67, "a", "0"), (70, "b", "4"), (71, "d", "4")]
df5 = spark.createDataFrame(data=data2, schema=["id", "code", "amt"])

# new_column is "A" when code is "a" or "d", "B" when code is "b" and amt is
# "4", and "A1" otherwise.
#
# Every comparison is wrapped in parentheses: in Python `|` and `&` bind more
# tightly than `==`, so without the parentheses the expression is parsed as a
# chained comparison on `"a" | col("code")` rather than as an OR/AND of two
# comparisons, and the cell fails instead of building the intended condition.
(
    df5.withColumn(
        "new_column",
        when((col("code") == "a") | (col("code") == "d"), "A")
        .when((col("code") == "b") & (col("amt") == "4"), "B")
        .otherwise("A1"),
    )
    .show()
)

In [0]:
from pyspark.sql.functions import expr

# Column names for the sales records below.
columns = ["Product", "Amount", "Country"]

# (Product, Amount, Country) rows; ("Orange", 2000, "USA") appears twice.
data = [
    ("Banana", 1000, "USA"),
    ("Carrots", 1500, "USA"),
    ("Beans", 1600, "USA"),
    ("Orange", 2000, "USA"),
    ("Orange", 2000, "USA"),
    ("Banana", 400, "China"),
    ("Carrots", 1200, "China"),
    ("Beans", 1500, "China"),
    ("Orange", 4000, "China"),
    ("Banana", 2000, "Canada"),
    ("Carrots", 2000, "Canada"),
    ("Beans", 2000, "Mexico"),
]

# Rebinds `df` to the sales data, then prints its schema and rows.
df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)

In [0]:
# Pivot the sales data: group by Product, pivot on Country, sum Amount.
pivotDF = (
    df.groupBy("Product")
    .pivot("Country")
    .sum("Amount")
)
pivotDF.printSchema()
pivotDF.show(truncate=False)

In [0]:
# Two-step variant of the pivot: first sum Amount per (Product, Country), then
# group by Product and pivot on Country, summing the intermediate
# "sum(Amount)" column produced by the first step.
pivotDF = (
    df.groupBy("Product", "Country")
    .sum("Amount")
    .groupBy("Product")
    .pivot("Country")
    .sum("sum(Amount)")
)
pivotDF.printSchema()
pivotDF.show(truncate=False)

In [0]:
# Unpivot: turn the per-country columns of pivotDF back into (Country, Total)
# rows with a SQL stack() expression, dropping rows whose Total is null.
# NOTE(review): only Canada, China and Mexico are listed in the stack
# expression; the sales data also contains "USA", which is therefore absent
# from the unpivoted result — confirm this omission is intended.
unpivotExpr = "stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country,Total)"
unPivotDF = (
    pivotDF.select("Product", expr(unpivotExpr))
    .where("Total is not null")
)
unPivotDF.show(truncate=False)

In [0]:
from pyspark.sql import SparkSession


# Obtain a SparkSession via the builder, with the master set to "local".
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

# (product, category, revenue) sample rows.
data = [
    ("Thin", "Cell phone", 6000),
    ("Normal", "Tablet", 1500),
    ("Mini", "Tablet", 5500),
    ("Ultra thin", "Cell phone", 5000),
    ("Vey thin", "Cell phone", 6000),
    ("Big", "Tablet", 2500),
    ("Bendable", "Cell phone", 3000),
    ("Foldable", "Cell phone", 3000),
    ("Pro", "Tablet", 5400),
    ("Pro2", "Tablet", 6500),
]

products = spark.createDataFrame(data, schema=["product", "category", "revenue"])

products.show()

In [0]:
Ответьте на следующие вопросы:

1) Какой продукт является самым продаваемым в каждой категории?
2) Какие продукты занимают первое и второе место по продажам в каждой категории?
3) Найдите разницу между доходом от каждого продукта и доходом от самого продаваемого продукта в той же категории.
4) Найдите разницу между доходом от каждого продукта и средним доходом по категории этого продукта.

In [0]:
# Highest revenue value in each category, reported in a column named `best`.
(
    products.groupBy("category")
    .agg(F.max("revenue").alias("best"))
    .show(truncate=False)
)

In [0]:
from pyspark.sql.window import Window
# Rank products inside each category from highest to lowest revenue, so that
# rank 1 identifies the best-selling product(s) of the category. Ordering by
# plain "revenue" would place the lowest-revenue product at rank 1 instead.
windowSpec = Window.partitionBy("category").orderBy(F.col("revenue").desc())

# Keep only the rows ranked first in their category.
(
    products.withColumn("rank", F.rank().over(windowSpec))
    .where(F.col("rank") == 1)
    .show(truncate=False)
)

In [0]:
# Top two products per category by revenue.
#
# The window is defined inside this cell with an explicit descending sort, so
# the result does not rely on how `windowSpec` was ordered in an earlier cell:
# with an ascending order the first two rows would be the two lowest-revenue
# products rather than the two highest.
top_revenue_window = Window.partitionBy("category").orderBy(F.col("revenue").desc())

# Number the rows in that order, keep the first two of each category, and drop
# the helper column before printing.
(
    products.withColumn("row_number", F.row_number().over(top_revenue_window))
    .where(F.col("row_number") <= 2)
    .drop("row_number")
    .show(truncate=False)
)

In [0]:
# TODO(review): unfinished scratch notes. As originally written this cell could
# not run: `drop(columns=)` is a syntax error (the keyword argument has no
# value) and neither call is applied to a DataFrame. The two lines are kept
# here as comments until the intended column drop / selection is filled in.
#   drop(columns=)
#   select("nam1", "name2")