In [132]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
from pyspark.sql.window import Window

In [133]:
# Start (or reuse) a Spark session with master "local[*]" and app name "agg".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("agg")
    .getOrCreate()
)

In [135]:
# Builds a lazy map iterator; nothing is computed until it is consumed,
# so the cell only displays the iterator object itself.
map(lambda value: value + 1, [1, 2, 3])

<map at 0x20df7ee3a00>

In [134]:
# Load every retail CSV (header row present, column types inferred),
# coalesce the result to 5 partitions, and mark it for caching because
# the cells below query it repeatedly.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("../data/retail-data/all/*.csv")
    .coalesce(5)
)
df.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

In [None]:
# Row count of the retail DataFrame.
df.count()

In [None]:
# Count of StockCode values, shown under the alias "count_stock_code".
(
    df
    .select(count("StockCode").alias("count_stock_code"))
    .show()
)

In [None]:
# Exact number of distinct StockCode values.
(
    df
    .select(count_distinct("StockCode").alias("count_stock_code"))
    .show()
)

In [None]:
# Approximate distinct count of StockCode; 0.03 is the maximum relative
# standard deviation tolerated for the estimate.
(
    df
    .select(approx_count_distinct("StockCode", 0.03).alias("count_stock_code"))
    .show()
)

In [None]:
# First and last StockCode as encountered; no ordering is imposed here,
# so which rows they come from depends on how the data was read.
(
    df
    .select(first("StockCode"), last("StockCode"))
    .show()
)

In [None]:
# Smallest and largest Quantity. `min`/`max` here are the Spark column
# functions brought in by the wildcard import, not Python's builtins.
(
    df
    .select(min("Quantity"), max("Quantity"))
    .show()
)

In [None]:
# Total of the Quantity column (Spark's `sum`, not the builtin).
(
    df
    .select(sum("Quantity"))
    .show()
)

In [None]:
# Sum over the distinct Quantity values only (each value counted once).
(
    df
    .select(sum_distinct("Quantity"))
    .show()
)

In [None]:
# Step 1: aggregate Quantity four ways in a single pass.
quantity_stats = df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"),
)

# Step 2: derive purchases-per-transaction by hand and show it next to
# avg() and mean() so the three figures can be compared.
quantity_stats.selectExpr(
    "(total_purchases/total_transactions) as ratio_purchases_transactions",
    "avg_purchases",
    "mean_purchases",
).show()

In [None]:
# Peek at the first 5 rows of the retail DataFrame.
df.show(5)

In [None]:
# Gather Country values two ways: de-duplicated (set) and with every
# occurrence kept (list).
df.agg(
    collect_set("Country"),
    collect_list("Country"),
).show()

## Grouping

In [None]:
# Rows per (invoice, customer) pair, skipping rows without a customer id,
# with the largest groups first.
df.filter(col("CustomerId").isNotNull()).groupBy(
    "InvoiceNo", "CustomerId"
).count().orderBy(col("count").desc()).show()

In [None]:
# Number of distinct Quantity values on each invoice, reported as "quan".
df.groupBy("InvoiceNo").agg(
    count_distinct("Quantity").alias("quan")
).show()

In [None]:
# Total quantity per invoice, keeping only invoices whose total is positive.
# Despite the name ("pessoas" = "people"), each row is one InvoiceNo; the
# name is kept because later cells reference it.
#
# groupBy("InvoiceNo") already yields exactly one row per invoice, so the
# previous trailing dropDuplicates() could never remove anything and only
# added an extra aggregation; it has been dropped.
pessoas = (
    df.groupBy("InvoiceNo")
    .agg(sum("Quantity").alias("sum_quantity"))
    .where(col("sum_quantity") > 0)
)

In [None]:
# Largest and smallest per-invoice quantity totals.
pessoas.agg(
    max("sum_quantity"),
    min("sum_quantity"),
).show()

In [None]:
# Window over the whole dataset (no partitionBy), ordered from the largest
# invoice total to the smallest.
w = Window.orderBy(col("sum_quantity").desc())

# ntile(10) splits the ordered rows into ten buckets; with the descending
# order above, bucket 1 holds the largest totals.
pessoas_10_pct = pessoas.withColumn(
    "10_porcento",
    ntile(10).over(w),
)

In [None]:
# Smallest invoice total that still lands in bucket 1, i.e. the cut-off
# for the top tenth of invoices by quantity.
(
    pessoas_10_pct
    .where(col("10_porcento") == 1)
    .agg(min("sum_quantity"))
    .show()
)

In [None]:
# Parse the InvoiceDate string with pattern "M/d/y H:mm" into a new
# `date` column (the time of day is discarded by to_date).
df_with_date = df.withColumn(
    "date",
    to_date("InvoiceDate", "M/d/y H:mm"),
)

In [None]:
# Inspect the frame with the parsed `date` column added.
df_with_date.show()

In [None]:
# One window per (customer, day), rows ordered by descending Quantity; the
# frame spans from the start of the partition up to the current row.
windowSpec = (
    Window
    .partitionBy("CustomerID", "date")
    .orderBy(col("Quantity").desc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Position of each row within its customer/day by Quantity.
purchaseRank = rank().over(windowSpec)
purchaseDenseRank = dense_rank().over(windowSpec)

# The frame always begins at the first (largest-Quantity) row of the
# partition, so this running max is taken over a frame that contains it.
maxPurchaseQuantity = max("Quantity").over(windowSpec)

# Running total of Quantity from the partition start to the current row.
sumPurchase = sum("Quantity").over(windowSpec)

In [None]:
# Restrict to rows with a known customer, ordered by customer id.
known_customer_rows = df_with_date.where("CustomerId IS NOT NULL").orderBy(
    "CustomerId"
)

# Show each purchase with its rank, dense rank, max and running sum as
# defined by the window columns built in the previous cell.
known_customer_rows.select(
    "CustomerId",
    "date",
    "Quantity",
    purchaseRank.alias("quantity_rank"),
    purchaseDenseRank.alias("quantity_dense_rank"),
    maxPurchaseQuantity.alias("max_purchase_quantity"),
    sumPurchase.alias("sum_quantity"),
).show()

In [None]:
# One row per date, one group of columns per Country. sum() is called
# without arguments, so it is applied to every numeric column; the cell
# below reads one of the resulting columns, `USA_sum(Quantity)`.
pivoted = (
    df_with_date
    .groupBy("date")
    .pivot("Country")
    .sum()
)

In [None]:
# USA quantity totals for dates after 2011-12-05. The backticks are needed
# because the column name itself contains parentheses.
(
    pivoted
    .where('date > "2011-12-05"')
    .select("date", "`USA_sum(Quantity)`")
    .show()
)

## Joins

In [None]:
# Three small lookup tables for the join examples. Column names are passed
# as the schema, so the column types are still inferred from the tuples.

# People: `graduate_program` holds a graduateProgram id, `spark_status`
# holds a list of sparkStatus ids.
person = spark.createDataFrame(
    [
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
    ],
    ["id", "name", "graduate_program", "spark_status"],
)

# Graduate programs keyed by `id`.
graduateProgram = spark.createDataFrame(
    [
        (0, "Masters", "School of Information", "UC Berkeley"),
        (2, "Masters", "EECS", "UC Berkeley"),
        (1, "Ph.D.", "EECS", "UC Berkeley"),
    ],
    ["id", "degree", "department", "school"],
)

# Status labels keyed by `id`.
sparkStatus = spark.createDataFrame(
    [
        (500, "Vice President"),
        (250, "PMC Member"),
        (100, "Contributor"),
    ],
    ["id", "status"],
)

In [None]:
from pyspark.sql.types import StructField, IntegerType

In [None]:
# Explicit schema with two non-nullable columns: a string "DESC" and an
# integer "ID".
schema = StructType(
    [
        StructField("DESC", StringType(), False),
        StructField("ID", IntegerType(), False),
    ]
)

# `temp1` was used here without being defined in any visible cell, so the
# cell raised NameError on a fresh kernel. This is a placeholder row that
# matches `schema`; replace it with the real record.
temp1 = ("sample description", 1)

# NOTE(review): this rebinds `df`, replacing the retail DataFrame loaded at
# the top of the notebook. Re-run the load cell before re-running any of the
# earlier aggregation cells.
df = spark.createDataFrame([temp1], schema)

df.show()

In [None]:
# Build the same four records from a header tuple plus value tuples; each
# resulting dict has the keys Category, ID, Value, Truth in that order.
record_fields = ("Category", "ID", "Value", "Truth")
record_values = [
    ("A", 1, 121.44, True),
    ("B", 2, 300.01, False),
    ("C", 3, 10.99, None),  # Truth is missing for this record
    ("E", 4, 33.87, False),
]
data = [dict(zip(record_fields, values)) for values in record_values]

# Let Spark infer the schema from the dicts, then confirm the object type.
cdf = spark.createDataFrame(data)
type(cdf)