In [0]:
from pyspark.sql.functions import first, rand, count

# Generowanie dat
dates = spark.sql("SELECT explode(sequence(DATE'2024-01-01', DATE'2024-03-24', INTERVAL 1 DAY)) as calendar_date")

# Generowanie identyfikatorów klientów
c_id = spark.sql("SELECT explode(sequence(1,200, 1)) as client_id")

# Generowanie nazw kolumn
types = spark.sql("""SELECT concat("col_", colName) as col_name FROM (SELECT explode(sequence(1,20, 1)) as colName)""")

# Repartycjonowanie danych
dates = dates.repartition(99)
c_id = c_id.repartition(11)
types = types.repartition(1)

# Tworzenie pełnego cross join
df_cartesian = c_id.crossJoin(dates.select("calendar_date")).crossJoin(types.select("col_name")).select("client_id","calendar_date","col_name")

# Zliczanie liczby klientów dla każdej daty (grupowanie po datach)
df_cartesian2 = df_cartesian.groupBy("calendar_date").agg(count("client_id"))

# display(df_cartesian2.limit(1000))

# Dodanie kolumny z losowymi wartościami (wartość numeryczna)
df_cartesian = df_cartesian.withColumn("val", (rand()*10).cast("int"))

# Tworzenie tabeli z pivotowaniem na kolumnie `col_name`
df_grp = df_cartesian.groupBy("client_id", "calendar_date").pivot("col_name").agg(first("val").alias("val"))

# Wyświetlanie wyników
display(df_grp)

client_id,calendar_date,col_1,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_2,col_20,col_3,col_4,col_5,col_6,col_7,col_8,col_9
170,2024-03-19,3,8,0,1,2,9,2,9,4,6,4,3,6,9,0,1,1,5,7,2
170,2024-03-07,5,1,6,9,5,4,9,9,4,5,9,3,4,9,5,9,4,4,5,8
170,2024-01-17,6,8,1,8,1,0,7,4,4,1,3,9,2,3,5,8,8,0,3,2
170,2024-01-31,8,1,6,3,0,9,0,6,7,2,2,5,3,2,9,5,2,9,5,4
170,2024-02-26,0,3,5,2,3,5,9,5,7,4,3,0,7,0,2,2,5,1,2,4
170,2024-03-21,0,0,9,6,3,1,4,6,5,7,3,8,9,0,7,5,7,1,8,2
170,2024-01-29,5,9,7,7,7,2,7,6,3,2,0,3,5,8,1,8,8,4,4,3
170,2024-03-24,1,3,1,8,0,8,4,7,1,0,8,3,4,1,5,1,4,8,0,6
170,2024-02-24,8,3,1,9,9,0,5,6,0,5,8,0,1,4,3,4,6,9,9,6
170,2024-02-29,5,9,4,2,2,5,9,5,6,8,8,2,6,9,2,8,3,2,8,7


`inner join` - łączy tylko te wiersze, które mają dopasowane wartości w obu tabelach

`left join` łączy wszystkie wiersze z tabeli po lewej stronie (tutaj df_cartesian), a te, które nie mają dopasowania w tabeli po prawej, dostaną wartości NULL

In [0]:
# zmiana nazwy kolumny, która by się powtarzała
df_cartesian2_renamed = df_cartesian2.withColumnRenamed("calendar_date", "calendar_date_right")

# Złączenie inner
inner_join_result = df_cartesian.join(df_cartesian2_renamed, df_cartesian["calendar_date"] == df_cartesian2_renamed["calendar_date_right"], "inner")

# Usuwanie duplikatów
inner_without_duplicates = inner_join_result.drop("calendar_date_right")

# Wyświetlenie wyników złączenia
display(inner_without_duplicates.limit(10))


client_id,calendar_date,col_name,val,count(client_id)
170,2024-03-19,col_1,3,4000
170,2024-03-19,col_2,5,4000
170,2024-03-19,col_3,6,4000
170,2024-03-19,col_4,8,4000
170,2024-03-19,col_5,0,4000
170,2024-03-19,col_6,0,4000
170,2024-03-19,col_7,5,4000
170,2024-03-19,col_8,1,4000
170,2024-03-19,col_9,8,4000
170,2024-03-19,col_10,5,4000


In [0]:
# Złączenie left
left_join_result = df_cartesian.join(df_cartesian2, df_cartesian["calendar_date"] == df_cartesian2["calendar_date"], "left")

# drop "calendar_date"
left_without_duplicates = left_join_result.drop(df_cartesian2["calendar_date"])

display(left_without_duplicates.limit(10))

client_id,col_name,val,calendar_date,count(client_id)
170,col_1,3,2024-03-19,4000
170,col_2,5,2024-03-19,4000
170,col_3,6,2024-03-19,4000
170,col_4,8,2024-03-19,4000
170,col_5,0,2024-03-19,4000
170,col_6,0,2024-03-19,4000
170,col_7,5,2024-03-19,4000
170,col_8,1,2024-03-19,4000
170,col_9,8,2024-03-19,4000
170,col_10,5,2024-03-19,4000
