# Chapter 14: Joining and Concatenating

In [None]:
import polars as pl
pl.show_versions()  # The book is built with Polars version 1.13.1

## Joining

### Join Strategies

In [None]:
# Two small frames that share a "key" column. Keys B, C and D appear in
# both; A exists only on the left and E only on the right.
df_left = pl.DataFrame(
    {
        "key": ["A", "B", "C", "D"],
        "value": [1, 2, 3, 4],
    }
)
df_right = pl.DataFrame(
    {
        "key": ["B", "C", "D", "E"],
        "value": [5, 6, 7, 8],
    }
)

#### Inner

In [None]:
# Inner join: only keys present in both frames (B, C, D) produce a row.
df_left.join(df_right, on="key", how="inner")

#### Full

In [None]:
# Full join: rows from both frames are kept, matched or not. Right-hand
# columns whose names clash with the left get the "_other" suffix.
df_left.join(df_right, on="key", how="full", suffix="_other")

#### Left

In [None]:
# Left join: every row of df_left is kept; right-hand values are null where
# the key has no match (key A).
df_left.join(df_right, on="key", how="left")

#### Right

In [None]:
# Right join: every row of df_right is kept; left-hand values are null where
# the key has no match (key E).
df_left.join(df_right, on="key", how="right")

#### Cross

In [None]:
# Cross join: pairs every left row with every right row, so no `on` key is
# given.
df_left.join(df_right, how="cross")

#### Semi

In [None]:
# Semi join: keeps the left rows whose key exists in df_right, without adding
# any right-hand columns.
df_left.join(df_right, on="key", how="semi")

#### Anti

In [None]:
# Anti join: keeps the left rows whose key does NOT exist in df_right (key A).
df_left.join(df_right, on="key", how="anti")

### Joining on Multiple Columns

In [None]:
# Both frames list the same four names; only Dave's city differs between
# them ("SF" on the left, "Chicago" on the right).
residences_left = pl.DataFrame(
    {
        "name": ["Alice", "Bob", "Charlie", "Dave"],
        "city": ["NY", "LA", "NY", "SF"],
        "age": [25, 30, 35, 40],
    }
)

departments_right = pl.DataFrame(
    {
        "name": ["Alice", "Bob", "Charlie", "Dave"],
        "city": ["NY", "LA", "NY", "Chicago"],
        "department": ["Finance", "Marketing", "Engineering", "Operations"],
    }
)

# A row matches only when BOTH name and city are equal, so Dave drops out of
# the inner join.
residences_left.join(departments_right, on=["name", "city"], how="inner")

### Validation

#### Many-to-many

#### One-to-many

#### Many-to-one

#### One-to-one

In [None]:
# Several employees share department_id 10, so the left side is the "many"
# side of the relationship.
employees = pl.DataFrame(
    {
        "employee_id": [1, 2, 3, 4],
        "name": ["Alice", "Bob", "Charlie", "Dave"],
        "department_id": [10, 10, 30, 10],
    }
)

# Each department_id occurs exactly once here, so this is the "one" side.
departments = pl.DataFrame(
    {
        "department_id": [10, 20, 30],
        "department_name": [
            "Information Technology",
            "Finance",
            "Human Resources",
        ],
    }
)

# validate="m:1" makes the join check that the key is unique in the right
# frame before joining.
employees.join(departments, on="department_id", how="left", validate="m:1")

In [None]:
# This raises a ComputeError:
# departments = pl.DataFrame(
#     {
#         "department_id": [10, 20, 10],
#         "department_name": [
#             "Information Technology",
#             "Finance",
#             "Human Resources",
#         ],
#     }
# )

# employees.join(
#     departments, on="department_id", how="left", validate="m:1"
# )

## Inexact Joining

In [None]:
# The left frame is NOT sorted on int_id (10 comes before 5); the right
# frame's int_id values are written in ascending order.
df_left = pl.DataFrame(
    {
        "int_id": [10, 5],
        "value": ["b", "a"],
    }
)
df_right = pl.DataFrame(
    {
        "int_id": [4, 7, 12],
        "value": [1, 2, 3],
    }
)

In [None]:
# This raises an InvalidOperationError:
# df_left.join_asof(df_right, on="int_id", tolerance=3)

In [None]:
# join_asof() needs both frames sorted on the join key. Sort each side
# explicitly rather than relying on the order in which df_right happened to
# be written.
df_left = df_left.sort("int_id")
df_right = df_right.sort("int_id")

# With no strategy given, the default ("backward") is used: each left row is
# matched with the last right row whose int_id is less than or equal to its
# own.
df_left.join_asof(df_right, on="int_id")

In [None]:
# coalesce=False keeps the right frame's key column in the result instead of
# merging it into the left key, which shows which right-hand int_id each row
# was matched to.
df_left.join_asof(
    df_right,
    on="int_id",
    coalesce=False,
)

In [None]:
# When the key columns have different names, pass them separately with
# left_on / right_on. The rename here only creates that situation.
df_left.join_asof(
    df_right.rename({"int_id": "int_id_right"}),
    left_on="int_id",
    right_on="int_id_right",
)

### Inexact Join Strategies

In [None]:
# Show both input frames again, one after the other, as a reference for the
# strategy examples that follow.
print(df_left, df_right, sep="\n")

In [None]:
# "backward": match the last right row whose key is <= the left key.
# tolerance=3 rejects matches whose keys are more than 3 apart.
df_left.join_asof(
    df_right,
    on="int_id",
    tolerance=3,
    strategy="backward",
)

In [None]:
# "forward": match the first right row whose key is >= the left key, again
# limited to a key distance of at most 3.
df_left.join_asof(
    df_right,
    on="int_id",
    tolerance=3,
    strategy="forward",
)

In [None]:
# "nearest": match the right row whose key is closest to the left key in
# either direction, limited to a key distance of at most 3.
df_left.join_asof(
    df_right,
    on="int_id",
    tolerance=3,
    strategy="nearest",
)

### Additional Finetuning

### Use Case: Marketing Campaign Attribution

In [None]:
# scan_csv() builds a LazyFrame; nothing is read until collect() is called.
campaigns = pl.scan_csv("data/campaigns.csv")
# Materialize only the first row to peek at the columns.
campaigns.head(1).collect()

In [None]:
# Distinct "Product Type" values present in the campaigns data.
campaigns.select(pl.col("Product Type").unique()).collect()

In [None]:
# Lazily scan the transactions file as well.
transactions = pl.scan_csv("data/transactions.csv")
# Materialize only the first row to peek at the columns.
transactions.head(1).collect()

In [None]:
# Expressions that turn the raw timestamp strings into datetimes. The sale
# timestamps carry fractional seconds (%.f) and are cast to microsecond
# precision -- presumably so both join keys share one time unit; confirm.
sale_date = (
    pl.col("Sale Date")
    .str.to_datetime("%Y-%m-%d %H:%M:%S%.f")
    .cast(pl.Datetime("us"))
)
campaign_date = pl.col("Campaign Date").str.to_datetime("%Y-%m-%d %H:%M:%S")

transactions = transactions.with_columns(sale_date)
campaigns = campaigns.with_columns(campaign_date)

# join_asof() needs both sides sorted on their respective key columns.
sorted_sales = transactions.sort("Sale Date")
sorted_campaigns = campaigns.sort("Campaign Date")

# For every sale, look backward in time for the most recent campaign of the
# same product type, accepting it only if it is at most 60 days older than
# the sale.
sales_with_campaign_df = sorted_sales.join_asof(
    sorted_campaigns,
    left_on="Sale Date",
    right_on="Campaign Date",
    by="Product Type",
    strategy="backward",
    tolerance="60d",
).collect()
sales_with_campaign_df

In [None]:
# Mean quantity per (product type, campaign) pair, ordered by those keys.
mean_quantity = sales_with_campaign_df.group_by(
    "Product Type", "Campaign Name"
).agg(pl.col("Quantity").mean())
mean_quantity.sort("Product Type", "Campaign Name")

In [None]:
# Show the campaign rows whose "Product Type" is "Books".
campaigns.filter(pl.col("Product Type") == "Books").collect()

In [None]:
# Book sales that happened after 21:00 on 2023-12-31.
cutoff = pl.lit("2023-12-31 21:00:00").str.to_datetime()
is_book = pl.col("Product Type") == "Books"
after_cutoff = pl.col("Sale Date") > cutoff
transactions.filter(is_book & after_cutoff).collect()

## Vertical and Horizontal Concatenation

### Vertical

In [None]:
# Two frames with the same columns ("id", "value").
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [4, 5],
        "value": ["d", "e"],
    }
)
# "vertical" stacks the rows of df2 underneath those of df1.
pl.concat([df1, df2], how="vertical")

### Horizontal

In [None]:
# df2 contributes a new column but has one row fewer than df1.
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "value2": ["x", "y"],
    }
)
# "horizontal" places the columns side by side; the shorter frame is padded
# with nulls.
pl.concat([df1, df2], how="horizontal")

### Diagonal

In [None]:
# The frames share only the "value" column; "id" and "value2" each exist in
# just one of them.
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "value": ["d", "e"],
        "value2": ["x", "y"],
    }
)
# "diagonal" stacks the rows over the union of all columns, filling columns a
# frame does not have with nulls.
pl.concat([df1, df2], how="diagonal")

### Align

In [None]:
# "value" is the only column the two frames have in common; they share the
# values "a" and "c" in it.
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "value": ["a", "c", "d"],
        "value2": ["x", "y", "z"],
    }
)
# "align" combines the frames horizontally, lining rows up on the common
# column(s) instead of on row position.
pl.concat([df1, df2], how="align")

In [None]:
# Both frames contain id 2 twice; df1 additionally has id 1.
df1 = pl.DataFrame(
    {
        "id": [1, 2, 2],
        "value": ["a", "c", "b"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [2, 2],
        "value": ["x", "y"],
    }
)
# align_frames() aligns the given frames on the values of "id" and returns
# the aligned frames rather than one combined frame.
pl.align_frames(df1, df2, on="id")

### Relaxed

In [None]:
# This raises a SchemaError:
# df1 = pl.DataFrame(
#     {
#         "id": [1, 2, 3],
#         "value": ["a", "b", "c"],
#     }
# )
# df2 = pl.DataFrame(
#     {
#         "id": [4.0, 5.0],
#         "value": [1, 2],
#     }
# )
# pl.concat([df1, df2], how="vertical")

In [None]:
# Define the mismatched frames in this cell. The definitions in the preceding
# cell are commented out, so without these df1/df2 would still be the frames
# from the align_frames() example, whose schemas already agree.
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [4.0, 5.0],
        "value": ["d", "e"],
    }
)
# "vertical_relaxed" stacks the rows like "vertical", but first casts
# mismatched columns to their common supertype: the integer and float "id"
# columns are combined as floats.
pl.concat([df1, df2], how="vertical_relaxed")

### Stacking

In [None]:
# Two frames with identical columns.
df1 = pl.DataFrame(
    {
        "id": [1, 2],
        "value": ["a", "b"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [3, 4],
        "value": ["c", "d"],
    }
)
# vstack() returns a frame with df2's rows stacked underneath df1's.
df1.vstack(df2)

In [None]:
# df2 has the same number of rows as df1 and a column name df1 lacks.
df1 = pl.DataFrame(
    {
        "id": [1, 2],
        "value": ["a", "b"],
    }
)
df2 = pl.DataFrame(
    {
        "value2": ["x", "y"],
    }
)
# hstack() returns a frame with df2's columns added to the right of df1's.
df1.hstack(df2)

### Appending

In [None]:
# Two integer Series with different names.
s1 = pl.Series("a", [1, 2])
s2 = pl.Series("b", [3, 4])
# Append the values of s2 after those of s1.
s1.append(s2)

### Extending

In [None]:
# Two frames with identical columns.
df1 = pl.DataFrame(
    {
        "id": [1, 2],
        "value": ["a", "b"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [3, 4],
        "value": ["c", "d"],
    }
)
# extend() adds df2's rows to df1 itself (in place) instead of building a new
# frame out of both.
df1.extend(df2)

## Takeaways