# Chapter 14: Joining and Concatenating

In [1]:
import polars as pl
# Print the installed Polars and dependency versions (the book is built with Polars 1.0.0).
pl.show_versions()

## Joining

### Join Strategies

In [4]:
import polars as pl

# Two small frames sharing a "key" column: "B", "C" and "D" occur in both,
# "A" only on the left and "E" only on the right.
left_columns = {
    "key": ["A", "B", "C", "D"],
    "value": [1, 2, 3, 4],
}
right_columns = {
    "key": ["B", "C", "D", "E"],
    "value": [5, 6, 7, 8],
}

df_left = pl.DataFrame(left_columns)
df_right = pl.DataFrame(right_columns)


In [5]:
# Inner join of the two frames on their shared "key" column.
df_left.join(
    other=df_right,
    on="key",
    how="inner",
)

In [6]:
# Full (outer) join on "key"; right-hand columns that clash with left-hand
# names receive the "_other" suffix.
# `how="outer"` was renamed to `how="full"` and the old spelling is deprecated
# in the Polars version this book targets, so use the current name.
df_left.join(df_right, on="key", how="full", suffix="_other")

In [7]:
# Left join of df_right onto df_left using the "key" column.
df_left.join(
    other=df_right,
    on="key",
    how="left",
)

In [8]:
# Cross join: no join key is supplied.
df_left.join(
    other=df_right,
    how="cross",
)

In [9]:
# Semi join on "key".
df_left.join(
    other=df_right,
    on="key",
    how="semi",
)

In [10]:
# Anti join on "key".
df_left.join(
    other=df_right,
    on="key",
    how="anti",
)

### Joining on Multiple Columns

In [12]:
# People with their city and age.
df_left = pl.DataFrame(
    data={
        "name": ["Alice", "Bob", "Charlie", "Dave"],
        "city": ["NY", "LA", "NY", "SF"],
        "age": [25, 30, 35, 40],
    }
)

# The same people with a department; note Dave's city is "Chicago" here
# but "SF" in df_left.
df_right = pl.DataFrame(
    data={
        "name": ["Alice", "Bob", "Charlie", "Dave"],
        "city": ["NY", "LA", "NY", "Chicago"],
        "department": ["Finance", "Marketing", "Engineering", "Operations"],
    }
)

# Join on the combination of both columns.
join_keys = ["name", "city"]
df_left.join(df_right, on=join_keys, how="inner")

### Validation

In [14]:
# Employees, each pointing at a department via "department_id".
employee_columns = {
    "employee_id": [1, 2, 3, 4],
    "name": ["Alice", "Bob", "Charlie", "Dave"],
    "department_id": [10, 10, 30, 10],
}
df_employees = pl.DataFrame(employee_columns)

# Departments; every "department_id" appears exactly once.
department_columns = {
    "department_id": [10, 20, 30],
    "department_name": ["Information Technology", "Finance", "Human Resources"],
}
df_departments = pl.DataFrame(department_columns)

# Left join with a many-to-one ("m:1") validation requested.
df_employees.join(df_departments, on="department_id", how="left", validate="m:1")

In [15]:
# Redefine the departments so that "department_id" 10 is listed twice.
department_columns = {
    "department_id": [10, 20, 10],
    "department_name": ["Information Technology", "Finance", "Human Resources"],
}
df_departments = pl.DataFrame(department_columns)

# Same validated left join as before, now against the duplicated key.
df_employees.join(df_departments, on="department_id", how="left", validate="m:1")

## Inexact Joining

In [17]:
# Left frame: "value" holds strings here.
df_left = pl.DataFrame(
    data={
        "int_id": [5, 10],
        "value": ["1", "2"],
    }
)

# Right frame: "value" holds integers here.
df_right = pl.DataFrame(
    data={
        "int_id": [4, 7, 12],
        "value": [1, 2, 3],
    }
)

# As-of join on "int_id" with a tolerance of 3. Neither frame has been
# flagged as sorted at this point.
df_left.join_asof(
    df_right,
    on="int_id",
    tolerance=3,
)

In [18]:
# Flag "int_id" as sorted on both frames, then run an as-of join on it
# (this time without a tolerance).
df_left, df_right = (
    frame.set_sorted("int_id") for frame in (df_left, df_right)
)

df_left.join_asof(
    df_right,
    on="int_id",
)

In [19]:
# Give the right-hand key its own name so both key columns can be told apart.
renamed_key = {"int_id": "int_id_right"}
df_right = df_right.rename(renamed_key)

# As-of join using differently named key columns on each side.
df_left.join_asof(df_right, left_on="int_id", right_on="int_id_right")

In [20]:
# As-of join with the "backward" strategy and a tolerance of 3.
df_left.join_asof(
    df_right,
    strategy="backward",
    tolerance=3,
    left_on="int_id",
    right_on="int_id_right",
)

In [21]:
# As-of join with the "forward" strategy and a tolerance of 3.
df_left.join_asof(
    df_right,
    strategy="forward",
    tolerance=3,
    left_on="int_id",
    right_on="int_id_right",
)

In [22]:
# As-of join with the "nearest" strategy and a tolerance of 3.
df_left.join_asof(
    df_right,
    strategy="nearest",
    tolerance=3,
    left_on="int_id",
    right_on="int_id_right",
)

### Additional Fine-Tuning with `tolerance` and `by`

### Use Case: Marketing Campaign Attribution

In [25]:
# Lazily scan the marketing campaigns CSV and preview its first row.
# `LazyFrame.fetch` is deprecated as of Polars 1.0; `head(n).collect()` is the
# supported way to peek at a few rows and yields the same preview for a plain scan.
marketing_lf = pl.scan_csv("data/marketing use case/marketing_campaigns.csv")
marketing_lf.head(1).collect()

In [26]:
# Collect the distinct values of the "Product Type" column.
product_types = pl.col("Product Type").unique()
marketing_lf.select(product_types).collect()

In [27]:
# Lazily scan the sales CSV and preview its first row.
# `LazyFrame.fetch` is deprecated as of Polars 1.0; `head(n).collect()` is the
# supported way to peek at a few rows and yields the same preview for a plain scan.
sales_lf = pl.scan_csv("data/marketing use case/sales_data.csv")
sales_lf.head(1).collect()

In [28]:
# Parse the timestamp strings into datetimes; "Sale Date" (which carries
# fractional seconds) is additionally cast to microsecond precision.
sale_date = (
    pl.col("Sale Date")
    .str.to_datetime("%Y-%m-%d %H:%M:%S%.f")
    .cast(pl.Datetime("us"))
)
campaign_date = pl.col("Campaign Date").str.to_datetime("%Y-%m-%d %H:%M:%S")

sales_lf = sales_lf.with_columns(sale_date)
marketing_lf = marketing_lf.with_columns(campaign_date)

# Sort both sides on their date key, then as-of join each sale to a campaign
# of the same "Product Type" using the "backward" strategy and a "60d" tolerance.
sales_by_date = sales_lf.sort("Sale Date")
campaigns_by_date = marketing_lf.sort("Campaign Date")

sales_with_campaign_df = sales_by_date.join_asof(
    campaigns_by_date,
    left_on="Sale Date",
    right_on="Campaign Date",
    by="Product Type",
    strategy="backward",
    tolerance="60d",
).collect()
sales_with_campaign_df

In [29]:
# Mean "Quantity" per product type and campaign, ordered by the grouping keys.
group_keys = ["Product Type", "Campaign Name"]
mean_quantity = pl.col("Quantity").mean()
sales_with_campaign_df.group_by(group_keys).agg(mean_quantity).sort(group_keys)

In [30]:
# Show only the campaigns whose "Product Type" is "Books".
is_books = pl.col("Product Type") == "Books"
marketing_lf.filter(is_books).collect()

In [31]:
# Book sales made after 2023-12-31 21:00:00.
is_books = pl.col("Product Type") == "Books"
cutoff = pl.lit("2023-12-31 21:00:00").str.to_datetime()
after_cutoff = pl.col("Sale Date") > cutoff

sales_lf.filter(is_books & after_cutoff).collect()

## Vertical and Horizontal Concatenation

In [33]:
# Two frames with the same columns ("id", "value").
df1 = pl.DataFrame(
    data={
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    data={
        "id": [4, 5],
        "value": ["d", "e"],
    }
)

# Vertical concatenation of the two frames.
pl.concat(items=[df1, df2], how="vertical")

In [34]:
# df1 has three rows; df2 has a single, differently named column with two rows.
df1 = pl.DataFrame(
    data={
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    data={
        "value2": ["x", "y"],
    }
)

# Horizontal concatenation of the two frames.
pl.concat(items=[df1, df2], how="horizontal")

In [35]:
# The frames share only the "value" column; "id" and "value2" each exist
# in just one of them.
df1 = pl.DataFrame(
    data={
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    data={
        "value": ["d", "e"],
        "value2": ["x", "y"],
    }
)

# Diagonal concatenation of the two frames.
pl.concat(items=[df1, df2], how="diagonal")

In [36]:
# The frames share the "value" column, with "a" and "c" present in both.
df1 = pl.DataFrame(
    data={
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    data={
        "value": ["a", "c", "d"],
        "value2": ["x", "y", "z"],
    }
)

# Concatenation using the "align" strategy.
pl.concat(items=[df1, df2], how="align")

In [37]:
# Same column names, different value types: "id" is integer in df1 and float
# in df2, "value" is string in df1 and integer in df2.
df1 = pl.DataFrame(
    data={
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    data={
        "id": [4.0, 5.0],
        "value": [1, 2],
    }
)

# Plain vertical concatenation of the mismatched frames.
pl.concat(items=[df1, df2], how="vertical")

In [38]:
# Concatenate the same two frames with the "vertical_relaxed" strategy.
pl.concat(
    items=[df1, df2],
    how="vertical_relaxed",
)

In [39]:
# Both frames contain repeated "id" values (2 appears twice in each).
df1 = pl.DataFrame(
    data={
        "id": [1, 2, 2],
        "value": ["a", "c", "b"],
    }
)
df2 = pl.DataFrame(
    data={
        "id": [2, 2],
        "value": ["x", "y"],
    }
)

# Align the two frames on "id".
pl.align_frames(
    df1,
    df2,
    on="id",
)

In [40]:
# Two frames with identical columns.
top_rows = {
    "id": [1, 2],
    "value": ["a", "b"],
}
bottom_rows = {
    "id": [3, 4],
    "value": ["c", "d"],
}
df1 = pl.DataFrame(top_rows)
df2 = pl.DataFrame(bottom_rows)

# Stack df2 underneath df1.
df1.vstack(other=df2)

In [41]:
# df2 contributes one extra column with the same number of rows as df1.
base_columns = {
    "id": [1, 2],
    "value": ["a", "b"],
}
extra_columns = {
    "value2": ["x", "y"],
}
df1 = pl.DataFrame(base_columns)
df2 = pl.DataFrame(extra_columns)

# Stack df2's columns next to df1's.
df1.hstack(df2)

In [42]:
# Two integer Series named "a" and "b".
s1 = pl.Series(name="a", values=[1, 2])
s2 = pl.Series(name="b", values=[3, 4])

# Append s2 to s1.
s1.append(s2)

In [43]:
# Two frames with identical columns.
first_rows = {
    "id": [1, 2],
    "value": ["a", "b"],
}
more_rows = {
    "id": [3, 4],
    "value": ["c", "d"],
}
df1 = pl.DataFrame(first_rows)
df2 = pl.DataFrame(more_rows)

# Extend df1 with the rows of df2.
df1.extend(other=df2)

## Conclusion