# Examples

## Basic

In [1]:
import polars as pl
from polars_utils import register_extensions
import time
import random

# Register the extensions once; the `.polars_utils` accessor used below is
# only called after this point.
# NOTE(review): presumably repeated registration is harmless, but a single
# call is all this example needs.
register_extensions()

# Create sample DataFrames.
# Both frames share the columns "id", "name" and "mixed":
#   * "id" contains a null on each side,
#   * df2 repeats id 3, so the relationship is one-to-many,
#   * "mixed" is a string column in df1 but an integer column in df2.
df1 = pl.DataFrame({
    "id": [1, 2, 3, 4, None],
    "name": ["A", "B", "C", "D", "E"],
    "value": [10, 20, 30, 40, 50],
    "mixed": ["1", "2", "3", "4", "5"]
})

df2 = pl.DataFrame({
    "id": [1, 2, 3, 3, None],
    "name": ["A", "B", "C", "C", "F"],
    "score": [100, 200, 300, 400, 500],
    "mixed": [1, 2, 3, 4, 5]
})

# Analyze potential join relationships
print("Analyzing join possibilities between df1 and df2:")
df1.polars_utils.join_analysis(df2)

# Perform a join using the identified key. The key has the same name on both
# sides, so `on=` is sufficient.
joined_df = df1.join(df2, on="id", how="inner")

print("\nSample of joined data:")
print(joined_df)

Output()

Analyzing join possibilities between df1 and df2:



Sample of joined data:
shape: (4, 7)
┌─────┬──────┬───────┬───────┬────────────┬───────┬─────────────┐
│ id  ┆ name ┆ value ┆ mixed ┆ name_right ┆ score ┆ mixed_right │
│ --- ┆ ---  ┆ ---   ┆ ---   ┆ ---        ┆ ---   ┆ ---         │
│ i64 ┆ str  ┆ i64   ┆ str   ┆ str        ┆ i64   ┆ i64         │
╞═════╪══════╪═══════╪═══════╪════════════╪═══════╪═════════════╡
│ 1   ┆ A    ┆ 10    ┆ 1     ┆ A          ┆ 100   ┆ 1           │
│ 2   ┆ B    ┆ 20    ┆ 2     ┆ B          ┆ 200   ┆ 2           │
│ 3   ┆ C    ┆ 30    ┆ 3     ┆ C          ┆ 300   ┆ 3           │
│ 3   ┆ C    ┆ 30    ┆ 3     ┆ C          ┆ 400   ┆ 4           │
└─────┴──────┴───────┴───────┴────────────┴───────┴─────────────┘


## Larger dataframes

In [2]:
# Create DataFrames with realistic data patterns
size = 1_000_000  # Base size for df1
match_fraction = 0.7  # Share of df1 IDs that get at least one row in df2
random.seed(42)  # For reproducibility

# Generate unique IDs for df1 and put them in random order
df1_ids = list(range(1, size + 1))
random.shuffle(df1_ids)

# Create df1 with unique IDs
df1 = pl.DataFrame({
    "id": df1_ids,
    "value": [f"value_{i}" for i in range(size)],
    "category": random.choices(["A", "B", "C"], k=size)
})

# Create df2 with repeated IDs (one-to-many relationship).
# Only the first `match_fraction` of the shuffled IDs are used, so the
# remaining IDs never appear in df2. Each selected ID is repeated 1-5 times;
# `random.randint` is drawn once per ID, in the same order as the IDs.
matched_ids = df1_ids[:int(size * match_fraction)]
repeated_ids = [
    matched_id
    for matched_id in matched_ids
    for _ in range(random.randint(1, 5))
]

# Create df2 with repeated IDs
df2 = pl.DataFrame({
    "id": repeated_ids,
    "score": [random.random() * 100 for _ in repeated_ids],
    "status": random.choices(["active", "inactive"], k=len(repeated_ids))
})

# Analyze join possibilities.
# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()
print("Join Analysis:")
df1.polars_utils.join_analysis(df2)
print(f"Analysis completed in {time.perf_counter() - start_time:.2f} seconds")

# Print additional statistics
print("\nDataFrame Statistics:")
print(f"df1 total rows: {len(df1):,}")
print(f"df2 total rows: {len(df2):,}")
print(f"df1 unique IDs: {df1['id'].n_unique():,}")
print(f"df2 unique IDs: {df2['id'].n_unique():,}")


Output()

Join Analysis:


Analysis completed in 2.90 seconds

DataFrame Statistics:
df1 total rows: 1,000,000
df2 total rows: 2,098,758
df1 unique IDs: 1,000,000
df2 unique IDs: 700,000


## Null handling

In [3]:

# Build two frames whose shared key column "customer_id" holds a null on
# each side; "email" additionally contains a null of its own.
customer_columns = {
    "customer_id": [1, 2, None, 4, 5],
    "email": ["a@ex.com", None, "c@ex.com", "d@ex.com", "e@ex.com"],
}
purchase_columns = {
    "customer_id": [1, 2, 3, None, 5],
    "amount": [100, 200, 300, 400, 500],
}

customers = pl.DataFrame(customer_columns)
purchases = pl.DataFrame(purchase_columns)

# Run the join analysis of customers against purchases
print("Join Analysis:")
customers.polars_utils.join_analysis(purchases)

Output()

Join Analysis:


## Type coercion

In [4]:
import polars as pl
from polars_utils import register_extensions

register_extensions()

# The same logical key is stored with different types on each side:
# strings in `users`, integers in `orders`.
user_columns = {
    "user_id": ["1", "2", "3", "4"],  # String IDs
    "name": ["Alice", "Bob", "Charlie", "David"],
}
order_columns = {
    "user_id": [1, 2, 2, 3],  # Integer IDs
    "order_amount": [100, 200, 150, 300],
}

users = pl.DataFrame(user_columns)
orders = pl.DataFrame(order_columns)

# Run the join analysis on the frames as they are, before any casting
print("Join Analysis:")
users.polars_utils.join_analysis(orders)

# Cast the string key to Int64 so both sides share a dtype
users_converted = users.with_columns(pl.col("user_id").cast(pl.Int64))

# Left join keeps every user, including those without orders
joined = users_converted.join(orders, on="user_id", how="left")

print("\nJoined Result:")
print(joined)

print("\nJoined DataFrame Schema:")
print(joined.schema)

Output()

Join Analysis:



Joined Result:
shape: (5, 3)
┌─────────┬─────────┬──────────────┐
│ user_id ┆ name    ┆ order_amount │
│ ---     ┆ ---     ┆ ---          │
│ i64     ┆ str     ┆ i64          │
╞═════════╪═════════╪══════════════╡
│ 1       ┆ Alice   ┆ 100          │
│ 2       ┆ Bob     ┆ 200          │
│ 2       ┆ Bob     ┆ 150          │
│ 3       ┆ Charlie ┆ 300          │
│ 4       ┆ David   ┆ null         │
└─────────┴─────────┴──────────────┘

Joined DataFrame Schema:
Schema({'user_id': Int64, 'name': String, 'order_amount': Int64})
