In [13]:
import holidays
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from utils.config import load_config
from utils.data import download_competition_data

config = load_config("config.yaml")
sklearn.set_config(transform_output="pandas")

train, test = download_competition_data(
    competition_name=config.competition_name,
    data_path=config.data_path,
)
train, test = train.set_index("id"), test.set_index("id")

Config loaded from config.yaml.
Data for competition 'playground-series-s5e8' already exists at 'data/playground-series-s5e8.zip'.


In [14]:
# map months from string to integer
month_map = {
    "jan": 1,
    "feb": 2,
    "mar": 3,
    "apr": 4,
    "may": 5,
    "jun": 6,
    "jul": 7,
    "aug": 8,
    "sep": 9,
    "oct": 10,
    "nov": 11,
    "dec": 12,
}
train["month"] = train["month"].map(month_map)
test["month"] = test["month"].map(month_map)


# fix invalid day/month combinations
def fix_days_and_months(row):
    if row["day"] == 31 and row["month"] in [4, 6, 9, 11]:
        row["day"] = 30
    if row["day"] >= 29 and row["month"] == 2:
        row["day"] = 28
    return row


train = train.apply(fix_days_and_months, axis=1)
test = test.apply(fix_days_and_months, axis=1)


# convert day and month to datetime and add date-related features
train["date"] = pd.to_datetime(
    dict(year=2024, month=train["month"], day=train["day"]), errors="coerce"
)
test["date"] = pd.to_datetime(
    dict(year=2024, month=test["month"], day=test["day"]), errors="coerce"
)
train["day_of_year"] = train["date"].dt.dayofyear
test["day_of_year"] = test["date"].dt.dayofyear
train["day_of_week"] = train["date"].dt.dayofweek
test["day_of_week"] = test["date"].dt.dayofweek
train["week_of_year"] = train["date"].dt.isocalendar().week
test["week_of_year"] = test["date"].dt.isocalendar().week
train["quarter"] = train["date"].dt.quarter
test["quarter"] = test["date"].dt.quarter


# extract season from month
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Autumn"


train["season"] = train["month"].apply(get_season)
test["season"] = test["month"].apply(get_season)


# extract holiday from date
us_holidays = holidays.US(years=2024)
train["is_holiday"] = train["date"].isin(us_holidays).astype(int)
test["is_holiday"] = test["date"].isin(us_holidays).astype(int)


# create cyclical features for date-related features
train["sin_dow"] = np.sin(2 * np.pi * train["day_of_week"] / 7)
train["cos_dow"] = np.cos(2 * np.pi * train["day_of_week"] / 7)
test["sin_dow"] = np.sin(2 * np.pi * test["day_of_week"] / 7)
test["cos_dow"] = np.cos(2 * np.pi * test["day_of_week"] / 7)
train["sin_dom"] = np.sin(2 * np.pi * train["day"] / 31)
train["cos_dom"] = np.cos(2 * np.pi * train["day"] / 31)
test["sin_dom"] = np.sin(2 * np.pi * test["day"] / 31)
test["cos_dom"] = np.cos(2 * np.pi * test["day"] / 31)
train["sin_doy"] = np.sin(2 * np.pi * train["day_of_year"] / 365)
train["cos_doy"] = np.cos(2 * np.pi * train["day_of_year"] / 365)
test["sin_doy"] = np.sin(2 * np.pi * test["day_of_year"] / 365)
test["cos_doy"] = np.cos(2 * np.pi * test["day_of_year"] / 365)
train["sin_woy"] = np.sin(2 * np.pi * train["week_of_year"] / 52)
train["cos_woy"] = np.cos(2 * np.pi * train["week_of_year"] / 52)
test["sin_woy"] = np.sin(2 * np.pi * test["week_of_year"] / 52)
test["cos_woy"] = np.cos(2 * np.pi * test["week_of_year"] / 52)
train["sin_moy"] = np.sin(2 * np.pi * train["month"] / 12)
train["cos_moy"] = np.cos(2 * np.pi * train["month"] / 12)
test["sin_moy"] = np.sin(2 * np.pi * test["month"] / 12)
test["cos_moy"] = np.cos(2 * np.pi * test["month"] / 12)


# drop original date-related features
train.drop(
    columns=["day", "month", "date", "day_of_year", "day_of_week", "week_of_year"],
    inplace=True,
)
test.drop(
    columns=["day", "month", "date", "day_of_year", "day_of_week", "week_of_year"],
    inplace=True,
)


# create flag if the client has been contacted before
# and map pdays to a more meaningful value
train["never_contacted_before"] = train.apply(
    lambda row: 1 if row["pdays"] == -1 else 0, axis=1
)
test["never_contacted_before"] = test.apply(
    lambda row: 1 if row["pdays"] == -1 else 0, axis=1
)
max_pdays = train["pdays"].max()
train["pdays"] = train["pdays"].apply(lambda x: x if x != -1 else max_pdays)
test["pdays"] = test["pdays"].apply(lambda x: x if x != -1 else max_pdays)


# create features based on interactions
train["balance_per_age"] = train["balance"] / (train["age"])
test["balance_per_age"] = test["balance"] / (test["age"])
train["housing_and_loan"] = train.apply(
    lambda row: "yes" if row["housing"] == "yes" and row["loan"] == "yes" else "no",
    axis=1,
)
test["housing_and_loan"] = test.apply(
    lambda row: "yes" if row["housing"] == "yes" and row["loan"] == "yes" else "no",
    axis=1,
)

  train["is_holiday"] = train["date"].isin(us_holidays).astype(int)
  test["is_holiday"] = test["date"].isin(us_holidays).astype(int)


In [15]:
# split datasets into features/target, as well as train/validation
X_train_raw = train.drop(columns=[config.target_column])
y_train_raw = train[config.target_column]
X_test_raw = test.copy()

X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_train_raw, y_train_raw, test_size=0.1, random_state=42, stratify=y_train_raw
)

In [17]:
# sanity check to ensure that the target variable is balanced
features = X_train_raw.columns.tolist()
binary_features = [
    "default",
    "housing",
    "loan",
    "never_contacted_before",
    "is_holiday",
    "housing_and_loan",
]
categorical_features = [
    "job",
    "marital",
    "education",
    "contact",
    "poutcome",
    "quarter",
    "season",
]
numerical_features = [
    "age",
    "balance",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "sin_dow",
    "cos_dow",
    "sin_dom",
    "cos_dom",
    "sin_doy",
    "cos_doy",
    "sin_woy",
    "cos_woy",
    "sin_moy",
    "cos_moy",
    "balance_per_age",
]

# assert every feature is in one of the feature categories
assert set(features) == set(binary_features + categorical_features + numerical_features)

# assert every feature is in only one feature category
assert len(set(numerical_features) & set(binary_features)) == 0
assert len(set(binary_features) & set(categorical_features)) == 0
assert len(set(categorical_features) & set(numerical_features)) == 0

In [18]:
pipeline = Pipeline(
    [
        (
            "column_transform",
            ColumnTransformer(
                [
                    (
                        "binary_encode",
                        OrdinalEncoder(
                            handle_unknown="use_encoded_value", unknown_value=np.nan
                        ),
                        binary_features,
                    ),
                    (
                        "categorical_encode",
                        OneHotEncoder(
                            handle_unknown="ignore",
                            sparse_output=False,
                            drop="first",
                        ),
                        categorical_features,
                    ),
                    (
                        "scale_numerical",
                        StandardScaler(),
                        numerical_features,
                    ),
                ],
                remainder="drop",
                verbose_feature_names_out=False,
            ),
        ),
    ]
)

X_train = pipeline.fit_transform(X_train_raw)
X_val = pipeline.transform(X_val_raw)
X_test = pipeline.transform(X_test_raw)

oe = OrdinalEncoder()
y_train = oe.fit_transform(pd.DataFrame(y_train_raw)).iloc[:, 0]
y_val = oe.transform(pd.DataFrame(y_val_raw)).iloc[:, 0]

In [None]:
X_train.to_csv(config.data_path / config.X_train_file)
y_train.to_csv(config.data_path / config.y_train_file)
X_val.to_csv(config.data_path / config.X_val_file)
y_val.to_csv(config.data_path / config.y_val_file)
X_test.to_csv(config.data_path / config.X_test_file)