In [1]:
from pathlib import Path

import dill as pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Notebook-wide display settings.
# max_columns=None removes the column limit, so wide frames are shown without truncation.
pd.set_option("display.max_columns", None)  # Display all columns
# Apply matplotlib's "ggplot" style sheet to figures created after this point.
plt.style.use("ggplot")

## Transforming test set

This notebook takes all of the cleaning and transformation steps undertaken in the 2_cleaning notebook and applies them to the test set.
Here we have to be careful to avoid data leakage: the imputers, encoder and scaler that were fitted on the training set are only applied to the test set here, never re-fitted on it.

In [2]:
# Data folders, resolved relative to the parent of the current working directory.
processed_data_dir = Path.cwd().parent / "data" / "processed"
interim_data_dir = Path.cwd().parent / "data" / "interim"

# Test split stored in data/interim.
X_test = pd.read_pickle(interim_data_dir / "X_test.pkl")
y_test = pd.read_pickle(interim_data_dir / "y_test.pkl")

# Transformer objects stored in data/interim (presumably fitted on the training
# set in the cleaning notebook - TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with (interim_data_dir / "sales_revenue_imputer.pkl").open("rb") as fh:
    sales_revenue_imputer = pickle.load(fh)
with (interim_data_dir / "employee_count_imputer.pkl").open("rb") as fh:
    employee_count_imputer = pickle.load(fh)

with (interim_data_dir / "OHE.pkl").open("rb") as fh:
    ohe_enc = pickle.load(fh)
with (interim_data_dir / "scaler.pkl").open("rb") as fh:
    scaler = pickle.load(fh)

In [3]:
class CleanXTest:
    """Replay the cleaning and feature-engineering steps on the test features.

    The methods rely on four module-level objects that must already be loaded
    in the notebook: ``sales_revenue_imputer``, ``employee_count_imputer``,
    ``scaler`` and ``ohe_enc``. Only their ``transform`` (and
    ``get_feature_names_out``) methods are called, so nothing is re-fitted on
    the test data.

    Attributes:
        X_test: Working copy of the test features; each step updates or
            replaces it.
        X_test_categorical_ohe: One-hot encoded categorical columns, set by
            ``encode_categorical_features``.
    """

    # Raw date columns that are parsed and then decomposed into parts.
    _DATE_COLS = ("ACCOUNTING_MONTH", "RENEWAL_MONTH", "CONTRACT_START_DATE")
    # Cycle lengths used for the sin/cos encodings. The day period is kept at
    # 30 exactly as before (assumed to mirror the training notebook - TODO confirm).
    _MONTH_PERIOD = 12
    _DAY_PERIOD = 30

    def __init__(self, X_test: pd.DataFrame):
        """Store a private copy of ``X_test``.

        Copying keeps the caller's frame untouched, so the pipeline can be
        re-run on the same input without operating on half-transformed data.
        """
        self.X_test = X_test.copy()

    def impute_cols(self):
        """Fill missing values in the two DNB numeric columns and in SECTOR.

        The numeric columns go through their dedicated imputer; missing
        SECTOR values become the literal category ``"missing"``.
        """
        numeric_imputers = (
            ("DNB_GLOBAL_SALES_REVENUE", sales_revenue_imputer),
            ("DNB_GLOBAL_EMPLOYEE_COUNT", employee_count_imputer),
        )
        for col, imputer in numeric_imputers:
            # Read from the instance's frame (not a notebook global) and hand
            # the imputer a single-column 2-D array.
            imputed = imputer.transform(self.X_test[col].values.reshape(-1, 1))
            # Flatten so the result is assigned as a plain 1-D column.
            self.X_test[col] = np.asarray(imputed).ravel()

        self.X_test["SECTOR"] = self.X_test["SECTOR"].fillna("missing")

    def fill_ARR_zeroes(self):
        """Intentionally a no-op for the test set.

        Filling ARR zeroes is only of interest when a customer has churned,
        and that information is not available at inference time, so nothing
        needs to be handled here.
        """
        pass

    def fix_datatypes(self):
        """Parse the three date columns into pandas datetimes."""
        for col in self._DATE_COLS:
            self.X_test[col] = pd.to_datetime(self.X_test[col], utc=False)

    def decompose_time_features(self):
        """Derive day-difference, year, day and month features from the dates.

        ACCOUNTING_MONTH and RENEWAL_MONTH are overwritten with their month
        number; CONTRACT_START_DATE is dropped once its parts are extracted.
        """
        # Make sure the date columns are datetimes before using ``.dt``.
        self.fix_datatypes()

        self.X_test["DAYS_TO_CONTRACT_END"] = pd.to_timedelta(
            (self.X_test["RENEWAL_MONTH"] - self.X_test["ACCOUNTING_MONTH"]).values
        ).days

        self.X_test["DAYS_FROM_CONTRACT_START"] = pd.to_timedelta(
            (
                self.X_test["ACCOUNTING_MONTH"] - self.X_test["CONTRACT_START_DATE"]
            ).values
        ).days

        # Years
        self.X_test["ACCOUNTING_YEAR"] = self.X_test["ACCOUNTING_MONTH"].dt.year
        self.X_test["RENEWAL_YEAR"] = self.X_test["RENEWAL_MONTH"].dt.year
        self.X_test["CONTRACT_START_DATE_YEAR"] = self.X_test[
            "CONTRACT_START_DATE"
        ].dt.year

        # Days - most of these are just 1; the helper day columns are dropped
        # again in remove_useless_features.
        self.X_test["ACCOUNTING_DAY"] = self.X_test["ACCOUNTING_MONTH"].dt.day
        self.X_test["RENEWAL_DAY"] = self.X_test["RENEWAL_MONTH"].dt.day
        self.X_test["CONTRACT_START_DATE_DAY"] = self.X_test[
            "CONTRACT_START_DATE"
        ].dt.day

        # Months - done last because it overwrites the datetime columns that
        # the year/day extraction above still needs.
        self.X_test["ACCOUNTING_MONTH"] = self.X_test["ACCOUNTING_MONTH"].dt.month
        self.X_test["RENEWAL_MONTH"] = self.X_test["RENEWAL_MONTH"].dt.month
        self.X_test["CONTRACT_START_DATE_MONTH"] = self.X_test[
            "CONTRACT_START_DATE"
        ].dt.month

        self.X_test = self.X_test.drop("CONTRACT_START_DATE", axis=1)

    def _add_cyclical(self, col, period):
        """Add ``<col>_SIN`` and ``<col>_COS`` for ``col`` on a cycle of ``period``."""
        angle = self.X_test[col] / period * 2 * np.pi
        self.X_test[f"{col}_SIN"] = np.sin(angle)
        self.X_test[f"{col}_COS"] = np.cos(angle)

    def circular_time_feature_transform(self):
        """Encode month and day-of-month columns as sin/cos pairs.

        Columns are added in a fixed order (renewal month, accounting month,
        contract start month, contract start day) so the resulting column
        layout stays the same as before.
        """
        self._add_cyclical("RENEWAL_MONTH", self._MONTH_PERIOD)
        self._add_cyclical("ACCOUNTING_MONTH", self._MONTH_PERIOD)
        self._add_cyclical("CONTRACT_START_DATE_MONTH", self._MONTH_PERIOD)
        self._add_cyclical("CONTRACT_START_DATE_DAY", self._DAY_PERIOD)

    def remove_useless_features(self):
        """Drop the survey scores, raw month/day helper columns, two product
        columns and the account id."""
        self.X_test = self.X_test.drop(
            [
                "SURVEY_AVG_CXI_SCORE",
                "SURVEY_AVG_NPS_SCORE",
                "SURVEY_AVG_CASE_MOOD_SCORE",
                "ACCOUNTING_MONTH",
                "RENEWAL_MONTH",
                "CONTRACT_START_DATE_MONTH",
                "CONTRACT_START_DATE_DAY",
                "RENEWAL_DAY",
                "ACCOUNTING_DAY",
                "PRODUCT_TWENTYONE",
                "PRODUCT_TWENTYTWO",
                "SALESFORCEACCOUNTID",
            ],
            axis=1,
        )

    def scale_features(self):
        """Scale the columns the scaler reports and move them to the end.

        The frame's index is reset to a fresh RangeIndex; row order is kept.
        """
        cols_to_scale = list(scaler.get_feature_names_out())
        scaled_X = pd.DataFrame(
            columns=cols_to_scale, data=scaler.transform(self.X_test[cols_to_scale])
        )
        unscaled_X = self.X_test.drop(cols_to_scale, axis=1, errors="ignore")
        self.X_test = pd.concat(
            [unscaled_X.reset_index(drop=True), scaled_X.reset_index(drop=True)],
            axis=1,
        )

    def encode_categorical_features(self):
        """One-hot encode every non-numeric column and append the result.

        The original non-numeric columns are removed; the encoded frame is
        also kept on ``self.X_test_categorical_ohe``.
        """
        X_test_categorical = self.X_test.select_dtypes(exclude="number")
        numeric_part = self.X_test.drop(X_test_categorical.columns, axis=1)

        # Transform once; densify only when the encoder returned a sparse matrix.
        ohe_vals = ohe_enc.transform(X_test_categorical)
        if hasattr(ohe_vals, "toarray"):
            ohe_vals = ohe_vals.toarray()

        self.X_test_categorical_ohe = pd.DataFrame(
            columns=ohe_enc.get_feature_names_out(), data=ohe_vals
        )
        self.X_test = pd.concat(
            [
                numeric_part.reset_index(drop=True),
                self.X_test_categorical_ohe.reset_index(drop=True),
            ],
            axis=1,
        )

    def run_all_transformations(self):
        """Run every step in pipeline order; the result is left on ``self.X_test``."""
        self.impute_cols()
        self.decompose_time_features()
        self.circular_time_feature_transform()
        self.remove_useless_features()
        self.scale_features()
        self.encode_categorical_features()

In [4]:
# Wrap the loaded test features and run every pipeline step in order;
# the transformed frame is left on cleaner.X_test.
cleaner = CleanXTest(X_test)
cleaner.run_all_transformations()

In [5]:
# Rebind X_test to the transformed frame held by the cleaner.
# From here on X_test no longer refers to the frame read from data/interim.
X_test = cleaner.X_test

In [6]:
# Persist the transformed test features. Create the output folder first so the
# write does not fail when data/processed does not exist yet (e.g. fresh checkout).
processed_data_dir.mkdir(parents=True, exist_ok=True)
X_test.to_pickle(processed_data_dir / "X_test_transformed.pkl")