In [1]:
import pandas as pd
import numpy as np

import featuretools as ft

from woodwork.logical_types import Categorical, Boolean, Datetime, Double

# from featuretools_sklearn_transformer import DFSTransformer => did not work
from featuretools.selection import (
    remove_highly_correlated_features,
    remove_highly_null_features,
    remove_single_value_features,
)

from datetime import datetime
from uuid import uuid4
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
import xgboost as xgb

from scipy.stats import uniform, randint


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Any
from enum import Enum
from dataclasses import dataclass

# Show every column when a DataFrame is displayed.
# `pd.option_context` only takes effect inside a `with` block, so calling it bare
# changed nothing; `set_option` applies the setting for the whole session.
pd.set_option("display.max_columns", None)

<pandas._config.config.option_context at 0x107800190>

In [None]:
# Data source: https://github.com/IBM/telco-customer-churn-on-icp4d
URL_TO_DATA = (
    "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/"
    "master/data/Telco-Customer-Churn.csv"
)

# Reference date that is combined with `tenure` to derive the synthetic
# ContractStartDate column.
BACK_COUNT_DATE = datetime(2022, 1, 1)

# Train / validation / test split settings.
RANDOM_STATE = 42
TEST_SIZE = 0.2
VALID_SIZE = 0.25  # applied to what is left after the test split

# SimpleImputer strategy used for numeric columns.
NUMERIC_TRANSFORMER_REPLACEMENT = "median"

In [None]:
# Preprocessing shared by all numeric columns: fill NaN values using the
# configured strategy, then standardise with StandardScaler.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy=NUMERIC_TRANSFORMER_REPLACEMENT, missing_values=np.nan)),
        ("scaler", StandardScaler()),
    ]
)


def get_cols_by_type(
    in_df: pd.DataFrame, types_to_search: list, exclude_ids: bool = True
) -> list:
    """Return the labels of the columns of ``in_df`` that have one of the given dtypes.

    Args:
        in_df: Frame whose columns are inspected.
        types_to_search: Dtypes forwarded to ``DataFrame.select_dtypes(include=...)``.
        exclude_ids: When True (default), drop columns whose name ends with ``"ID"``.

    Returns:
        The matching column labels, in the frame's column order.
    """
    cols = in_df.select_dtypes(include=types_to_search).columns.to_list()

    if not exclude_ids:
        return cols

    # Column labels are not necessarily strings (e.g. integer labels); only string
    # labels can carry the "ID" suffix, everything else is kept.
    return [
        col for col in cols if not (isinstance(col, str) and col.endswith("ID"))
    ]

- ft titanic https://www.kaggle.com/code/liananapalkova/automated-feature-engineering-for-titanic-dataset/notebook
- the dataset https://github.com/IBM/telco-customer-churn-on-icp4d/blob/master/data/Telco-Customer-Churn.csv
- titanic https://medium.com/dataexplorations/tool-review-can-featuretools-simplify-the-process-of-feature-engineering-5d165100b0c3
- time indexing recommendations https://stackoverflow.com/questions/49711987/how-do-i-prevent-data-leakage-with-featuretools




### Normalize into
- customer_df:      customer_id (PK), subscription_id (FK), gender, SeniorCitizen, Partner, Dependents, tenure, Churn
- services_df:      service_id (PK), service_name
- subscription_df:  subscription_id (PK), service_id, customer_id
- billing_df:       billing_id, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges


In [None]:
# If the download fails with CERTIFICATE_VERIFY_FAILED, run "Install Certificates.command";
# see also https://stackoverflow.com/questions/50236117/scraping-ssl-certificate-verify-failed-error-for-http-en-wikipedia-org
# The first CSV column is used as the frame's index.
df = pd.read_csv(URL_TO_DATA, index_col=0)

In [2]:
# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Generate a synthetic time index: the contract start is `tenure` months *before*
# BACK_COUNT_DATE. (The previous `dat - relativedelta(months=-tenure)` subtracted a
# negative offset, which moved the start date forward in time instead.)
df["ContractStartDate"] = [
    BACK_COUNT_DATE - relativedelta(months=int(tenure)) for tenure in df["tenure"]
]

# Identifier columns: customerID mirrors the frame index, the billing and
# subscription IDs are freshly generated UUID strings (one per row).
df["customerID"] = df.index
df["billingID"] = [str(uuid4()) for _ in range(len(df))]
df["subscriptionID"] = [str(uuid4()) for _ in range(len(df))]

# Encode the target as 1/0. An already-encoded 1 is accepted as well, so re-running
# this cell does not silently turn every label into 0.
df["Churn"] = np.where(df["Churn"].isin(["Yes", 1]), 1, 0)

NameError: name 'df' is not defined

In [None]:
# TODO: write a nice dataclass to map featuretools, pandas and sklearn types,
# and add that information here as well.

# Column groups of the flat table. Each list is used as the `additional_columns`
# of the matching entry in `entity_set_columns`.

# Customer attribute columns.
CUSTOMERS = [
    # "customerID",
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
]


# Service subscription columns.
SUBSCRIPTIONS = [
    # "customerID",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]


# Contract and billing columns. The target ("Churn") is deliberately left out.
BILLINGS = [
    # "customerID",
    "tenure",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "MonthlyCharges",
    "TotalCharges",
    # "Churn",
]


class EntitiesEnum(Enum):
    """Names of the dataframes used in the EntitySet.

    The values match the keys of ``entity_set_columns``.
    """

    CUSTOMERS = "customers"
    SUBSCRIPTIONS = "subscriptions"
    BILLINGS = "billings"

    # Backward-compatible alias for the former `cu` member, whose value carried a
    # stray apostrophe ("customer'"). Same value as CUSTOMERS, so Enum treats it
    # as an alias of that member.
    cu = "customers"

@dataclass
class EntitySetColumns:
    """Column layout of one dataframe of the EntitySet."""

    # Name passed as `new_dataframe_name` to `normalize_dataframe`.
    # NOTE(review): the "customers" entry passes None here although the field is typed `str`.
    new_dataframe_name: str
    # Name of the column used as the dataframe's index.
    index: str
    # Non-index columns that belong to this dataframe.
    additional_columns: list


# Column layout per dataframe, keyed by dataframe name. "customers" is the base
# dataframe, so it has no `new_dataframe_name`.
entity_set_columns = dict(
    customers=EntitySetColumns(
        new_dataframe_name=None,
        index="customerID",
        additional_columns=CUSTOMERS,
    ),
    subscriptions=EntitySetColumns(
        new_dataframe_name="subscriptions",
        index="subscriptionID",
        additional_columns=SUBSCRIPTIONS,
    ),
    billings=EntitySetColumns(
        new_dataframe_name="billings",
        index="billingID",
        additional_columns=BILLINGS,
    ),
)

In [None]:
# Separate the predictors from the target.
X = df.drop(columns="Churn")
y = df["Churn"]

# First hold out TEST_SIZE of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE, test_size=TEST_SIZE
)
# ... then take the validation set from the remaining training rows:
# 0.25 x 0.8 = 0.2 of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=RANDOM_STATE, test_size=VALID_SIZE
)

In [None]:
# Woodwork logical type for every column handed to the EntitySet.
variable_types = {
    # gender is a two-level label (Male/Female in the Telco CSV), not a truth
    # value, so it is typed as Categorical rather than Boolean.
    "gender": Categorical,
    # Two-valued flag columns.
    "SeniorCitizen": Boolean,
    "Partner": Boolean,
    "Dependents": Boolean,
    "PhoneService": Boolean,
    "PaperlessBilling": Boolean,
    # Multi-level service and contract attributes.
    "MultipleLines": Categorical,
    "InternetService": Categorical,
    "OnlineSecurity": Categorical,
    "OnlineBackup": Categorical,
    "DeviceProtection": Categorical,
    "TechSupport": Categorical,
    "StreamingTV": Categorical,
    "StreamingMovies": Categorical,
    "Contract": Categorical,
    "PaymentMethod": Categorical,
    # Numeric columns.
    "tenure": Double,
    "MonthlyCharges": Double,
    "TotalCharges": Double,
    # Identifier columns.
    "subscriptionID": Categorical,
    "customerID": Categorical,
    "billingID": Categorical,
}

In [None]:
def entityset_wrapper(
    id: str,
    feature_dataset: pd.DataFrame,
    variable_types: dict,
    entity_set_columns: dict,
) -> ft.EntitySet:
    """Non-generic wrapper that builds the customers EntitySet from one flat table.

    The flat table is registered as the "customers" dataframe, then the
    "billings" and "subscriptions" dataframes are split off from it with
    ``normalize_dataframe`` (in that order).

    Args:
        id: Identifier of the new EntitySet.
        feature_dataset: Flat feature table; it is copied, not modified.
        variable_types: Mapping of column name to Woodwork logical type.
        entity_set_columns: Mapping with the keys "customers", "billings" and
            "subscriptions"; each value provides ``index``,
            ``new_dataframe_name`` and ``additional_columns``.

    Returns:
        The populated EntitySet.

    Raises:
        KeyError: If one of the required keys is missing from
            ``entity_set_columns``.
    """
    es = ft.EntitySet(id=id)

    # Hand over a copy: Woodwork type initialisation may cast columns of the frame
    # it receives, and the caller's frame (X_train) is reused later in the
    # notebook for the baseline model.
    es.add_dataframe(
        dataframe_name="customers",
        index=entity_set_columns["customers"].index,
        logical_types=variable_types,
        dataframe=feature_dataset.copy(),
    )

    # Split the billing and subscription columns off into their own dataframes.
    for key in ("billings", "subscriptions"):
        spec = entity_set_columns[key]
        es.normalize_dataframe(
            base_dataframe_name="customers",
            new_dataframe_name=spec.new_dataframe_name,
            index=spec.index,
            additional_columns=list(spec.additional_columns),
        )

    return es

In [None]:
# Build the EntitySet for the training rows and draw its dataframe graph.
es = entityset_wrapper("customers_train", X_train, variable_types, entity_set_columns)
es.plot()

In [None]:
# es = ft.EntitySet(id="customer_churn")

# es.add_dataframe(
#     dataframe_name="customers",
#     index="customerID",
#     # time_index="ContractStartDate",
#     logical_types=variable_types,
#     dataframe=X_train,
# )

# es.normalize_dataframe(
#     base_dataframe_name="customers",
#     new_dataframe_name="subscriptions",
#     index="subscriptionID",
#     additional_columns=SUBSCRIPTIONS,
# )

# es.normalize_dataframe(
#     base_dataframe_name="customers",
#     new_dataframe_name="billings",
#     index="billingID",
#     additional_columns=BILLINGS,
# )

In [None]:
# Run deep feature synthesis with "customers" as the target dataframe.
# No cutoff times or instance filter are supplied.
feature_matrix, feature_defs = ft.dfs(
    target_dataframe_name="customers",
    entityset=es,
    instance_ids=None,
    cutoff_time=None,
    include_cutoff_time=False,
    agg_primitives=None,
    max_depth=2,
)

feature_matrix.shape

In [None]:
# Encode the synthesised features; returns the encoded matrix together with the
# matching feature definitions.
feature_matrix_enc, features_enc = ft.encode_features(
    feature_matrix=feature_matrix,
    features=feature_defs,
)

In [None]:
# Names of all columns of the encoded feature matrix.
all_features = list(feature_matrix_enc.columns)

In [None]:
# Optional featuretools selection steps. All of them are currently disabled, so
# the pipeline has no steps and is not used by the preprocessor below.
# fmt: off
feature_selector = Pipeline(
    [
        # ("rm_highly_correlated_features", FunctionTransformer(remove_highly_correlated_features)),
        # ("rm_highly_null_features",       FunctionTransformer(remove_highly_null_features)),
        # ("rm_single_value_features",      FunctionTransformer(fun)),
    ]
)
# fmt: on

# float64 / int64 columns of the encoded feature matrix; ID columns are kept.
numeric_features_ft = get_cols_by_type(
    feature_matrix_enc, [np.float64, np.int64], exclude_ids=False
)

# Preprocessor for the featuretools matrix: numeric columns only.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features_ft),
        # ("feature_selector", feature_selector, all_features),
        # ("test", FunctionTransformer(remove_single_value_features), all_features),
    ]
)
preprocessor

In [None]:
# Model on the featuretools feature matrix: preprocessing followed by a random
# forest. `random_state` is pinned (like the train/test split) so that re-running
# the notebook reproduces the same model.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=RANDOM_STATE)),
        # alternative classifier:
        # ("classifier", xgb.XGBClassifier(objective="binary:logistic", random_state=RANDOM_STATE)),
    ]
)
clf.fit(feature_matrix_enc, y_train)
# Scoring is disabled: no feature matrix is built for X_val / X_test in this
# notebook, so the raw frames cannot be fed to this pipeline.
# print("model score: %.3f" % clf.score(X_val, y_val))
# print("model score: %.3f" % clf.score(X_test, y_test))

## Estimating baseline Model

In [None]:
# Baseline preprocessing on the raw training columns. Columns whose name ends with
# "ID" are left out (default of get_cols_by_type).
# fmt: off
one_hot_enc = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        # int64 is selected alongside float64, as for the featuretools matrix, so
        # integer-valued numeric columns are not silently dropped from the baseline.
        ("num", numeric_transformer, get_cols_by_type(in_df=X_train, types_to_search=[np.float64, np.int64])),
        # np.object_ alone selects the object columns; the np.object0 alias is
        # deprecated since NumPy 1.24 and removed in NumPy 2.0.
        ("onehot", one_hot_enc,      get_cols_by_type(in_df=X_train, types_to_search=[np.object_])),
    ]
)
# fmt: on
preprocessor

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model on the raw columns: preprocessing followed by a random forest.
# `random_state` is pinned (like the train/test split) so that the reported scores
# are reproducible across runs.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=RANDOM_STATE)),
        # alternative classifier:
        # ("classifier", xgb.XGBClassifier(objective="binary:logistic", random_state=RANDOM_STATE)),
    ]
)

clf.fit(X_train, y_train)
# Mean accuracy on the validation set, then on the test set.
print("model score: %.3f" % clf.score(X_val, y_val))
print("model score: %.3f" % clf.score(X_test, y_test))

In [None]:
# NOTE(review): looks like a leftover cell used to try out the pre-commit hook;
# it has no role in the analysis - confirm and remove.
print("test pre commit")

____