In [6]:
import datarobot as dr
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Initialise the global DataRobot client. No endpoint or token is passed here, so the
# connection settings must come from outside the notebook (presumably environment
# variables or a drconfig.yaml file — confirm for your setup).
dr.Client()

<datarobot.rest.RESTClientObject at 0x113ff7ed0>

In [52]:
# Read the four demo CSV files (paths are relative to the working directory).
training = pd.read_csv('Customer Churn - Primary Table.csv')
# NOTE(review): despite the `_copy` suffix this frame is read from a different file
# than `training`; it is the one uploaded as the self-join dataset below — confirm
# that 'Secondary Churn.csv' is the intended source.
training_copy = pd.read_csv('Secondary Churn.csv')
usage = pd.read_csv('Product Usage - Churn Demo.csv')
reviews = pd.read_csv('Customer Reviews - Churn Demo.csv')

## Load datasets to DataRobot's AI Catalog

In [53]:
# Date stamp appended to the name of every catalog entry created by this cell.
today = datetime.now().date()


def _register_dataset(frame, display_name):
    """Upload ``frame`` as a new dataset and rename the new entry to ``display_name``."""
    dataset = dr.Dataset.create_from_in_memory_data(frame)
    dataset.modify(name=display_name)
    return dataset


# Primary table.
primary = _register_dataset(training, f'Customers - Primary Churn Dataset - {today}')

# Customer table used for the self join.
primary_copy = _register_dataset(training_copy, f'Customers - Self Join Dataset - {today}')

# "Secondary" dataset: product usage.
secondary = _register_dataset(usage, f'Product Usage - {today}')

# "Tertiary" dataset: customer reviews.
tertiary = _register_dataset(reviews, f'Customer Reviews - {today}')

## Set up project variables (target, join key, and date column)

In [54]:
# Column names reused by the cells below.
date_col = 'Prediction_Point'  # date column (temporal key of the customer table)
series_id = 'Customer_Name'    # key column used to join the datasets
target = 'Churn'               # name of the target column

## Create our project

In [55]:
# Create the project from the primary catalog entry; the name carries the run's date stamp.
project_name = f'Customer Churn with AFE - {today}'
project = dr.Project.create_from_dataset(primary.id, project_name=project_name)

In [56]:
# primary_copy_catalog_id = primary_copy.id
# primary_copy_catalog_version_id = primary_copy.version_id

# secondary_id = secondary.id
# secondary_version_id = secondary.version_id

# tertiary_id = tertiary.id
# tertiary_version_id = tertiary.version_id

## Create featurelists for each of our datasets

In [57]:
def _feature_names_without(dataset, excluded):
    """Return the names of every feature of ``dataset`` that is not listed in ``excluded``."""
    return [
        feature.name
        for feature in dataset.get_all_features()
        if feature.name not in excluded
    ]


# Self-join customer table: leave out the join key and the date column.
primary_features = _feature_names_without(primary_copy, [series_id, date_col])
primary_featurelist = primary_copy.create_featurelist(name='v1', features=primary_features)

# Product usage: leave out the join key only.
secondary_features = _feature_names_without(secondary, [series_id])
secondary_featurelist = secondary.create_featurelist(name='v1', features=secondary_features)

# Customer reviews: leave out the join key and the renewal date.
tertiary_features = _feature_names_without(tertiary, [series_id, 'Renewal_Date'])
tertiary_featurelist = tertiary.create_featurelist(name='v1', features=tertiary_features)

In [58]:
# One DatasetDefinition per catalog entry taking part in feature discovery.
# `identifier` is the name the relationships refer to; each definition pins the
# catalog version, the temporal key column, and the 'v1' featurelist created above.

# Self-join customer table.
primary_dataset_definition = dr.DatasetDefinition(
    identifier='Customers',
    primary_temporal_key=date_col,
    feature_list_id=primary_featurelist.id,
    catalog_id=primary_copy.id,
    catalog_version_id=primary_copy.version_id,
)

# Product usage events.
secondary_dataset_definition = dr.DatasetDefinition(
    identifier='Product_Usage',
    primary_temporal_key='Activity_Timestamp',
    feature_list_id=secondary_featurelist.id,
    catalog_id=secondary.id,
    catalog_version_id=secondary.version_id,
)

# Customer reviews.
tertiary_dataset_definition = dr.DatasetDefinition(
    identifier='CSAT',
    primary_temporal_key='Renewal_Date',
    feature_list_id=tertiary_featurelist.id,
    catalog_id=tertiary.id,
    catalog_version_id=tertiary.version_id,
)

## Define the relationships (self join, CSAT, and product usage) to our primary dataset

In [59]:
def _relationship_to(identifier, window_start, window_unit):
    """Build a Relationship that joins dataset ``identifier`` on ``series_id``.

    The feature derivation window runs from ``window_start`` up to 0, measured in
    ``window_unit``; prediction points are rounded to 1 DAY. No dataset1 identifier
    is given (presumably this means "join from the project's primary dataset" —
    confirm against the DataRobot Relationship documentation).
    """
    return dr.Relationship(
        dataset2_identifier=identifier,
        dataset1_keys=[series_id],
        dataset2_keys=[series_id],
        feature_derivation_window_start=window_start,
        feature_derivation_window_end=0,
        feature_derivation_window_time_unit=window_unit,
        prediction_point_rounding=1,
        prediction_point_rounding_time_unit='DAY',
    )


# Self join against the 'Customers' definition: five-year look-back.
primary_relationship = _relationship_to('Customers', -5, 'YEAR')

# NOTE: the variable ordinals below do not line up with the dataset variables:
# `secondary_relationship` targets 'CSAT' (the tertiary dataset definition) and
# `tertiary_relationship` targets 'Product_Usage' (the secondary one).
secondary_relationship = _relationship_to('CSAT', -270, 'DAY')
tertiary_relationship = _relationship_to('Product_Usage', -180, 'DAY')

## Set dataset definitions and relationships

In [60]:
# Gather the pieces that make up the relationships configuration.
dataset_definitions = [primary_dataset_definition, secondary_dataset_definition, tertiary_dataset_definition]
relationships = [primary_relationship, secondary_relationship, tertiary_relationship]

## Configure optional settings to control which types of features are explored and created

In [61]:
# (setting name, enabled?) pairs, kept in the order they are sent to DataRobot.
_discovery_flags = [
    ('enable_token_counts', True),
    ('enable_day', False),
    ('enable_numeric_minimum', False),
    ('enable_numeric_standard_deviation', False),
    ('enable_numeric_maximum', False),
    ('enable_word_count', False),
    ('enable_latest_value', False),
    ('enable_categorical_num_unique', True),
    ('enable_hour', False),
    ('enable_numeric_kurtosis', False),
    ('enable_day_of_week', False),
    ('enable_numeric_skewness', False),
    ('enable_numeric_median', False),
    ('enable_categorical_statistics', True),
    ('enable_entropy', False),
    ('enable_numeric_sum', False),
    ('enable_latest_without_window', True),
]

# Expand the pairs into the list-of-dicts shape ({'name': ..., 'value': ...}).
feature_discovery_settings = [
    {'name': setting_name, 'value': is_enabled}
    for setting_name, is_enabled in _discovery_flags
]

## Create a relationship configuration object to pass to our DataRobot project

In [62]:
# Register the definitions, relationships, and discovery settings with DataRobot;
# the returned object's id is what the project is given later.
relationship_config = dr.RelationshipsConfiguration.create(
    relationships=relationships,
    dataset_definitions=dataset_definitions,
    feature_discovery_settings=feature_discovery_settings,
)

## Create a partitioning specification object to pass to our DataRobot project

In [63]:
# Date/time partitioning without time-series modeling: three backtests, no holdout.
# The partition column comes from `date_col` (the same variable used as the customer
# table's temporal key) rather than a repeated string literal, so the two cannot drift.
partitioning_spec = dr.DatetimePartitioningSpecification(
    datetime_partition_column=date_col,
    disable_holdout=True,
    number_of_backtests=3,
    use_time_series=False,
)

# Apply the specification to the project; the resulting partitioning object is the
# cell's displayed output.
project.set_datetime_partitioning(datetime_partition_spec=partitioning_spec)

<datarobot.helpers.partitioning_methods.DatetimePartitioning at 0x11e457e90>

## Create advanced options and kick off the project

In [64]:
# Advanced options passed to analyze_and_model below.
# NOTE(review): no 'geometry' column appears anywhere else in this notebook, so
# `primary_location_column` looks like a leftover from another project — confirm
# whether it is needed.
advanced_options = dr.AdvancedOptions(
    primary_location_column='geometry',
    shap_only_mode=True,  # the last cell of this notebook requests SHAP impact
)

In [65]:
# Set the target and attach the relationships configuration. `mode='manual'` is used
# here; Autopilot is started explicitly in a later cell, on a custom featurelist.
# The target comes from the `target` variable defined with the other column names,
# instead of repeating the 'Churn' literal.
project.analyze_and_model(
    target=target,
    relationships_configuration_id=relationship_config.id,
    mode='manual',
    advanced_options=advanced_options,
)

Project(Customer Churn with AFE - 2023-03-27)

In [66]:
# Display the project's URL in the DataRobot web app (rendered as the cell output).
project.get_uri()

'https://app.datarobot.com/projects/6421c64304a2ba9346d56208/models'

## Create a new featurelist and start Autopilot in quick mode

In [67]:
# Find the project's featurelist named 'Informative Features'.
# Indexing with [0] raises IndexError if no featurelist has that name.
_informative_matches = [
    featurelist
    for featurelist in project.get_featurelists()
    if featurelist.name == 'Informative Features'
]
informative_features = _informative_matches[0]

In [68]:
# Columns of the primary table, plus the date-part features named after them.
_primary_and_date_part_features = [
    series_id,
    'Renewal_Date (Year)',
    'Customer_Since_Date (Year)',
    'Customer_Since_Date (Day of Week)',
    'Customer_Since_Date (Month)',
    'Prediction_Point (Day of Week)',
    'Prediction_Point (Month)',
    'Renewal_Date (Day of Week)',
    'Prediction_Point',
]

# Features whose names start with one of the dataset identifiers
# ('Product_Usage', 'CSAT', 'Customers').
_joined_dataset_features = [
    'Product_Usage (days since previous event by Customer_Name) (1 month avg)',
    'Product_Usage (days since previous event by Customer_Name) (1 month missing count)',
    'Product_Usage (days since previous event by Customer_Name) (180 days avg)',
    'Product_Usage (days since previous event by Customer_Name) (180 days missing count)',
    'CSAT (270 days count)',
    'Customers[ARR] (5 years missing count)',
]

# Everything to leave out of the modeling featurelist.
features_to_drop = _primary_and_date_part_features + _joined_dataset_features

In [69]:
# Keep every informative feature that is not on the drop list, preserving order.
_dropped_names = set(features_to_drop)
new_features = [
    feature_name
    for feature_name in informative_features.features
    if feature_name not in _dropped_names
]

In [70]:
# Save the reduced feature set as a project featurelist named 'v1'.
v1 = project.create_featurelist(features=new_features, name='v1')

In [71]:
# Run Autopilot in quick mode on the 'v1' featurelist, without blending the best models.
project.start_autopilot(featurelist_id=v1.id, mode='quick', blend_best_models=False)

In [72]:
# start_autopilot() does not block, so wait for Autopilot to finish before reading
# the leaderboard; otherwise a fresh "Restart & Run All" would pick the top model
# from an empty or incomplete leaderboard. verbosity=0 keeps progress messages out
# of the cell output. If Autopilot is already done this returns right away.
project.wait_for_autopilot(verbosity=0)

top_model = project.get_top_model()
top_model

Model('Light Gradient Boosting on ElasticNet Predictions ')

## Trigger the feature impact calculation

In [75]:
# Request SHAP impact for the top model, then block until the job has a result.
shap_impact_job = dr.ShapImpact.create(
    model_id=top_model.id,
    project_id=project.id,
)
shap_impact = shap_impact_job.get_result_when_complete()