## Walkthrough of how to output helpful insights for ML models

In [1113]:
import datarobotx as drx
import datarobot as dr
import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta

from feature_impact_helpers import *
from lift_chart_helpers import *
from insights_over_time_helpers import *
from data_table_helpers import *
from error_metric_helpers import *

from importlib import reload
%reload_ext autoreload
%autoreload 2

import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and pandas warnings
# are hidden too — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Authenticate credentials

In [173]:
# Create the DataRobot API client used by the `dr.*` calls below.
# NOTE(review): no endpoint or token is passed here, so credentials presumably
# come from environment variables or a drconfig.yaml file — confirm.
dr.Client()

<datarobot.rest.RESTClientObject at 0x134915750>

## 1. Retrieve and prep data

In [1107]:
# DataRobot identifiers for this walkthrough. The IDs in the trailing comments
# are alternative values left for reference (TODO confirm they are still needed).
# NOTE(review): model_id is not referenced again in the cells shown here.
project_id = '64306ff55e9a61743b67f2cf' #'64289f6808f30af7340d660c'
model_id = '643188a7cc3f5dc763b27dcc' #'6428a0d8763f72552338e7d6' 
deployment_id = '6431e17c9ef7e6fca717bb92'

# grab DataRobot project
project = dr.Project.get(project_id)

In [1108]:
# Download the project's feature discovery dataset to a local CSV file, then
# read that file into a DataFrame.
filename = 'feature_discovery_data.csv'
project.download_feature_discovery_dataset(file_name=filename)
df = pd.read_csv(filename)

In [1109]:
# keep only rows whose Prediction_Point is on or after 2020-01-01
# NOTE(review): this comment previously read "after June 2021", which did not
# match the cutoff in the code — confirm which date is intended.
df2 = df.loc[pd.to_datetime(df['Prediction_Point'])>=pd.to_datetime('2020-01-01'), :]

In [1111]:
# Write the filtered rows to disk without the index column.
# NOTE(review): the file is not read back in the cells shown here — presumably
# it is kept for scoring outside the notebook (TODO confirm).
df2.to_csv('rows_to_score.csv',index=False)

### Retrieve DRX model object and deploy model

In [902]:
# Build the DRX model object from the existing DataRobot project ID
drx_model = drx.AutoMLModel.from_project_id(project_id)

VBox()

In [903]:
# Reuse the configured deployment when an ID is set; otherwise deploy the DRX
# model under a name stamped with the current date and time.
if not deployment_id:
    today = datetime.datetime.now()
    drx_deployment = drx_model.deploy(name=f'Customer Churn - {today}')
else:
    drx_deployment = drx.Deployment.from_url(f'https://app.datarobot.com/deployments/{deployment_id}/')

### Get batch predictions and prediction explanations

In [904]:
# NOTE(review): this sets a private attribute on the DRX context; presumably it
# routes scoring away from the real-time endpoint (batch predictions) — confirm.
drx.Context()._deployments_use_rt_endpoint=False
# Score the filtered rows through the deployment, requesting at most 20
# prediction explanations.
preds = drx_deployment.predict_proba(
    X=df2, 
    max_explanations=20
)

VBox()

In [905]:
# Attach the original feature columns to the predictions. The two frames are
# joined on their indexes, so df2's index is first reset to 0..n-1.
# assumes preds has a default 0..n-1 index in the same row order as df2 — TODO confirm
preds2 = pd.merge(
    preds,
    df2.reset_index(drop=True),
    left_index=True,
    right_index=True,
    how='left',
)

### Use DRX helper functions to melt and reshape our predictions

In [906]:
# Melt the prediction explanations, passing every original df2 column plus
# 'class_1' as id_vars (presumably yields one row per explanation — confirm).
melted = drx.melt_explanations(preds2, id_vars=list(df2.columns)+['class_1'])
# NOTE(review): `reshaped` is not used in the cells shown here.
reshaped = drx.featurize_explanations(preds2)

## 2. Now it's time to create our insights

### Plot feature impact

In [907]:
# aggregate feature impact, plotted from the melted prediction explanations
plot_feature_impact(melted, height=500)

In [908]:
# feature impact split by strength
# (n=25 presumably limits how many features are shown — confirm in the helper)
plot_signed_feature_impact(melted, height=500, n=25)

In [915]:
# signed feature impact restricted to a single industry
# Alternative kept for reference: restrict to positive-class predictions instead.
# threshold = 0.15
# positive_preds = melted.loc[melted['class_1']>=threshold, :].copy()
industry='banking'
# .copy() gives the plot helper its own frame rather than a slice of `melted`
industry_preds = melted.loc[melted['Industry']==industry, :].copy()
plot_signed_feature_impact(industry_preds, height=500, n=25)

### Plot lift chart and prediction explanations

In [1065]:
# plot lift chart from the merged predictions, using 10 bins
plot_lift_chart(preds2, project_id, bins=10)

In [917]:
# plot lift chart with overlaid prediction explanations
# (12 bins, at most 6 features, legend hidden)
plot_prediction_explanations_and_lift_chart(melted, project_id, showlegend=False, max_features=6, bins=12)

In [1165]:
# Histogram for the 'Customer_Succes_Manager' feature, split by class.
# The project is identified with `project_id`, matching the other plotting
# cells in this notebook.
# NOTE(review): the spelling 'Succes' presumably matches the dataset's column
# name — confirm before "fixing" it.
plot_histogram(
    preds2,
    project_id,
    feature='Customer_Succes_Manager',
    bins=20,
    cutoff=0.3,
    split_by_class=True,
    class_type='predictions',
    showlegend=True,
    height=600,
)

In [943]:
# plot predictions and prediction explanations over time
# date_col is also reused by the plot_values_over_time cells further down
date_col = 'Prediction_Point'
# freq='QS' is a pandas-style frequency alias (quarter start) — presumably the
# helper resamples by it; confirm in the helper
prep_and_plot_pe_over_time(melted, project_id, date_col, freq='QS', showlegend=False, max_features=6, height=600)

### Output pandas df with highlighted prediction explanations

In [1096]:
# Take the target column name from the project itself. No cell shown in this
# notebook defines `target`, so relying on it would fail on a fresh kernel.
target = project.target

# Columns to display in the highlighted prediction-explanation table.
# Commented-out entries are optional columns that are currently excluded.
cols_to_keep = [
    'row_id',
    'Customer_Name', 
    #'Customer_Since_Date',
    #'Renewal_Date',
    'ARR', 
    #'Contract_Duration',
    'Product_Usage[UserID] (1 month unique count)',
    #'Customers[Customer_Succes_Manager] (5 years unique count)',
    'Industry',
    'Employee_Count', 
    'Annual_Revenue', 
    'Products_Purchased', 
    #'Zip_Code', 
    'City', 
    'State',
    target,
    'Latitude', 
    'Longitude',
    #'Customers[Products_Purchased] (latest)',
    'feature_name',
    'strength',
]

In [1097]:
#melted.columns

In [1098]:
# Select the display columns in the order given by cols_to_keep, then render
# the table with its prediction explanations overlaid.
melted_sample = melted[cols_to_keep]
plot_overlaid_prediction_explanations(melted_sample, sample=21)

Unnamed: 0_level_0,Customer_Name,ARR,Product_Usage[UserID] (1 month unique count),Industry,Employee_Count,Annual_Revenue,Products_Purchased,City,State,Churn,Latitude,Longitude
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Advanced Analysis Data,44916,3.0,healthcare,41368,3814084822,enterprise,Toledo,OH,0,41.652914,-83.537817
1,Advanced Application Consulting,89315,2.0,retail,46506,3246251142,starter_pack,Golden,CO,1,12.66562,
2,Advanced Construction Net,290890,1.0,telecom,5371,580268357,business_essentials,Portland,TX,0,27.881883,
3,Advanced Consulting Net,22369,1.0,manufacturing,224389,20591295545,starter_pack,Springtown,PA,1,40.556489,
4,Advanced Future Pacific,104154,2.0,other,79230,4786179478,enterprise,Brunswick,NE,0,42.337469,-97.970608
5,Advanced Future Pacific,424657,2.0,other,79230,4786179478,starter_pack,Brunswick,NE,0,42.337469,-97.970608
6,Advanced Future Pacific,181765,0.0,other,79230,4786179478,starter_pack,Brunswick,NE,0,42.337469,-97.970608
7,Advanced Innovation Federated,194413,2.0,telecom,222709,28549495408,starter_pack,Akron,OH,1,41.083064,-81.518485
8,Advanced Network Medicine,57022,2.0,manufacturing,58299,6041523250,starter_pack,West Mineral,KS,1,37.284225,-94.923018
9,Advanced Omega,99865,3.0,financial services,159844,23454823203,business_essentials,Maitland,MO,0,40.200826,-95.074419


## 3. Let's look at how our features may have drifted over time

In [1042]:
# Values of the 'Product_Usage (1 month count)' feature over time at monthly
# frequency ('MS'), with class_type set to 'actuals'.
plot_values_over_time(
    melted, 
    project_id=project_id,
    freq='MS',
    date_col=date_col, 
    feature='Product_Usage (1 month count)',
    class_type='actuals',
    showlegend=True,
)

In [1100]:
# Values of the 'Products_Purchased' feature over time at quarterly frequency
# ('QS'); class_type is not passed here, so the helper's default applies.
plot_values_over_time(
    melted, 
    project_id=project_id,
    freq='QS',
    date_col=date_col, 
    feature='Products_Purchased',
    showlegend=True,
)

In [1159]:
# Error plot for the 'Industry' feature using log loss as the metric.
# The project is identified with `project_id`, matching the other plotting
# cells in this notebook.
# NOTE(review): `log_loss` is not imported explicitly in this notebook —
# presumably it arrives through one of the star imports; confirm.
plot_error(
    preds2,
    project_id,
    feature='Industry',
    metric=log_loss,
    bins=10,
    showlegend=True,
)