In [0]:
import numpy as np
import pandas as pd
import shap
from sklearn.metrics import roc_auc_score

In [0]:
def get_binary_target_summary(df, target_column, scores_col, n_buckets = 10, create_copy = False, feats = None):

    """Returns metrics of the model per score bucket (accuracy, uplift, recall, rolling rent and average values of the variables)

    Parameters:
    df: Master table, stored in a Pandas Data Frame. It must contain the target, the scores, the features, and the columns 'hunit' (counted to get the number of observations) and 'crent' (summed to get the rolling rent).
    target_column: string indicating the name of the column containing the target variable (boolean, or 0/1).
    scores_col: string indicating the name of the column containing the scores of the model.
    n_buckets: Number of buckets to divide the data frame in (for deciles, set this value to 10 and for percentiles to 100).
    create_copy: indicates whether to create a copy in memory of the Data Frame. If False, the Data Frame passed is sorted by score in place and gets a new 'ntile' column.
    feats: If provided, computes the average values of these features for the score buckets (deciles, percentiles, ...).

    Returns:
    Tuple of a data frame containing model performance per bucket, and another one containing average values of the variables per bucket (None if feats is None).

    Raises:
    ValueError: if n_buckets is smaller than 1.

    """

    if n_buckets < 1:
        raise ValueError('n_buckets must be at least 1')

    if create_copy:
        df = df.copy()

    total_obs = df.shape[0]
    total_pos_obs = df[target_column].sum()
    avg_target_occurrence = df[target_column].mean()
    # At least one element per bucket: with fewer rows than buckets the rounded size would be 0
    # and the bucket assignment below would divide by zero.
    n_elements_per_group = max(1, round(total_obs / n_buckets))

    # Bucket 1 holds the highest scores. The rounding leftovers are folded into the last bucket.
    df.sort_values(scores_col, ascending = False, inplace = True)
    df['ntile'] = np.minimum(np.arange(total_obs) // n_elements_per_group + 1, n_buckets)

    # Explicit boolean mask (positional, computed after sorting), so that 0/1 integer targets
    # are not mistaken for column labels.
    is_positive = df[target_column].astype(bool).to_numpy()
    positives = df[is_positive]
    rolling_rent = positives['crent'].sum()

    # number of observations per ntile
    summary_per_ntile = df.groupby('ntile')['hunit'].count().reset_index()
    summary_per_ntile = summary_per_ntile.rename(columns = {'hunit': 'n_obs'})

    # number of positive observations per ntile
    pos_obs_per_ntile = positives.groupby('ntile')['hunit'].count().reset_index()
    pos_obs_per_ntile = pos_obs_per_ntile.rename(columns = {'hunit': 'n_pos_obs'})

    summary_per_ntile = pd.merge(summary_per_ntile, pos_obs_per_ntile, on = 'ntile', how = 'left')
    # Buckets without positives are missing from the right-hand table. Assign the filled column back:
    # an in-place fillna on the column accessor does not update the frame under Copy-on-Write.
    summary_per_ntile['n_pos_obs'] = summary_per_ntile['n_pos_obs'].fillna(0)

    # rolling rent per ntile, as a share of the rent of all positive observations
    rolling_rent_per_ntile = positives.groupby('ntile')['crent'].sum().reset_index()
    rolling_rent_per_ntile = rolling_rent_per_ntile.rename(columns = {'crent': 'rolling_rent'})
    rolling_rent_per_ntile['rolling_rent'] = rolling_rent_per_ntile['rolling_rent'] / rolling_rent

    summary_per_ntile = pd.merge(summary_per_ntile, rolling_rent_per_ntile, on = 'ntile', how = 'left')
    summary_per_ntile['rolling_rent'] = summary_per_ntile['rolling_rent'].fillna(0)

    # standard modelling metrics ('acc_' columns are accumulated from the top bucket downwards)
    acc_pos_obs = summary_per_ntile['n_pos_obs'].cumsum()
    acc_obs = summary_per_ntile['n_obs'].cumsum()
    summary_per_ntile['accuracy'] = summary_per_ntile['n_pos_obs'] / summary_per_ntile['n_obs']
    summary_per_ntile['uplift'] = summary_per_ntile['accuracy'] / avg_target_occurrence
    summary_per_ntile['recall'] = acc_pos_obs / total_pos_obs
    summary_per_ntile['acc_accuracy'] = acc_pos_obs / acc_obs
    summary_per_ntile['acc_uplift'] = summary_per_ntile['acc_accuracy'] / avg_target_occurrence
    summary_per_ntile['acc_rolling_rent'] = summary_per_ntile['rolling_rent'].cumsum()

    if feats is not None:
        avg_feats_per_ntile = df.groupby('ntile')[feats].mean().reset_index()
    else:
        avg_feats_per_ntile = None

    return summary_per_ntile, avg_feats_per_ntile

In [0]:
def get_imp_df(feats, model):

    """Creates data frame with the relative importance of each variable of the model. Please note that this function might not work as is for all models (e.g., lightgbm), due to the difference in getting the feature importances.

    Parameters:
    feats: list containing the feature names, in the same order as model.feature_importances_.
    model: fitted model exposing a feature_importances_ attribute.

    Returns:
    Data Frame with two columns: feature name ('name') and feature importance ('imp'), sorted by decreasing importance. Importances are divided by their total, so they add up to 1.

    """

    imp_df = pd.DataFrame({
        'name': feats,
        'imp': model.feature_importances_
    }).sort_values('imp', ascending = False)
    # Replace the column instead of writing through .loc[:, 'imp']: the normalised values are floats,
    # and an in-place write into an integer column (e.g. split counts) is an incompatible-dtype setitem.
    imp_df['imp'] = imp_df['imp'] / imp_df['imp'].sum()
    return imp_df

In [0]:
def get_shap_df(model, X):

    """Calculates shap values of the features for the observations passed

    Parameters:
    model: model on which the shaps are to be computed. It is passed to shap.TreeExplainer.
    X: Pandas Data Frame, in the same format as the ones passed for model training / predicting.

    Returns:
    Data Frame with as many rows as X and one more column: it contains the shap for each variable (columns named 'shap_<feature>') plus the baseline value (the same for all observations). Rows follow the order of X, but the index is a new default one, not the index of X.

    """

    # No trailing comma here: it would turn the explainer into a 1-tuple and break the calls below.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    # NOTE(review): the layout of shap_values depends on the shap version and the model type.
    # Confirm against the versions in use.
    if isinstance(shap_values, list):
        # one (rows, features) array per class: keep the positive class
        positive_class_shap = shap_values[1]
    elif np.ndim(shap_values) == 3:
        # presumably (rows, features, classes): keep the positive class
        positive_class_shap = shap_values[:, :, 1]
    else:
        # single (rows, features) array: used as is
        positive_class_shap = shap_values

    # expected_value may hold one entry per class or a single value
    expected_value = np.atleast_1d(explainer.expected_value)
    baseline_value = expected_value[1] if expected_value.size > 1 else expected_value[0]

    shap_values_df = pd.DataFrame(positive_class_shap, columns = ['shap_' + i for i in X.columns])
    shap_values_df['baseline_value'] = baseline_value
    return shap_values_df

In [0]:
def add_model_scores(mt, model, feats, target_col, X_train, X_dev, X_test, X_leads):

    """Builds the scored master table, restricted to the observations assigned to a set (train, dev, test or leads)

    Parameters:
    mt: Pandas Data Frame containing the master table, for all sets. It must have the columns 'htent', 'hunit', 'id_date' and 'set'.
    model: model used for scoring, through its predict_proba method.
    feats: list containing the feature names.
    target_col: string indicating the name of the column containing the target variable.
    X_train: Pandas Data Frame containing the features and only the train set.
    X_dev: Pandas Data Frame containing the features and only the dev set.
    X_test: Pandas Data Frame containing the features and only the test set.
    X_leads: Pandas Data Frame containing the features and only the leads set.

    Each X_* frame has to hold the rows of its set in the same order as they appear in mt, since the scores are assigned by position.

    Returns:
    Data Frame with one row per observation of mt whose set is not missing, with the ID columns, the target, the set column, the features and a scores column named preds (second column of predict_proba).

    """

    id_and_label_cols = ['htent', 'hunit', 'id_date', target_col, 'set']
    belongs_to_a_set = ~mt['set'].isna()
    output_df = mt.loc[belongs_to_a_set, id_and_label_cols + feats].copy()

    features_by_set = (
        ('train', X_train),
        ('dev', X_dev),
        ('test', X_test),
        ('leads', X_leads),
    )
    for set_name, set_features in features_by_set:
        positive_class_proba = model.predict_proba(set_features)[:, 1]
        output_df.loc[output_df['set'] == set_name, 'preds'] = positive_class_proba

    return output_df

In [0]:
def get_performance_dict(mt_with_scores, target_col, feats, model):

    """Evaluates the performance of the model, using get_binary_target_summary and get_imp_df.

    Parameters:
    mt_with_scores: Pandas Data Frame as returned by add_model_scores.
    target_col: string indicating the name of the column containing the target variable.
    feats: list containing the feature names.
    model: model whose feature importances are reported.

    Returns:
    Dictionary with five elements, each of them being a data frame:
    - 'train': Performance per decile (accuracy, uplift, recall, ...) for the training set, as returned by get_binary_target_summary.
    - 'dev': Performance per decile (accuracy, uplift, recall, ...) for the dev set, as returned by get_binary_target_summary.
    - 'test': Performance per decile (accuracy, uplift, recall, ...) for the test set, as returned by get_binary_target_summary.
    - 'top_vars': Data Frame with feature importances, as returned by get_imp_df.
    - 'avg_values_per_decile': Data Frame with average feature values per decile, as returned by get_binary_target_summary for the last set evaluated (test).

    """
    performance_dict = {}
    avg_feats = None
    for set_name in ['train', 'dev', 'test']:
        # Filter with the frame received as argument (not a global master table), and let the helper
        # work on a copy, since it sorts and adds a column to the frame it is given.
        performance_dict[set_name], avg_feats = get_binary_target_summary(
            df = mt_with_scores.loc[mt_with_scores['set'] == set_name],
            target_column = target_col,
            scores_col = 'preds',
            n_buckets = 10,
            create_copy = True,
            feats = feats,
        )

    performance_dict['top_vars'] = get_imp_df(feats, model)
    # avg_feats is overwritten on every iteration, so only the one of the test set is kept
    performance_dict['avg_values_per_decile'] = avg_feats

    return performance_dict

In [0]:
def get_leads_df(mt_with_scores):
    """Extracts the leads from the scored master table

    Parameters:
    mt_with_scores: Pandas Data Frame as returned by add_model_scores.

    Returns:
    Pandas Data Frame with the rows whose set is 'leads', containing only the ID columns ('htent', 'hunit', 'id_date'), the set column and the scores ('preds').

    """
    output_cols = ['htent', 'hunit', 'id_date', 'set', 'preds']
    is_lead = mt_with_scores['set'] == 'leads'
    return mt_with_scores.loc[is_lead, output_cols]