## This notebook contains functions that can be used to monitor ML models deployed in a production environment.

### To monitor the ML models, we consider two measures:
### 1. PSI (Population Stability Index)
### 2. Measures of central tendency (mean, median and mode)

## Using PSI/CSI

In [1]:
def psi(score_initial, score_new, num_bins = 10, mode = 'fixed'):
    """Compute the per-bucket Population Stability Index of two score samples.

    Both samples are bucketized with the same bin edges, the share of samples
    falling in each bucket is computed for each population, and the PSI
    contribution of every bucket is
    ``(percent_train - percent_prod) * ln(percent_train / percent_prod)``.

    Parameters
    ----------
    score_initial : array-like
        Scores of the reference (training) population. Not modified.
    score_new : array-like
        Scores of the new (production) population. Not modified.
    num_bins : int, default 10
        Number of buckets.
    mode : {'fixed', 'quantile'}, default 'fixed'
        'fixed' builds equal-width bins over the combined value range;
        'quantile' builds bins from the quantiles of ``score_initial``.

    Returns
    -------
    tuple
        ``(psi_df, psi_values)`` where ``psi_df`` is a DataFrame indexed by
        bucket label (1..num_bins) with the columns ``train``,
        ``percent_train``, ``prod``, ``percent_prod`` and ``psi``, and
        ``psi_values`` is the ``psi`` column as a NumPy array.

    Raises
    ------
    ValueError
        If ``mode`` is not recognized or either sample is empty.
    """
    epsilon = 1e-4

    # Work on private copies: the caller's data must not be reordered, and
    # the bucket counts below do not depend on the order of the samples.
    initial_scores = np.array(score_initial)
    new_scores = np.array(score_new)
    if initial_scores.size == 0 or new_scores.size == 0:
        raise ValueError("score_initial and score_new must both be non-empty")

    # Prepare the bins
    min_val = min(initial_scores.min(), new_scores.min())
    max_val = max(initial_scores.max(), new_scores.max())
    if mode == 'fixed':
        bins = [min_val + (max_val - min_val)*(i)/num_bins for i in range(num_bins+1)]
    elif mode == 'quantile':
        # Create the quantiles based on the initial population
        bins = pd.qcut(initial_scores, q = num_bins, retbins = True)[1]
    else:
        raise ValueError(f"Mode \'{mode}\' not recognized. Your options are \'fixed\' and \'quantile\'")
    # Widen the outer edges so the extreme values of both samples fall inside
    # the first/last bucket (pd.cut intervals are open on the left).
    bins[0] = min_val - epsilon
    bins[-1] = max_val + epsilon

    bucket_labels = list(range(1, num_bins + 1))

    # Bucketize the initial population and count the samples inside each bucket.
    # observed=False keeps empty buckets in the result regardless of the
    # pandas default, so they can be smoothed with epsilon below.
    bins_initial = pd.cut(initial_scores, bins = bins, labels = bucket_labels)
    df_initial = pd.DataFrame({'train': initial_scores, 'bin': bins_initial})
    grp_initial = df_initial.groupby('bin', observed = False).count()
    grp_initial['percent_train'] = grp_initial['train'] / grp_initial['train'].sum()

    # Bucketize the new population and count the samples inside each bucket
    bins_new = pd.cut(new_scores, bins = bins, labels = bucket_labels)
    df_new = pd.DataFrame({'prod': new_scores, 'bin': bins_new})
    grp_new = df_new.groupby('bin', observed = False).count()
    grp_new['percent_prod'] = grp_new['prod'] / grp_new['prod'].sum()

    # Align the two populations bucket by bucket (both are indexed by 'bin')
    psi_df = grp_initial.join(grp_new, how = "inner")

    # Replace zero shares with a small value so the log and the ratio stay finite
    psi_df['percent_train'] = psi_df['percent_train'].mask(psi_df['percent_train'] == 0, epsilon)
    psi_df['percent_prod'] = psi_df['percent_prod'].mask(psi_df['percent_prod'] == 0, epsilon)

    # Calculate the psi contribution of each bucket
    psi_df['psi'] = (psi_df['percent_train'] - psi_df['percent_prod']) * np.log(psi_df['percent_train'] / psi_df['percent_prod'])

    return psi_df,psi_df['psi'].values

### Function to plot the bucket wise distribution

In [None]:
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
def plot_train_vs_prod(df:pd.DataFrame,train_distribution_col:str,prod_distribution_col:str,title:str)->None:
    """Draw a bar chart comparing two distribution columns of ``df`` side by side.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding one row per bucket; its index is used for the x axis.
    train_distribution_col : str
        Name of the column with the training distribution.
    prod_distribution_col : str
        Name of the column with the production distribution.
    title : str
        Prefix of the chart title; " train vs prod" is appended.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    figure_size = (15, 15)
    # Pass the size to the plot call itself: setting rcParams after the
    # figure exists only affects figures created later, not this one.
    ax = df[[train_distribution_col,prod_distribution_col]].plot(kind = 'bar', figsize = figure_size)
    ax.set_title(f"{title} train vs prod")
    ax.set_ylabel("score frequency")
    ax.set_xlabel("score buckets")
    # Kept from the original implementation so that figures created after
    # this call still default to the same size as before.
    plt.rcParams["figure.figsize"] = figure_size

### Function to just calculate PSI when bucketization is already done.

In [None]:
def calculate_psi_overall(training_probs,prod_probs,epsilon=1e-4):
    """Return the average per-bucket PSI term of two already-bucketized distributions.

    For every bucket ``i`` the term
    ``(training_probs[i] - prod_probs[i]) * ln(training_probs[i] / prod_probs[i])``
    is computed; the terms are summed and divided by the number of buckets
    in ``training_probs``.

    Parameters
    ----------
    training_probs : sequence of float
        Bucket shares of the training population.
    prod_probs : sequence of float
        Bucket shares of the production population, indexed like
        ``training_probs``.
    epsilon : float, default 1e-4
        Value substituted for a share that is exactly zero, so that the
        ratio and the logarithm stay defined.

    Returns
    -------
    float
        Sum of the per-bucket terms divided by ``len(training_probs)``.
    """
    from math import log

    bucket_count = len(training_probs)
    total = 0
    for i in range(bucket_count):
        # A zero share would make the ratio or its logarithm undefined.
        train_share = epsilon if training_probs[i] == 0 else training_probs[i]
        prod_share = epsilon if prod_probs[i] == 0 else prod_probs[i]
        total += (train_share - prod_share) * log(train_share / prod_share)
    return total / bucket_count

### Function to calculate the KL divergence when bucketization is already done.

In [None]:
def calculate_kl_divergence_overall(trained_scores,prod_scores,epsilon=1e-4):
    """Return the average per-bucket KL-divergence term of two bucketized distributions.

    For every bucket ``i`` the term
    ``trained_scores[i] * ln(trained_scores[i] / prod_scores[i])`` is
    computed; the terms are summed and divided by the number of buckets in
    ``trained_scores``.

    Parameters
    ----------
    trained_scores : sequence of float
        Bucket shares of the training population.
    prod_scores : sequence of float
        Bucket shares of the production population, indexed like
        ``trained_scores``.
    epsilon : float, default 1e-4
        Value substituted for a share that is exactly zero, so that the
        ratio and the logarithm stay defined.

    Returns
    -------
    float
        Sum of the per-bucket terms divided by ``len(trained_scores)``.
    """
    from math import log

    bucket_count = len(trained_scores)
    total = 0
    for i in range(bucket_count):
        # A zero share would make the ratio or its logarithm undefined.
        train_share = epsilon if trained_scores[i] == 0 else trained_scores[i]
        prod_share = epsilon if prod_scores[i] == 0 else prod_scores[i]
        total += train_share * log(train_share / prod_share)
    return total / bucket_count

## Using the measures of central tendency