<a href="https://colab.research.google.com/github/inderkaur19/Knowledge_Graphs/blob/main/Untitled22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# Simulating a DataFrame similar to the snippet shared
# Creating a date range and random volume data
np.random.seed(42)
dates = pd.date_range(start="2023-08-08", end="2023-12-29", freq="B")  # Business days as shown in the sample
volume_data = np.random.randint(2000, 12000, size=len(dates))

# Constructing the DataFrame
data_df = pd.DataFrame({
    "pay_date": dates,
    "volume": volume_data,
})

# Adding a 'period' column to simulate training and new periods for PSI calculation
# We'll split the data in half by date as an example
split_date = "2023-10-15"
data_df['period'] = np.where(data_df['pay_date'] < split_date, 'train', 'test')

# Displaying the sample DataFrame
data_df.head(), data_df.tail()


(    pay_date  volume period
 0 2023-08-08    9270  train
 1 2023-08-09    2860  train
 2 2023-08-10    7390  train
 3 2023-08-11    7191  train
 4 2023-08-14    7734  train,
       pay_date  volume period
 99  2023-12-25    8184   test
 100 2023-12-26    5099   test
 101 2023-12-27    8278   test
 102 2023-12-28   10392   test
 103 2023-12-29    5104   test)

In [2]:
import numpy as np
import pandas as pd

class PSICalculator:
    def __init__(self, data, quantiles=10, training_period='train', new_period='test', channel=None, column=None):
        self.data = data
        self.quantiles = quantiles
        self.training_period = training_period
        self.new_period = new_period
        self.channel = channel
        self.column = column

    def filter_data(self):
        if self.channel:
            return self.data[self.data['channel'] == self.channel]
        return self.data

    def calculate_quantiles(self, data, column):
        return pd.qcut(data[column], q=self.quantiles, duplicates='drop', retbins=True)

    def calculate_psi(self):
        filtered_data = self.filter_data()

        train_data = filtered_data[filtered_data['period'] == self.training_period]
        new_data = filtered_data[filtered_data['period'] == self.new_period]

        # Calculate quantiles and get bin breakpoints
        train_binned, breakpoints = self.calculate_quantiles(train_data, self.column)
        new_binned = pd.cut(new_data[self.column], bins=breakpoints)

        # Count values in each bucket
        initial_counts = train_binned.value_counts(sort=False)
        new_counts = new_binned.value_counts(sort=False)

        # Calculate PSI summary DataFrame
        df = pd.DataFrame({
            'Bucket': np.arange(1, len(breakpoints)),  # 1 to number of quantiles
            'Breakpoint Value': breakpoints[1:],       # Exclude the minimum value for bins
            'Initial Count': initial_counts.values,
            'New Count': new_counts.values
        })
        df['Initial Percent'] = df['Initial Count'] / len(train_data)
        df['New Percent'] = df['New Count'] / len(new_data)

        # PSI calculation
        psi_values = (df['Initial Percent'] - df['New Percent']) * np.log(df['Initial Percent'] / df['New Percent'])
        psi = psi_values.sum()

        return psi, df

# Example usage (assuming data_df is prepared as shown earlier):
# psi_calculator = PSICalculator(data=data_df, quantiles=10, training_period='train', new_period='test', column='volume')
# psi_value, summary_df = psi_calculator.calculate_psi()
# print("PSI:", psi_value)
# print(summary_df)


In [4]:
# Example usage would look like this (assuming data is loaded into `data_df`):
psi_calculator = PSICalculator(data=data_df, quantiles=10, training_period='train', new_period='test', column='volume')
psi_value,summary_df = psi_calculator.calculate_psi()
print("PSI:", psi_value)
summary_df

# This code refactoring is designed to provide flexibility for PSI calculation based on user-defined parameters.
# Let me know if you would like me to run this with a sample dataset or make further modifications.


PSI: 0.1825335552296513


Unnamed: 0,Bucket,Breakpoint Value,Initial Count,New Count,Initial Percent,New Percent
0,1,3250.4,5,6,0.102041,0.109091
1,2,4278.6,5,3,0.102041,0.054545
2,3,4850.2,5,2,0.102041,0.036364
3,4,6451.8,5,8,0.102041,0.145455
4,5,7311.0,5,4,0.102041,0.072727
5,6,7702.8,4,3,0.081633,0.054545
6,7,8737.4,5,6,0.102041,0.109091
7,8,9780.0,5,6,0.102041,0.109091
8,9,10801.2,5,10,0.102041,0.181818
9,10,11998.0,5,5,0.102041,0.090909
