In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Yearly totals for all financial transactions and for UPI alone,
# covering 2016-17 through 2022-23 (one list entry per year).
data = {
    "Year": [
        "2016-17", "2017-18", "2018-19", "2019-20",
        "2020-21", "2021-22", "2022-23",
    ],
    "Total Financial Txn Volume (Mn)": [
        7138.40, 9857.60, 16806.25, 26493.23, 37512.73, 65299.45, 105412.75,
    ],
    "UPI Txn Volume (Mn)": [
        17.86, 915.23, 5353.40, 12518.62, 22330.65, 39440.44, 81622.10,
    ],
    "Total Financial Txn Value (Bn)": [
        96626.07, 113552.76, 136719.23, 160923.65, 165529.97, 236828.28, 321919.49,
    ],
    "UPI Txn Value (Bn)": [
        69.47, 1098.32, 8769.70, 21317.30, 41036.54, 68294.38, 138691.68,
    ],
}

df = pd.DataFrame(data)

# Derive UPI's percentage share of the total, once by volume and once by value.
# Each entry is (new share column, UPI column, matching total column).
share_definitions = [
    ("% of UPI Volume to Total", "UPI Txn Volume (Mn)", "Total Financial Txn Volume (Mn)"),
    ("% of UPI Value to Total", "UPI Txn Value (Bn)", "Total Financial Txn Value (Bn)"),
]
for share_column, upi_column, total_column in share_definitions:
    df[share_column] = df[upi_column].div(df[total_column]).mul(100)

In [None]:
def calculate_cagr(end_value, start_value, periods):
    """Return the compound annual growth rate as a fraction (0.10 means 10%).

    Parameters:
        end_value: value at the end of the span.
        start_value: value at the start of the span (used as the divisor).
        periods: number of compounding periods between the two values.

    Returns:
        (end_value / start_value) raised to the power 1/periods, minus 1.
    """
    growth_multiple = end_value / start_value
    per_period_multiple = growth_multiple ** (1 / periods)
    return per_period_multiple - 1

# CAGR of UPI's *percentage share* of all financial transactions, measured
# between the first and the last year in `df`. The inputs are the
# "% of UPI ... to Total" columns, so these figures describe how fast the share
# grew, not how fast the raw UPI volume/value grew.
n_periods = len(df) - 1  # year-over-year intervals between first and last row

volume_share = df["% of UPI Volume to Total"]
value_share = df["% of UPI Value to Total"]

volume_cagr = calculate_cagr(volume_share.iloc[-1], volume_share.iloc[0], n_periods)
value_cagr = calculate_cagr(value_share.iloc[-1], value_share.iloc[0], n_periods)

# Labels name the quantity that is actually computed (the share).
print(f"Compound Annual Growth Rate for UPI Share of Transaction Volume: {volume_cagr:.2%}")
print(f"Compound Annual Growth Rate for UPI Share of Transaction Value: {value_cagr:.2%}")

Compound Annual Growth Rate for UPI Transaction Volume: 160.08%
Compound Annual Growth Rate for UPI Transaction Value: 190.36%


In [None]:
# Hypothetical adoption indicators, one value per year (same length as `df`).
additional_data = {
    "Internet Penetration": [17, 18, 20, 30, 43, 46, 52],
    "Smartphone Usage": [23, 29, 35, 46, 54, 61, 66],
    # GDP is deliberately left out of the feature set:
    # "GDP": [2295, 2651, 2702, 2835, 2671, 3150, 3390],
    "Digital Literacy": [20, 28, 33, 35, 38, 41, 52],
}

# Attach each indicator to `df` as a column named after its key.
for column_name in additional_data:
    df[column_name] = additional_data[column_name]


Correlation between the variables

In [None]:
# Keep only the numeric columns; the dtype filter drops the string "Year" column.
numeric_columns = df.select_dtypes(include="number")

# Pairwise correlation between every pair of numeric columns.
correlation_matrix = numeric_columns.corr()

# Report how each variable correlates with the two raw UPI series.
upi_series = ["UPI Txn Volume (Mn)", "UPI Txn Value (Bn)"]
print(correlation_matrix.loc[:, upi_series])

                                 UPI Txn Volume (Mn)  UPI Txn Value (Bn)
Total Financial Txn Volume (Mn)             0.994738            0.995402
UPI Txn Volume (Mn)                         1.000000            0.999687
Total Financial Txn Value (Bn)              0.985943            0.985468
UPI Txn Value (Bn)                          0.999687            1.000000
% of UPI Volume to Total                    0.837099            0.845318
% of UPI Value to Total                     0.951747            0.958271
Internet Penetration                        0.887245            0.897441
Smartphone Usage                            0.870888            0.879256
Digital Literacy                            0.917663            0.920279


Data preprocessing

In [None]:
# Indicators used as regression features, in design-matrix column order.
# (GDP is excluded here, matching its commented-out entry in `additional_data`.)
feature_names = ["Internet Penetration", "Smartphone Usage", "Digital Literacy"]

# Design matrix: one row per year, one column per indicator.
X_train = np.column_stack([additional_data[name] for name in feature_names])

# Regression targets, hard-coded to two decimals: UPI's percentage share of
# total transactions by volume (y1) and by value (y2). They track the
# "% of UPI Volume to Total" / "% of UPI Value to Total" columns of `df`.
y1 = np.array([0.25, 9.28, 31.85, 47.25, 59.53, 60.39, 77.43])
y2 = np.array([0.07, 0.97, 6.41, 13.25, 24.79, 28.83, 43.08])

# Per-feature weights: each indicator's correlation with the raw UPI series,
# scaled by 10. Rows are selected by feature name so that the weights line up
# one-to-one with the columns of X_train; the previous positional `[-4:]` slice
# dated from a four-feature set and also picked up "% of UPI Value to Total".
weights1 = correlation_matrix.loc[feature_names, "UPI Txn Volume (Mn)"].values * 10
weights2 = correlation_matrix.loc[feature_names, "UPI Txn Value (Bn)"].values * 10

In [None]:
import numpy as np

def project_features(current_values, growth_rates, n_years):
    """Compound each feature forward and return the trajectory as a 2-D array.

    Parameters:
        current_values: dict mapping feature name -> starting value.
        growth_rates: dict mapping feature name -> per-step growth fraction;
            must contain every key of `current_values`.
        n_years: number of rows to produce, including the starting row.

    Returns:
        numpy array whose first row holds the starting values unchanged and
        whose every later row is the previous row multiplied by
        (1 + growth rate) feature by feature. Columns follow the key order of
        `current_values`. At least one row is always returned.
    """
    feature_names = list(current_values)
    trajectory = [[current_values[name] for name in feature_names]]

    # Row 0 is already in place, so only n_years - 1 further steps are needed.
    for _ in range(1, n_years):
        previous_row = trajectory[-1]
        trajectory.append([
            level * (1 + growth_rates[name])
            for name, level in zip(feature_names, previous_row)
        ])

    return np.array(trajectory)

# Assumed per-year growth rate for each indicator, as a fraction (0.06 = 6%).
# NOTE(review): the rates compound, so with the horizon below the projected
# "Internet Penetration" percentage passes 100 - confirm whether the inputs
# should be capped.
growth_rates = {
    "Internet Penetration": 0.087,  # From 14% in 2014 to over 52% in 2024, approximately 8.7% annual growth rate
    "Smartphone Usage": 0.06,       # Expected to increase at a CAGR of 6% from 2021 to 2026
    # "GDP": 0.079,                   # Real GDP is estimated to grow at an average of 7.9% between FY22 and FY24
    "Digital Literacy": 0.034,      # Assuming a linear growth from 15% in 2018 to 65% in 2023, approximately 3.4% annual growth rate
}

# Starting point of the projection: the most recent observed value of every
# indicator that has a growth rate.
current_values = {
    indicator: additional_data[indicator][-1]
    for indicator in growth_rates
}

# Number of projected rows; the first row repeats the starting values.
n_years = 10

X_test = project_features(current_values, growth_rates, n_years)
print(X_test)

In [None]:
# Standardise the features using statistics learned from the training rows
# only. Fitting the scaler on the training and projected rows together would
# let the extrapolated future values influence the mean/std applied to the
# training data.
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Names kept from the earlier stacked layout in case other cells still use them.
split_index = X_train.shape[0]
X_combined = np.vstack((X_train, X_test))
X_combined_normalized = np.vstack((X_train_normalized, X_test_normalized))

Prediction using Linear Regression (ordinary least squares; the correlation-based weights computed above are not passed to the models)



1. UPI share of total transaction volume (%)

In [None]:
# Least-squares fit of the volume-share target (y1) on the scaled indicators.
model1 = LinearRegression().fit(X_train_normalized, y1)

# Fitted values for the observed years, then for the projected feature rows.
# The first projected row is built from the latest observed indicator values,
# so its prediction repeats the last training prediction.
volume_fit_observed = model1.predict(X_train_normalized)
volume_fit_projected = model1.predict(X_test_normalized)

print("Training predictions yearwise from 2016-2023: ", volume_fit_observed)
print("Test predictions yearwise 2023-24 onwards: ", volume_fit_projected)

Training predictions yearwise from 2016-2023:  [-7.94919890e-03  1.52699925e+01  2.83309366e+01  4.44231223e+01
  5.32223926e+01  6.65463804e+01  7.81951249e+01]
Test predictions yearwise 2023-24 onwards:  [ 78.19512486  83.93743107  89.92106308  96.15279498 102.63918025
 109.38648544 116.40061531 123.68702848 131.25064272 139.09572856]


2. UPI share of total transaction value (%)

In [None]:
# Least-squares fit of the value-share target (y2) on the scaled indicators.
# The fit and both predictions go through `model2`, so `model1` (the
# volume-share model) stays intact; previously `model2` was created but never
# fitted and `model1` was refitted on y2 instead.
model2 = LinearRegression()
model2.fit(X_train_normalized, y2)

# The first projected row is built from the latest observed indicator values,
# so its prediction repeats the last training prediction.
print("Training predictions yearwise from 2016-2023: ", model2.predict(X_train_normalized))
print("Test predictions yearwise 2023-24 onwards: ", model2.predict(X_test_normalized))

Training predictions yearwise from 2016-2023:  [-1.50877052  2.89985628  5.90004219 12.93605775 26.53384269 28.358775
 42.28019661]
Test predictions yearwise 2023-24 onwards:  [ 42.28019661  46.88707192  51.8814905   57.29852602  63.17645819
  69.55706344  76.48593173  84.0128117   92.19198677 101.0826849 ]


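According to the model's projections, UPI's share of digital financial transactions grows as follows:

- By volume, UPI's share will cross 85% in 1 year (by the end of FY 2024-25)
- By value, UPI's share will cross 65% in 4 years (by the end of FY 2027-28)

Caveat: the later projections rise above 100%, which a percentage share cannot do, so only the near-term figures are meaningful.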


